<a href="https://colab.research.google.com/github/flp-cmd/Notebooks_ML/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data Cleaning

In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [119]:
data = pd.read_csv("train.csv")

In [120]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [121]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [122]:
data.corr()['Survived'].sort_values(ascending=False)

  data.corr()['Survived'].sort_values(ascending=False)


Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

In [123]:
from sklearn.model_selection import StratifiedShuffleSplit

In [124]:
split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
for train_index, test_index in split.split(data, data[['Survived', 'Pclass', 'Sex']]):
  strat_train_set = data.loc[train_index]
  strat_test_set = data.loc[test_index]

In [125]:
strat_train_set['Survived'].value_counts()/len(strat_train_set)

0    0.616372
1    0.383628
Name: Survived, dtype: float64

In [126]:
strat_test_set['Survived'].value_counts()/len(strat_test_set)

0    0.615672
1    0.384328
Name: Survived, dtype: float64

In [127]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [128]:
class AgeImputer(BaseEstimator, TransformerMixin):

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    imputer = SimpleImputer(strategy = 'mean')
    X['Age'] = imputer.fit_transform(X[['Age']])
    return X

In [129]:
from sklearn.preprocessing import OneHotEncoder

class CategoryEncoder(BaseEstimator, TransformerMixin):

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    X['Embarked'].fillna('S',inplace=True)
    ohe = OneHotEncoder()
    matrix_cat = ohe.fit_transform(X[['Sex','Embarked']]).toarray()
    features = pd.DataFrame(data = matrix_cat, columns=['female', 'male', 'C', 'Q', 'S'])
    X = X.reset_index().drop('index',axis=1)
    X = pd.concat([X, features],axis=1)
    return X

In [130]:
class FeatureDropper(BaseEstimator, TransformerMixin):

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    return X.drop(['Embarked','Sex','Name','Ticket','Cabin'], axis=1)

In [131]:
class FeatureScalingSplitting(BaseEstimator, TransformerMixin):

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    scaler = StandardScaler()
    X_transformed = X.drop('Survived',axis=1)
    y_transformed = X['Survived']

    X_transformed = scaler.fit_transform(X_transformed)
    y_transformed = y_transformed.to_numpy()

    return X_transformed, y_transformed

In [132]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('ageImputer', AgeImputer()),
                     ("categoryEncoder", CategoryEncoder()),
                     ("featureDropper", FeatureDropper())])

In [133]:
strat_train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
405,406,0,2,"Gale, Mr. Shadrach",male,34.0,1,0,28664,21.0,,S
415,416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S
620,621,0,3,"Yasbeck, Mr. Antoni",male,27.0,1,0,2659,14.4542,,C
332,333,0,1,"Graham, Mr. George Edward",male,38.0,0,1,PC 17582,153.4625,C91,S
752,753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33.0,0,0,345780,9.5,,S


In [134]:
from sklearn.model_selection import cross_val_score

def test_size_opt(model):
    scores_max = 0.01
    test_size_var = 0.05
    while test_size_var <= 0.6:
        split = StratifiedShuffleSplit(n_splits=1, test_size=test_size_var, random_state=42)
        for train_index, test_index in split.split(data, data[['Survived', 'Pclass', 'Sex']]):
          strat_train_set = data.loc[train_index]
          strat_test_set = data.loc[test_index]
        strat_train_set = pipeline.fit_transform(strat_train_set)
        fss = FeatureScalingSplitting()
        X_train, y_train = fss.fit_transform(strat_train_set)
        scores = cross_val_score(model, X_train, y_train, cv=10)
        test_size_var += 0.05
        if scores.mean() > scores_max:
            scores_max = scores.mean()
            best_test_size = test_size_var
    print(f"Melhor test_size: {best_test_size:.2f}\nMáxima média de score: {scores_max:.2f}")

In [135]:
strat_train_set = pipeline.fit_transform(strat_train_set)
fss = FeatureScalingSplitting()
X_train, y_train = fss.fit_transform(strat_train_set)

In [145]:
data_formatted = pipeline.fit_transform(data)
X_final, y_final = fss.fit_transform(data_formatted)

In [149]:
test_data = pd.read_csv("test.csv")
X_final_test = pipeline.fit_transform(test_data)

In [151]:
X_final_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Age          418 non-null    float64
 3   SibSp        418 non-null    int64  
 4   Parch        418 non-null    int64  
 5   Fare         417 non-null    float64
 6   female       418 non-null    float64
 7   male         418 non-null    float64
 8   C            418 non-null    float64
 9   Q            418 non-null    float64
 10  S            418 non-null    float64
dtypes: float64(7), int64(4)
memory usage: 36.0 KB


In [152]:
X_final_test = X_final_test.fillna(method='ffill')

scaler = StandardScaler()

X_final_test = scaler.fit_transform(X_final_test)

#RFC

In [136]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [156]:
rfc = RandomForestClassifier()
test_size_opt(rfc)

Melhor test_size: 0.30
Máxima média de score: 0.83


In [None]:
split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
for train_index, test_index in split.split(data, data[['Survived', 'Pclass', 'Sex']]):
  strat_train_set = data.loc[train_index]
  strat_test_set = data.loc[test_index]

In [137]:
grid_params = {"n_estimators":[300,350,400], "max_depth":[8,10,12], "min_samples_split":[10,11,12]}

grid_search_rfc = GridSearchCV(rfc, grid_params, cv=5, scoring='accuracy')
grid_search_rfc.fit(X_train, y_train)

In [138]:
best_rfc_model = grid_search_rfc.best_estimator_

In [139]:
best_rfc_model.fit(X_train, y_train)

In [140]:
best_rfc_model.score(X_train, y_train)

0.9020866773675762

In [141]:
strat_test_set = pipeline.fit_transform(strat_test_set)
X_test, y_test = fss.fit_transform(strat_test_set)

In [142]:
best_rfc_model.score(X_test, y_test)

0.8171641791044776

In [143]:
y_pred = best_rfc_model.predict(X_test)

In [144]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86       165
           1       0.79      0.71      0.75       103

    accuracy                           0.82       268
   macro avg       0.81      0.80      0.80       268
weighted avg       0.82      0.82      0.81       268



In [146]:
final_rfc = RandomForestClassifier()
grid_params = {"n_estimators":[600,630,650], "max_depth":[15,16,17]}

grid_search_rfc_final = GridSearchCV(final_rfc, grid_params, cv=5, scoring='accuracy')

grid_search_rfc_final.fit(X_final, y_final)

In [147]:
final_best_rfc = grid_search_rfc_final.best_estimator_

In [148]:
final_best_rfc.fit(X_final, y_final)

In [153]:
final_prediction = final_best_rfc.predict(X_final_test)
final_df = pd.DataFrame(test_data['PassengerId'])
final_df['Survived'] = final_prediction

In [154]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [155]:
final_df.to_csv("final_predictions2.csv", index=False)

# KNN

In [70]:
from sklearn.neighbors import KNeighborsClassifier

In [71]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, test_index in split.split(data, data[['Survived', 'Pclass', 'Sex']]):
  strat_train_set = data.loc[train_index]
  strat_test_set = data.loc[test_index]

In [72]:
knn = KNeighborsClassifier()

In [73]:
test_size_opt(knn)

Melhor test_size: 0.15
Máxima média de score: 0.81


In [77]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
for train_index, test_index in split.split(data, data[['Survived', 'Pclass', 'Sex']]):
  strat_train_set = data.loc[train_index]
  strat_test_set = data.loc[test_index]

In [78]:
strat_train_set =  pipeline.fit_transform(strat_train_set)
fss = FeatureScalingSplitting()
X_train, y_train = fss.fit_transform(strat_train_set)

In [91]:
scores_max = 0.01
for i in range(1,30):
  knn = KNeighborsClassifier(n_neighbors = i)
  scores = cross_val_score(knn, X_train, y_train, cv=10)
  if scores.mean() > scores_max:
    scores_max = scores.mean()
  print(f"Score: {scores_max:.2f} com k = {i}")

Score: 0.78 com k = 1
Score: 0.78 com k = 2
Score: 0.79 com k = 3
Score: 0.80 com k = 4
Score: 0.80 com k = 5
Score: 0.81 com k = 6
Score: 0.81 com k = 7
Score: 0.82 com k = 8
Score: 0.82 com k = 9
Score: 0.82 com k = 10
Score: 0.82 com k = 11
Score: 0.82 com k = 12
Score: 0.82 com k = 13
Score: 0.82 com k = 14
Score: 0.82 com k = 15
Score: 0.82 com k = 16
Score: 0.82 com k = 17
Score: 0.82 com k = 18
Score: 0.82 com k = 19
Score: 0.82 com k = 20
Score: 0.82 com k = 21
Score: 0.82 com k = 22
Score: 0.82 com k = 23
Score: 0.82 com k = 24
Score: 0.82 com k = 25
Score: 0.82 com k = 26
Score: 0.82 com k = 27
Score: 0.82 com k = 28
Score: 0.82 com k = 29


In [92]:
grid_params_knn = {"n_neighbors":[21,22,23,24,25], 'weights':['uniform','distance'], 'algorithm':['ball_tree','kd_tree','brute'],"p":[1,2],"n_jobs":[-1,1]}

In [172]:
grid_search_knn = GridSearchCV(knn, grid_params_knn, cv=5)

In [173]:
grid_search_knn.fit(X_train, y_train)

In [174]:
grid_search_knn.best_estimator_

In [98]:
best_knn = grid_search_knn.best_estimator_

In [99]:
best_knn.fit(X_train, y_train)

In [101]:
strat_test_set = pipeline.fit_transform(strat_test_set)
X_test, y_test = fss.fit_transform(strat_test_set)

In [102]:
best_knn.score(X_test, y_test)

0.8059701492537313

In [103]:
final_prediction_knn = best_knn.predict(X_final_test)
final_df = pd.DataFrame(test_data['PassengerId'])
final_df['Survived'] = final_prediction_knn

In [104]:
final_df.to_csv("final_predictions_knn.csv", index=False)

#SVM

In [158]:
from sklearn.svm import SVC

In [159]:
svm = SVC()

In [161]:
test_size_opt(svm)

Melhor test_size: 0.15
Máxima média de score: 0.82


In [162]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
for train_index, test_index in split.split(data, data[['Survived', 'Pclass', 'Sex']]):
  strat_train_set = data.loc[train_index]
  strat_test_set = data.loc[test_index]

In [163]:
strat_train_set =  pipeline.fit_transform(strat_train_set)
fss = FeatureScalingSplitting()
X_train, y_train = fss.fit_transform(strat_train_set)

In [164]:
svm.fit(X_train, y_train)

In [165]:
svm.score(X_test, y_test)

0.8470149253731343

In [167]:
grid_params_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto', 0.1], 'class_weight': [None, 'balanced'], 'decision_function_shape': ['ovr', 'ovo'], 'shrinking': [True, False]}

In [168]:
grid_search_svm = GridSearchCV(svm, grid_params_svm, cv=5)

In [176]:
grid_search_svm.fit(X_train, y_train)

In [178]:
best_svm = grid_search_svm.best_estimator_

In [179]:
best_svm.fit(X_train, y_train)

In [180]:
best_svm.score(X_test, y_test)

0.8470149253731343

In [181]:
final_prediction_svm = best_svm.predict(X_final_test)
final_df = pd.DataFrame(test_data['PassengerId'])
final_df['Survived'] = final_prediction_svm

In [182]:
final_df.to_csv("final_predictions_svm.csv", index=False)