In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

%matplotlib inline

In [2]:
# Load the training and test splits from CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Non-null count per training column; counts below the row total reveal missing values.
df_train.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [4]:
# Non-null count per test column; counts below the row total reveal missing values.
df_test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

In [5]:
# Preview the first rows of the raw training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Preview the first rows of the raw test frame.
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
# Drop PassengerId and Ticket (treated as unique identifiers by the author)
# and Cabin (mostly null).
# PassengerId is deliberately kept in the test frame: the submission file
# written at the end of the notebook needs it.

df_train = df_train.drop(columns=['PassengerId', 'Ticket', 'Cabin'])
df_test = df_test.drop(columns=['Ticket', 'Cabin'])

In [8]:
# Two training passengers have no port of embarkation; fill them with the
# most frequent port (the mode), which the author identifies as Southampton.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection: the in-place form mutates an
# intermediate object, is deprecated, and silently does nothing once pandas
# Copy-on-Write is active.

df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0])

In [9]:
# Same reasoning for the single missing Fare value in the test frame: fill it
# with the most frequent fare.
# NOTE(review): the mode of a continuous variable is an unusual imputation
# choice (the median is more common); it is kept to preserve existing results.
#
# Assigned back rather than fillna(..., inplace=True) on the column selection,
# which is deprecated and a silent no-op under pandas Copy-on-Write.

df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mode()[0])

In [10]:
# Fill the missing ages with the mean age of the respective frame.
# NOTE(review): the test frame is imputed with its own mean rather than the
# training mean; kept as-is to preserve existing results.
#
# Assigned back rather than fillna(..., inplace=True) on the column selection,
# which is deprecated and a silent no-op under pandas Copy-on-Write.

df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean())

In [11]:
# Store the natural log of Fare in a new "LogFare" column to reduce the skew
# of the fare distribution.
#
# A fare of 0 yields log(0) = -inf, which is then mapped to 0. That zero-fare
# case is expected, so NumPy's divide-by-zero RuntimeWarning is silenced for
# exactly this computation; the resulting values are unchanged.
# NOTE(review): a zero fare therefore ends up with the same LogFare as a fare
# of 1 (log(1) == 0).

with np.errstate(divide='ignore'):
    for frame in (df_train, df_test):
        frame['LogFare'] = np.log(frame['Fare']).replace([np.inf, -np.inf], 0)

In [12]:
# Build a new "TreatmentForm" column holding each passenger's form of address
# (the title embedded in the name).

def split_title(row):
    """Return the title embedded in a passenger name.

    ``row`` is one name string shaped like "Surname, Title. Given names".
    The text between the first comma and the next period (or next comma,
    whichever comes first) is returned with surrounding whitespace removed,
    e.g. "Braund, Mr. Owen Harris" -> "Mr".

    Raises IndexError when the name contains no comma.
    """
    after_surname = row.split(',', 2)[1]
    title = after_surname.split('.', 1)[0]
    return title.strip()
    

# Extract the title from every name in both frames.
for frame in (df_train, df_test):
    frame['TreatmentForm'] = frame['Name'].apply(split_title)

In [13]:
# Drop Name and Fare, then one-hot encode the remaining categorical columns.
# NOTE(review): the two frames are encoded independently, so they end up with
# different TreatmentForm_* dummy columns (e.g. 'TreatmentForm_Dona' exists
# only in the test frame). Later cells select an explicit column list.

df_train = pd.get_dummies(df_train.drop(columns=['Name', 'Fare']))
df_test = pd.get_dummies(df_test.drop(columns=['Name', 'Fare']))

In [14]:
# Preview the training frame after feature engineering and one-hot encoding.
df_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,LogFare,Sex_female,Sex_male,Embarked_C,Embarked_Q,...,TreatmentForm_Master,TreatmentForm_Miss,TreatmentForm_Mlle,TreatmentForm_Mme,TreatmentForm_Mr,TreatmentForm_Mrs,TreatmentForm_Ms,TreatmentForm_Rev,TreatmentForm_Sir,TreatmentForm_the Countess
0,0,3,22.0,1,0,1.981001,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,1,38.0,1,0,4.266662,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1,3,26.0,0,0,2.070022,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1,35.0,1,0,3.972177,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0,3,35.0,0,0,2.085672,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Preview the test frame after feature engineering and one-hot encoding.
df_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,LogFare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,TreatmentForm_Col,TreatmentForm_Dona,TreatmentForm_Dr,TreatmentForm_Master,TreatmentForm_Miss,TreatmentForm_Mr,TreatmentForm_Mrs,TreatmentForm_Ms,TreatmentForm_Rev
0,892,3,34.5,0,0,2.05786,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,893,3,47.0,1,0,1.94591,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,894,2,62.0,0,0,2.270836,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,895,3,27.0,0,0,2.159003,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,896,3,22.0,1,1,2.508582,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
# Summary statistics of the encoded training frame.
df_train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,LogFare,Sex_female,Sex_male,Embarked_C,Embarked_Q,...,TreatmentForm_Master,TreatmentForm_Miss,TreatmentForm_Mlle,TreatmentForm_Mme,TreatmentForm_Mr,TreatmentForm_Mrs,TreatmentForm_Ms,TreatmentForm_Rev,TreatmentForm_Sir,TreatmentForm_the Countess
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,2.893846,0.352413,0.647587,0.188552,0.08642,...,0.044893,0.204265,0.002245,0.001122,0.580247,0.140292,0.001122,0.006734,0.001122,0.001122
std,0.486592,0.836071,13.002015,1.102743,0.806057,1.002899,0.47799,0.47799,0.391372,0.281141,...,0.207186,0.40339,0.047351,0.033501,0.493796,0.347485,0.033501,0.08183,0.033501,0.033501
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,2.068177,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,29.699118,0.0,0.0,2.670985,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,3.0,35.0,1.0,0.0,3.433987,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,3.0,80.0,8.0,6.0,6.238967,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Summary statistics of the encoded test frame.
df_test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,LogFare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,TreatmentForm_Col,TreatmentForm_Dona,TreatmentForm_Dr,TreatmentForm_Master,TreatmentForm_Miss,TreatmentForm_Mr,TreatmentForm_Mrs,TreatmentForm_Ms,TreatmentForm_Rev
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,2.944218,0.363636,0.636364,0.244019,0.110048,0.645933,0.004785,0.002392,0.002392,0.050239,0.186603,0.574163,0.172249,0.002392,0.004785
std,120.810458,0.841838,12.634534,0.89676,0.981429,1.007149,0.481622,0.481622,0.430019,0.313324,0.478803,0.069088,0.048912,0.048912,0.2187,0.390059,0.495062,0.378049,0.048912,0.069088
min,892.0,1.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,996.25,1.0,23.0,0.0,0.0,2.066331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1100.5,3.0,30.27259,0.0,0.0,2.670985,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,1204.75,3.0,35.75,1.0,0.0,3.449093,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1309.0,3.0,76.0,8.0,9.0,6.238967,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
# List the training columns, to compare with the test frame's dummy columns.
df_train.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'LogFare', 'Sex_female',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'TreatmentForm_Capt', 'TreatmentForm_Col', 'TreatmentForm_Don',
       'TreatmentForm_Dr', 'TreatmentForm_Jonkheer', 'TreatmentForm_Lady',
       'TreatmentForm_Major', 'TreatmentForm_Master', 'TreatmentForm_Miss',
       'TreatmentForm_Mlle', 'TreatmentForm_Mme', 'TreatmentForm_Mr',
       'TreatmentForm_Mrs', 'TreatmentForm_Ms', 'TreatmentForm_Rev',
       'TreatmentForm_Sir', 'TreatmentForm_the Countess'],
      dtype='object')

In [19]:
# List the test columns, to compare with the training frame's dummy columns.
df_test.columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'LogFare',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'TreatmentForm_Col', 'TreatmentForm_Dona', 'TreatmentForm_Dr',
       'TreatmentForm_Master', 'TreatmentForm_Miss', 'TreatmentForm_Mr',
       'TreatmentForm_Mrs', 'TreatmentForm_Ms', 'TreatmentForm_Rev'],
      dtype='object')

In [20]:
# Model features: only columns present in BOTH the training and test frames,
# so the same list can be used to build X_test later.
columns = ['Pclass', 'Age', 'Sex_female', 'Sex_male', 'SibSp', 'Parch', 'LogFare', 'Embarked_C',
           'Embarked_Q', 'Embarked_S', 'TreatmentForm_Master', 'TreatmentForm_Miss',
           'TreatmentForm_Mr', 'TreatmentForm_Ms', 'TreatmentForm_Rev', 'TreatmentForm_Mrs']


X_train = df_train.loc[:, columns].values
# Select the target by name. The previous positional iloc[:, 0] only worked
# because 'Survived' happened to be the first column after the earlier drops.
y_train = df_train['Survived'].values

In [21]:
# Rank the features two ways:
#   1. Recursive feature elimination around a logistic regression
#      (rank 1 is the single feature RFE retains).
#   2. Univariate ANOVA F-scores (higher means more discriminative).
# RFE, SelectKBest and f_classif are imported in the notebook's import cell.

lr = LogisticRegression()
rfe = RFE(lr, n_features_to_select=1)
rfe.fit(X_train, y_train)

# ranking_ holds integer ranks, so rounding was a no-op. Cast to built-in int
# so the printed list looks the same regardless of how NumPy formats scalars.
rfe_ranking = sorted(zip((int(rank) for rank in rfe.ranking_), columns))
print(rfe_ranking, end="\n\n")

# Only scores_ is read below; the value of k does not affect it.
selector = SelectKBest(f_classif, k=2)
selector.fit(X_train, y_train)
anova_scores = sorted(
    zip((round(float(score), 4) for score in selector.scores_), columns),
    reverse=True,
)
print(anova_scores)

[(1, 'Sex_female'), (2, 'TreatmentForm_Master'), (3, 'Pclass'), (4, 'Sex_male'), (5, 'TreatmentForm_Rev'), (6, 'Embarked_C'), (7, 'TreatmentForm_Mr'), (8, 'SibSp'), (9, 'LogFare'), (10, 'Parch'), (11, 'TreatmentForm_Mrs'), (12, 'Embarked_Q'), (13, 'TreatmentForm_Ms'), (14, 'TreatmentForm_Miss'), (15, 'Embarked_S'), (16, 'Age')]

[(383.9455, 'TreatmentForm_Mr'), (372.4057, 'Sex_male'), (372.4057, 'Sex_female'), (115.4611, 'TreatmentForm_Mrs'), (115.0313, 'Pclass'), (109.9825, 'LogFare'), (106.5091, 'TreatmentForm_Miss'), (25.896, 'Embarked_C'), (20.3745, 'Embarked_S'), (6.5036, 'TreatmentForm_Master'), (5.9635, 'Parch'), (4.3535, 'Age'), (3.7705, 'TreatmentForm_Rev'), (1.6064, 'TreatmentForm_Ms'), (1.1106, 'SibSp'), (0.0118, 'Embarked_Q')]


In [22]:
# Baseline classifiers, each behind the same front end:
# standardise -> PCA (8 components) -> classifier.
# The decision tree is seeded so repeated runs give the same score.
base_models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=0)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

pipelines = [
    ('Scaled-' + tag,
     Pipeline([('Scaler', StandardScaler()), ('PCA', PCA(n_components=8)), (tag, estimator)]))
    for tag, estimator in base_models
]

# Unshuffled KFold is already deterministic. Passing random_state without
# shuffle=True has no effect and raises ValueError in recent scikit-learn,
# so it is omitted; the folds are the same contiguous splits as before.
kfold = KFold(n_splits=10)

# 10-fold cross-validated accuracy, printed as: mean (standard deviation).
for name, model in pipelines:
    cross_val_result = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    text = "%s: %f (%f)" % (name, cross_val_result.mean(), cross_val_result.std())
    print(text)

Scaled-LR: 0.813708 (0.027949)
Scaled-LDA: 0.793483 (0.032292)
Scaled-KNN: 0.821610 (0.036334)
Scaled-CART: 0.739700 (0.038787)
Scaled-NB: 0.765493 (0.036240)
Scaled-SVM: 0.826042 (0.043568)


In [23]:
# Ensemble classifiers, evaluated both behind the standardise -> PCA(8) front
# end ("<tag>_PCA") and directly on the raw feature matrix ("<tag>").
# Every estimator is seeded so repeated runs give the same scores.
ensemble_classes = [
    ('AB', AdaBoostClassifier),
    ('GBM', GradientBoostingClassifier),
    ('RF', RandomForestClassifier),
    ('ET', ExtraTreesClassifier),
]

ensembles = [
    (tag + '_PCA',
     Pipeline([('Scaler', StandardScaler()),
               ('PCA', PCA(n_components=8)),
               (tag + '_PCA', estimator_cls(random_state=0))]))
    for tag, estimator_cls in ensemble_classes
]
ensembles += [(tag, estimator_cls(random_state=0)) for tag, estimator_cls in ensemble_classes]

# Unshuffled KFold is already deterministic. Passing random_state without
# shuffle=True has no effect and raises ValueError in recent scikit-learn,
# so it is omitted; the folds are the same contiguous splits as before.
kfold = KFold(n_splits=10)

# Evaluate each model with 10-fold cross-validated accuracy: mean (std).
for name, model in ensembles:
    cross_val_result = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    text = "%s: %f (%f)" % (name, cross_val_result.mean(), cross_val_result.std())
    print(text)

AB_PCA: 0.810400 (0.040174)
GBM_PCA: 0.811523 (0.043876)
RF_PCA: 0.779001 (0.046737)
ET_PCA: 0.770000 (0.042738)
AB: 0.815968 (0.030841)
GBM: 0.835069 (0.032628)
RF: 0.822684 (0.033928)
ET: 0.796879 (0.034099)


In [24]:
# Feature matrix for the test passengers, using the same columns as X_train.
X_test = df_test.loc[:, columns].values
# NOTE: the test frame has no 'Survived' column. Its first column is
# PassengerId, so y_test holds passenger ids, NOT labels. The name is kept
# for backward compatibility; the column is now selected by name instead of
# by position.
y_test = df_test['PassengerId'].values

In [27]:
# Standardise, project onto 8 principal components, and fit the final SVC.
#
# The raw matrices are rebuilt from the frames here so the cell is
# idempotent. Previously X_train / X_test were overwritten with their own
# transformed values, so running the cell twice re-scaled and re-projected
# already-projected data.
features_train = df_train.loc[:, columns].values
features_test = df_test.loc[:, columns].values

# Scaler and PCA are fitted on the training data only, then applied to test.
sc = StandardScaler()
pca = PCA(n_components=8)
X_train = pca.fit_transform(sc.fit_transform(features_train))
X_test = pca.transform(sc.transform(features_test))

# NOTE(review): C and gamma match the best parameters printed by the
# grid-search cell, which appears to have been executed before this one.
classifier = SVC(C=1, kernel='rbf', gamma=0.3, random_state=0)
classifier.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.3, kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
# 10-fold grid search over SVC hyper-parameters, scored by accuracy:
# a linear kernel over C, and an RBF kernel over C x gamma.
# NOTE(review): this cell's execution count (26) is lower than that of the
# preceding cell (27), so the notebook was run out of order. The search uses
# `classifier` and the already scaled and PCA-projected X_train from that cell.
linear_grid = {'C': [1, 10, 100, 1000], 'kernel': ['linear']}
rbf_grid = {
    'C': [1, 10, 100, 1000],
    'kernel': ['rbf'],
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}
parameters = [linear_grid, rbf_grid]

grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_

print(best_accuracy, best_parameters)

0.8316498316498316 {'C': 1, 'kernel': 'rbf', 'gamma': 0.3}


In [28]:
# Predict survival for the test passengers with the fitted SVC.
predict = classifier.predict(X_test)

In [29]:
# Write the submission file with exactly two columns: PassengerId, Survived.
#
# Fixes relative to the previous version:
#   * to_csv has no `delimiter` argument; the separator keyword is `sep`.
#   * `line_terminator` was renamed `lineterminator` in pandas 1.5 and removed
#     in 2.0. It is omitted so the call works on every pandas version (the
#     platform default line ending is used).
#   * index=False keeps the row index out of the file; otherwise it is
#     written as an extra unnamed first column.
#   * Column order is pinned explicitly instead of relying on dict ordering.
#   * The frame is no longer held in a variable named `csv`, which shadowed
#     the standard-library module name.
submission = pd.DataFrame(
    {'PassengerId': df_test['PassengerId'], 'Survived': predict},
    columns=['PassengerId', 'Survived'],
)
submission.to_csv('Titanic.csv', sep=',', index=False)