In [1]:
import pandas as pd
import numpy as np

from sklearn_pandas import DataFrameMapper as DFM

from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import GridSearchCV as KCV
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC

In [2]:
#carico dati
df_train = pd.read_csv('../data/train.csv', header = 0, index_col = 'PassengerId')
df_test = pd.read_csv('../data/test.csv', header = 0, index_col = 'PassengerId')
df = pd.concat([df_train, df_test], keys=['train', 'test'], sort=False)

#nuove colonne
df['Title'] = df['Name'].apply(lambda name: name[name.index(',') + 2 : name.index('.')])
df['LastName'] = df['Name'].apply(lambda name: name[0:name.index(',')])
df['FamilySize'] = (df['SibSp'] + df['Parch'] + 1).astype(float)
df.Pclass = df.Pclass.astype(float)

#fillna:
df.Embarked.fillna(df.Embarked.mode()[0], inplace=True)
#
df.Fare = df.groupby(['Pclass', 'Embarked']).Fare.apply(lambda x : x.fillna(x.loc['train'].mean()))
#
median_age_by_title = pd.DataFrame(df.groupby('Title')['Age'].median())
median_age_by_title.rename(columns = {'Age': 'MedianAgeByTitle'}, inplace=True)
df = df.merge(median_age_by_title, left_on='Title', right_index=True)
df.Age.fillna(df.MedianAgeByTitle, inplace=True)
df.drop(columns=['MedianAgeByTitle'], inplace=True)

#elimino colonne che non uso per addestramento
df.drop(columns=['Cabin', 'Parch', 'SibSp', 'Name', 'LastName', 'Ticket'], inplace=True)

#separo train e test, dati e target
train_df, test_df = df.loc['train'], df.loc['test']
#
train_targets = train_df.pop('Survived')
train_data = train_df
#
test_data = test_df.drop(columns=['Survived'])
test_IDs = test_df.index.values

#costruisco mapper e trasformo
mapper = DFM([(['Age', 'Fare', 'FamilySize', 'Pclass'], StandardScaler()),
              *[(s, LabelBinarizer()) for s in ['Embarked', 'Sex', 'Title']]])
train_data = mapper.fit_transform(train_data)
test_data = mapper.transform(test_data)

In [3]:
#addestramento e validazione
def cv_report(cv, n_top=3):
    """
    report crossvalidation results
    """
    results = cv.cv_results_
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print('Model with rank: {0}'.format(i))
            print('Mean validation score: {0:.3f} (std: {1:.3f})'.format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print('Parameters: {0}'.format(results['params'][candidate]))
            print('')

#svm
kcv = KCV(SVC(),
          param_grid=[{'C'     : [2**i for i in range(-5,6)],
                       'gamma' : [2**i for i in range(-5,6)]}],
          cv=10).fit(train_data, train_targets)

cv_report(kcv)

Model with rank: 1
Mean validation score: 0.817 (std: 0.198)
Parameters: {'C': 4, 'gamma': 0.03125}

Model with rank: 2
Mean validation score: 0.815 (std: 0.196)
Parameters: {'C': 2, 'gamma': 0.0625}

Model with rank: 3
Mean validation score: 0.813 (std: 0.173)
Parameters: {'C': 0.25, 'gamma': 0.25}



In [4]:
#preparo sottomissione
sub = pd.DataFrame({'PassengerId': test_IDs,
                    'Survived'   : kcv.best_estimator_.predict(test_data).astype(int)})

sub.to_csv('../data/subs/ciccio.csv', index=False)