In [3]:
import pandas as pd
import pandas_profiling

# Load the Titanic training split from the project-relative datasets folder.
train_csv_path = "datasets/titanic/train.csv"
train_df = pd.read_csv(train_csv_path)

# Build the profiling report and leave it as the final expression of the cell.
profile_report = pandas_profiling.ProfileReport(train_df)
profile_report





In [451]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def calc_nan_age(columns, miss_age=18.0, default_age=35.0):
    """Return a passenger's age, substituting a fallback when it is missing.

    Parameters
    ----------
    columns : pandas.Series or sequence
        Two values in the order (age, name), e.g. one row produced by
        ``frame[['Age', 'Name']].apply(calc_nan_age, axis=1)``.
    miss_age : float, default 18.0
        Value returned when the age is missing and the name contains 'Miss'.
    default_age : float, default 35.0
        Value returned when the age is missing and the name does not contain
        'Miss' (including the case where the name itself is not a string).

    Returns
    -------
    The original age when it is present, otherwise one of the two fallbacks.
    """
    # A row Series is labelled ('Age', 'Name'), so integer keys must go through
    # .iloc; plain `columns[0]` on such a Series depends on a deprecated
    # positional fallback. Lists/tuples/arrays keep using normal indexing.
    if isinstance(columns, pd.Series):
        age, name = columns.iloc[0], columns.iloc[1]
    else:
        age, name = columns[0], columns[1]

    if not pd.isna(age):
        return age

    # A missing name is NaN (a float); testing `'Miss' in name` on it would
    # raise TypeError, so only strings are searched.
    if isinstance(name, str) and 'Miss' in name:
        return miss_age
    return default_age
    
# Columns removed before modelling. 'Sex' comes back below as a one-hot
# column, and 'Age' is overwritten with its imputed version.
dropped_columns = ['Cabin', 'Embarked', 'Sex', 'Name', 'Ticket', 'PassengerId', 'Fare']
kept_columns = train_df.drop(columns=dropped_columns)
sex_dummies = pd.get_dummies(train_df['Sex'], drop_first=True)
# Disabled alternative: pd.get_dummies(train_df['Embarked'], drop_first=True)
df = pd.concat([kept_columns, sex_dummies], axis=1)

# Disabled alternative: df['Age'].fillna(22.0, inplace=True)
# Missing ages are filled row by row from the (Age, Name) pair instead.
age_and_name = train_df[['Age', 'Name']]
df['Age'] = age_and_name.apply(calc_nan_age, axis=1)


# Separate the label column from the predictors, then hold out 20% of the rows.
feature_frame = df.drop(columns='Survived')
survived = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, survived, test_size=0.2, random_state=42)

# Standardize with statistics taken from the training rows only; the same
# mean/std are reused for the held-out rows.
mean_train = X_train.mean(axis=0)
std_train = X_train.std(axis=0)
X_train_scaled = X_train.sub(mean_train).div(std_train)
X_test_scaled = X_test.sub(mean_train).div(std_train)

# NOTE(review): the model is fitted on the unscaled X_train even though
# standardized copies exist — confirm this is intended.
lr = LogisticRegression(C=0.1, max_iter=10000)
lr.fit(X_train, y_train)

# Mean accuracy on the rows used for fitting and on the held-out rows.
train_accuracy = lr.score(X_train, y_train)
print("Training set score : {:.3f}".format(train_accuracy))
test_accuracy = lr.score(X_test, y_test)
print("Test set score : {:.3f}".format(test_accuracy))


Training set score : 0.799
Test set score : 0.844


In [448]:
test_df = pd.read_csv("datasets/titanic/test.csv")

# Prepare the competition test rows with the same column selection used for
# the training frame: drop the unused columns and append the one-hot 'Sex'.
columns_to_drop = ['Cabin', 'Embarked', 'Sex', 'Name', 'Ticket', 'PassengerId', 'Fare']
# Disabled alternative: pd.get_dummies(test_df['Embarked'], drop_first=True)
X_predict = pd.concat(
    [
        test_df.drop(columns=columns_to_drop),
        pd.get_dummies(test_df['Sex'], drop_first=True),
    ],
    axis=1,
)

# Impute missing ages from the name, then zero-fill anything still missing.
X_predict['Age'] = test_df[['Age', 'Name']].apply(calc_nan_age, axis=1)
X_predict = X_predict.fillna(0)

# Standardized copy built from the training mean/std.
X_predict_scaled = X_predict.sub(mean_train).div(std_train)

# The prediction itself uses the unscaled frame.
predictions = lr.predict(X_predict)

In [488]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training rows only, then rebind the three
# *_scaled names to the transformed train / test / prediction sets.
scaler = MinMaxScaler().fit(X_train)

X_train_scaled, X_test_scaled, X_predict_scaled = (
    scaler.transform(split) for split in (X_train, X_test, X_predict)
)

In [374]:
import numpy as np

# Rows whose age is missing. The result is computed but is not the final
# expression of the cell.
train_df.loc[train_df['Age'].isna()]

# Median age of the passengers whose name does not contain 'Miss'.
name_has_miss = train_df['Name'].str.contains('Miss')
train_df.loc[~name_has_miss, 'Age'].median()


30.0

In [227]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree fitted on the unscaled training features.
tree = DecisionTreeClassifier(random_state=42, max_depth=7).fit(X_train, y_train)

tree_train_accuracy = tree.score(X_train, y_train)
tree_test_accuracy = tree.score(X_test, y_test)
print("Training set score : {:.3f}".format(tree_train_accuracy))
print("Test set score : {:.3f}".format(tree_test_accuracy))

Training set score : 0.874
Test set score : 0.807


In [443]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings collected in one place, then unpacked into the estimator.
forest_params = dict(n_estimators=100, max_depth=4, max_features=3, random_state=42)
forest = RandomForestClassifier(**forest_params)
forest.fit(X_train, y_train)

# Accuracy on the fitting rows and on the held-out rows (unscaled features).
forest_train_accuracy = forest.score(X_train, y_train)
print("Training set score : {:.3f}".format(forest_train_accuracy))
forest_test_accuracy = forest.score(X_test, y_test)
print("Test set score : {:.3f}".format(forest_test_accuracy))

Training set score : 0.840
Test set score : 0.825


In [489]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees fitted on the scaled training features.
gbrt = GradientBoostingClassifier(random_state=42, max_depth=3).fit(X_train_scaled, y_train)

gbrt_train_accuracy = gbrt.score(X_train_scaled, y_train)
gbrt_test_accuracy = gbrt.score(X_test_scaled, y_test)
print("Training set score : {:.3f}".format(gbrt_train_accuracy))
print("Test set score : {:.3f}".format(gbrt_test_accuracy))

# Rebinds `predictions` to this model's output for the scaled prediction set.
predictions = gbrt.predict(X_predict_scaled)

Training set score : 0.868
Test set score : 0.832


In [491]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier fitted on the scaled training features.
lsvc = LinearSVC(random_state=42, max_iter=10000).fit(X_train_scaled, y_train)

lsvc_train_accuracy = lsvc.score(X_train_scaled, y_train)
print("Training set score : {:.3f}".format(lsvc_train_accuracy))
lsvc_test_accuracy = lsvc.score(X_test_scaled, y_test)
print("Test set score : {:.3f}".format(lsvc_test_accuracy))

Training set score : 0.799
Test set score : 0.793


In [490]:
# Logistic regression refitted on the scaled features; this rebinds `lr`.
lr = LogisticRegression(C=0.1, max_iter=10000)
lr.fit(X_train_scaled, y_train)

lr_train_accuracy = lr.score(X_train_scaled, y_train)
lr_test_accuracy = lr.score(X_test_scaled, y_test)
print("Training set score : {:.3f}".format(lr_train_accuracy))
print("Test set score : {:.3f}".format(lr_test_accuracy))

Training set score : 0.792
Test set score : 0.782


In [494]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers of 10 units each, fitted on
# the scaled training features.
mlp = MLPClassifier(
    solver='lbfgs',
    activation='tanh',
    alpha=3,
    hidden_layer_sizes=[10, 10, 10],
    max_iter=1000,
    random_state=42,
)
mlp.fit(X_train_scaled, y_train)

# Scores are printed with five decimals in this cell.
mlp_train_accuracy = mlp.score(X_train_scaled, y_train)
print("Training set score : {:.5f}".format(mlp_train_accuracy))
mlp_test_accuracy = mlp.score(X_test_scaled, y_test)
print("Test set score : {:.5f}".format(mlp_test_accuracy))

# Rebinds `predictions` to this model's output for the scaled prediction set.
predictions = mlp.predict(X_predict_scaled)

Training set score : 0.82163
Test set score : 0.83240


In [449]:
# Pair each test passenger id with whatever `predictions` currently holds and
# write the two-column submission file without the index.
out = dict(PassengerId=test_df['PassengerId'], Survived=predictions)
out_df = pd.DataFrame(data=out)
out_df.to_csv('/tmp/submission.csv', index=False, encoding='utf-8')