In [79]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def clean_data(data, features): #function to clean data
    sex = pd.get_dummies(data['Sex'], drop_first=True)
    data = pd.concat([data, sex], axis=1)

    data['Age'].fillna(data['Age'].mean(), inplace=True)
    data['Fare'].fillna(data['Fare'].mean(), inplace=True)
    embarked = pd.get_dummies(data['Embarked'], drop_first=True)
    data = pd.concat([data, embarked], axis=1)

    data = data[features]
    return data

# Getting the most important features to include in models
train = pd.read_csv("./train.csv")
y = train['Survived']
features = ['Age', 'Fare', "male", "Pclass", "SibSp", "Parch", "Q", "S"]
cleaned_train = clean_data(train, features)

X_train, X_test, y_train, y_test = train_test_split(cleaned_train, y, random_state=42)

model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)

feature_importances_df = pd.DataFrame({"feature": features, "importance": model_rf.feature_importances_})
feature_importances_df.sort_values(by="importance", ascending=False, inplace=True, ignore_index=True)
print(feature_importances_df)

  feature  importance
0    Fare    0.275884
1    male    0.262903
2     Age    0.260550
3  Pclass    0.076925
4   SibSp    0.052968
5   Parch    0.037321
6       S    0.022532
7       Q    0.010917


In [86]:
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Trying a bunch of different models to see which one is best
best_features = ['Age', 'Fare', "male"]
cleaned_train = clean_data(train, best_features)
X_train, X_test, y_train, y_test = train_test_split(cleaned_train, y, random_state=42)

models = [LogisticRegression, SVC, MultinomialNB, GaussianNB, SGDClassifier, KNeighborsClassifier, DecisionTreeClassifier, RandomForestClassifier, GradientBoostingClassifier]
model_names = ["LogisticRegression", "SVC", "MultinomialNB", "GaussianNB","SGDClassifier", "KNeighborsClassifier", "DecisionTreeClassifier", "RandomForestClassifier", "GradientBoostingClassifier"]
fitted_models = []
model_scores = {}

for model in models:
    model = model()
    model.fit(X_train, y_train)
    fitted_models.append(model)
    model_scores[model] = model.score(X_test, y_test)

model_score_df = pd.DataFrame({"model": model_names, "score": model_scores.values()})
model_score_df.sort_values(by="score", ascending=False, inplace=True, ignore_index=True)
print(model_score_df)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     891 non-null    float64
 1   Fare    891 non-null    float64
 2   male    891 non-null    bool   
dtypes: bool(1), float64(2)
memory usage: 14.9 KB
                        model     score
0  GradientBoostingClassifier  0.811659
1                  GaussianNB  0.775785
2          LogisticRegression  0.771300
3      RandomForestClassifier  0.762332
4      DecisionTreeClassifier  0.726457
5               MultinomialNB  0.713004
6        KNeighborsClassifier  0.699552
7                         SVC  0.659193
8               SGDClassifier  0.645740


In [99]:
X_train, X_test, y_train, y_test = train_test_split(cleaned_train, y, random_state=42)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

#Obtaining best params for each model (except logistic regression)
param_grid = {'n_estimators': sp_randint(100, 1000), 'criterion': ['gini', 'entropy', 'log_loss']}
random_search_rfc = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy', return_train_score=True, verbose=True)
random_search_rfc.fit(X_train, y_train)
print(random_search_rfc.best_estimator_)
## Best params: RandomForestClassifier(n_estimators=444)

param_grid = {'n_estimators': sp_randint(100, 1000), 'max_depth': sp_randint(1, 10), 'learning_rate': sp_uniform(), 'criterion': ['friedman_mse', 'squared_error']}
random_search_gbc = RandomizedSearchCV(GradientBoostingClassifier(), param_grid, cv=5, scoring='accuracy', return_train_score=True, verbose=True)
random_search_gbc.fit(X_train, y_train)
print(random_search_gbc.best_estimator_)
## Best params: GradientBoostingClassifier(learning_rate=0.3628220866251246, max_depth=6, n_estimators=264)

#Using voting classifier to combine the best models
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[("gbc", random_search_gbc.best_estimator_), ("rfc", random_search_rfc.best_estimator_), ("lg", LogisticRegression())], voting='hard')
voting_clf.fit(X_train, y_train)
print(voting_clf.score(X_test, y_test))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
RandomForestClassifier(n_estimators=444)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
GradientBoostingClassifier(learning_rate=0.3628220866251246, max_depth=6,
                           n_estimators=264)
0.7623318385650224


In [100]:
#Using the voting classifier to predict on test data
test = pd.read_csv("./test.csv")
passenger_ids = test['PassengerId']
test = clean_data(test, best_features)

test.info()

voting_clf = VotingClassifier(estimators=[("gbc", random_search_gbc.best_estimator_), ("rfc", random_search_rfc.best_estimator_), ("lg", LogisticRegression())], voting='hard')
voting_clf.fit(X_train, y_train)
y_test = voting_clf.predict(test[best_features])


submission = pd.DataFrame({'PassengerId':passenger_ids,'Survived':y_test})
submission.to_csv('submission.csv',index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     418 non-null    float64
 1   Fare    418 non-null    float64
 2   male    418 non-null    bool   
dtypes: bool(1), float64(2)
memory usage: 7.1 KB
