In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Read the training data and the data to predict on from the working directory.
train_path, test_path = "titanic.csv", "test.csv"
df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [3]:
# Keep an untouched copy of the test frame so PassengerId is still available
# after df_test itself is modified further down.
dftest2 = df_test.copy()  # copy() is deep by default
p_ids = dftest2.loc[:, ['PassengerId']]  # one-column DataFrame, not a Series

In [28]:
# Show (rows, columns) for the training frame, then the test frame.
for frame in (df, df_test):
    print(frame.shape)

(8693, 14)
(4277, 13)


In [29]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Remove PassengerId, Name and Cabin from both frames; they are not used as model features.
unused_cols = ['PassengerId', 'Name', 'Cabin']
df = df.drop(columns=unused_cols)
df_test = df_test.drop(columns=unused_cols)

In [5]:
# One-hot encode the two categorical columns, using the same column list for both frames.
categorical_cols = ['HomePlanet', 'Destination']
df = pd.get_dummies(df, columns=categorical_cols)
df_test = pd.get_dummies(df_test, columns=categorical_cols)

In [6]:
# --- Training-frame preprocessing -------------------------------------------
# Encode the True/False flags (and the label) as numbers with an explicit cast.
# Casting keeps missing entries as NaN and avoids writing 1/0 into bool/object
# columns through .loc, which only works via implicit dtype upcasting.
for flag in ('CryoSleep', 'VIP', 'Transported'):
    df[flag] = df[flag].astype(float)

# Total spend across the five amenity columns. Plain column addition is the
# vectorized equivalent of a row-wise lambda: a NaN in any amenity column makes
# the total NaN, and that NaN is imputed with the median below.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['expense'] = (
    df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
)
df = df.drop(columns=spend_cols)

# Fill every remaining NaN with its column median.
df = df.fillna(df.median(numeric_only=True))

# The classifiers need a discrete integer label, so make that dtype explicit.
df['Transported'] = df['Transported'].astype(int)

In [7]:
# --- Test-frame preprocessing -----------------------------------------------
# Encode the True/False flags as numbers with an explicit cast. Casting keeps
# missing entries as NaN and avoids writing 1/0 into bool/object columns through
# .loc, which only works via implicit dtype upcasting.
for flag in ('CryoSleep', 'VIP'):
    df_test[flag] = df_test[flag].astype(float)

# Total spend across the five amenity columns. Plain column addition is the
# vectorized equivalent of a row-wise lambda: a NaN in any amenity column makes
# the total NaN, and that NaN is imputed with the median below.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df_test['expense'] = (
    df_test['RoomService'] + df_test['FoodCourt'] + df_test['ShoppingMall']
    + df_test['Spa'] + df_test['VRDeck']
)
df_test = df_test.drop(columns=spend_cols)

# Fill every remaining NaN with its column median.
# NOTE(review): the medians come from the test frame itself, not from the
# training frame -- confirm this is intended.
df_test = df_test.fillna(df_test.median(numeric_only=True))

In [8]:
# Store the two flag columns as integers in both frames.
flag_cols = ['CryoSleep', 'VIP']
for frame in (df, df_test):
    frame[flag_cols] = frame[flag_cols].astype(int)

In [35]:
# Preview the first five rows of the preprocessed test frame.
df_test.head(n=5)

Unnamed: 0,CryoSleep,Age,VIP,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,expense
0,1,27.0,0,1,0,0,0,0,1,0.0
1,0,19.0,0,1,0,0,0,0,1,2832.0
2,1,31.0,0,0,1,0,1,0,0,0.0
3,0,38.0,0,0,1,0,0,0,1,7418.0
4,0,20.0,0,1,0,0,0,0,1,645.0


In [9]:
# Separate the label from the features; the test frame has no label column to remove.
target_col = 'Transported'
y_train = df[target_col]
x_train = df.drop(columns=[target_col])
x_test = df_test

In [45]:
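# Hold-out split left disabled: x_train / x_test are taken directly from df and df_test instead.
# NOTE: re-enabling this needs `x` and `y`, which are not defined in the cells shown here.
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)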

In [47]:
# L2-regularised logistic regression: fit on the training features, then predict the test frame.
# fit() returns the estimator itself, so the chained call leaves `model` fitted.
model = LogisticRegression(C=0.01, penalty='l2', solver='lbfgs').fit(x_train, y_train)
y_pred = model.predict(x_test)

In [46]:
# Exhaustive search over solver / penalty / C for the logistic-regression model,
# scored by accuracy with repeated stratified 10-fold cross-validation.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}

cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,  # a failing fit scores 0 instead of raising
)
grid_result = grid_search.fit(x_train, y_train)
print(grid_result.best_params_)

{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}


In [15]:
# Decision-tree model with default hyperparameters.
# random_state is fixed so the fitted tree and its predictions are the same on every run.
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(x_train, y_train)
y_pred2 = model2.predict(x_test)

In [11]:
# Candidate hyperparameter values for the random-forest randomized search.
n_estimators = np.linspace(200, 2000, 10).astype(int).tolist()      # 10 evenly spaced values
max_depth = [*np.linspace(10, 110, 11).astype(int).tolist(), None]  # None is added as an extra option
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [12]:
# Base estimator for the randomized hyperparameter search.
# Seeded so that each candidate's cross-validation score is repeatable across runs.
model3 = RandomForestClassifier(random_state=42)

In [14]:
# Randomized search: 100 sampled parameter combinations, each scored with 3-fold CV.
rf_random = RandomizedSearchCV(
    estimator=model3,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
rf_random.fit(x_train, y_train)
print(rf_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 10, 'bootstrap': True}


In [20]:
# Final random forest, using the parameters printed by the randomized search.
# random_state is fixed so that re-running the notebook yields the same predictions.
model3 = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=4,
    max_depth=10,
    bootstrap=True,
    random_state=42,
)
model3.fit(x_train, y_train)
y_pred3 = model3.predict(x_test)

In [21]:
# Wrap the random-forest predictions in a one-column frame named 'Transported'.
y_pred_df = pd.DataFrame({'Transported': y_pred3})

In [22]:
# Turn the 0/1 class labels into False/True.
# A direct cast replaces the two masked .loc assignments, which wrote bools into
# an integer column and so depended on implicit dtype upcasting.
y_pred_df['Transported'] = y_pred_df['Transported'].astype(bool)

In [23]:
# Put the passenger ids and the predicted labels side by side (aligned on the row index).
results = pd.concat((p_ids, y_pred_df), axis='columns')

In [24]:
# Write the result file relative to the working directory, the same way the input
# CSVs are read, instead of to a user-specific absolute path that only exists on one machine.
results.to_csv('result_titanic.csv', index=False)