In [61]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [62]:
# Keep the raw CSV contents untouched in `train_data_orig`; `train_data` is an
# independent copy so that later in-place edits (e.g. encoding `Sex`) do not
# silently modify the original frame as well.
train_data_orig = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data = train_data_orig.copy()
train_data.head()

In [63]:
# Same pattern as for the training data: `test_data_orig` stays pristine and
# `test_data` is the working copy that later cells are free to modify.
test_data_orig = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data = test_data_orig.copy()
test_data.head()

In [5]:
train_data.shape, test_data.shape

In [6]:
train_data.dtypes

In [64]:
# import library
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn


from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.model_selection import learning_curve, GridSearchCV



In [65]:
def evaluation(model, X_train, y_train, X_test, y_test, scoring, plot):
    """Fit `model`, print classification diagnostics and optionally plot learning curves.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`; it is fitted in place.
    X_train, y_train : training features and labels.
    X_test : features to predict after fitting.
    y_test : labels for `X_test`, or None when they are unknown. When given,
        the confusion matrix and classification report of the test
        predictions are printed as well.
    scoring : scoring name handed to `learning_curve` for the left-hand plot.
    plot : truthy to draw the learning curves (`scoring` on the left,
        accuracy on the right); otherwise a short message is printed.

    Returns
    -------
    The predictions of the fitted model on `X_test`.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if y_test is not None:
        print("test set : ")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))

    # Predict on the training set once and reuse the result for both reports.
    y_train_pred = model.predict(X_train)
    print("train set : ")
    print(confusion_matrix(y_train, y_train_pred))
    print(classification_report(y_train, y_train_pred))

    if plot:
        train_sizes = np.linspace(0.1, 1, 6)
        plt.figure(figsize = (12,6))
        for position, metric in ((121, scoring), (122, 'accuracy')):
            # random_state pins the shuffling so the curves are reproducible.
            sizes, train_scores, val_scores = learning_curve(
                model, X_train, y_train,
                shuffle=True, random_state=0, cv=4,
                scoring=metric, train_sizes=train_sizes,
            )
            plt.subplot(position)
            plt.title(metric)
            # Average the scores over the cross-validation folds.
            plt.plot(sizes, train_scores.mean(axis=1), label ="train_score")
            plt.plot(sizes, val_scores.mean(axis=1), label ="cross_validation_score")
            plt.legend()
    else:
        print("GRAPHIQUE NON DEMANDE")
    return y_pred

In [66]:
############################################################ EXPLORATORY DATA ANALYSIS ###################################################
##########################################################################################################################################

# One bar chart per column showing the share of each value (missing values included).
for column in ['Survived', 'Sex', 'Pclass', 'Age']:
    # Age gets a wider canvas than the other columns.
    width = 20 if column == 'Age' else 8
    plt.figure(figsize = (width, 5))
    plt.title(column)
    frequencies = train_data[column].value_counts(dropna = False, normalize=True)
    frequencies.plot.bar()
    



In [67]:
######################################################## pre-processing ############################################################

# Encode the `Sex` column of both frames as integers, in place.
# NOTE(review): LabelEncoder is expected to order its classes alphabetically,
# which would give female -> 0 and male -> 1; later cells rely on `Sex == 0`
# and `Sex == 1`, so confirm with `label_encoder.classes_`.
# NOTE(review): this cell overwrites the string column, so running it a second
# time on the already-encoded frames will presumably fail on unseen labels.
label_encoder = LabelEncoder()
label_encoder.fit(['male', 'female'])
train_data['Sex'] = label_encoder.transform(train_data['Sex'])
test_data['Sex'] = label_encoder.transform(test_data['Sex'])


In [68]:
train_data['Sex'].value_counts()


In [69]:
 
######################################################## FIRST MODEL ################################################################

# First model: Pclass and Sex only. PassengerId is carried along as the first
# column and the label (Survived) as the last column of the training selection.
train_set = train_data[['PassengerId', 'Pclass', 'Sex', 'Survived']]
test_set = test_data[['PassengerId', 'Pclass', 'Sex']]

# Position of the label column, i.e. the index of the last training column.
nb_var = len(train_set.columns) - 1

# Features are the columns strictly between PassengerId and the label.
train_features = train_set.iloc[:, 1:nb_var]
train_X_s = train_features.to_numpy()
train_Y_s = np.asanyarray(train_set.iloc[:, nb_var])

# No label column is selected for the test set, so only features are built.
test_features = test_set.iloc[:, 1:nb_var]
test_X_s = test_features.to_numpy()


In [70]:
# Grid search (4-fold CV, recall) over the split criterion and the minimum
# number of samples required to split a node.
model_DecisionTreeClassifier = DecisionTreeClassifier(class_weight="balanced", random_state=0)
param = dict(
    criterion=["gini", "entropy"],
    min_samples_split=[5, 10, 20, 40, 50, 80, 100],
)

grid_DecisionTreeClassifier = GridSearchCV(
    model_DecisionTreeClassifier,
    param_grid=param,
    scoring="recall",
    cv=4,
    n_jobs=-1,
)
resultat_grid_DecisionTreeClassifier = grid_DecisionTreeClassifier.fit(train_X_s, train_Y_s)
resultat_grid_DecisionTreeClassifier.best_params_

In [71]:
# Decision tree refitted with fixed hyper-parameters.
# NOTE(review): the values are hard-coded rather than read from the grid
# search's `best_params_` - confirm they still match its output.
final_model_DecisionTreeClassifier = DecisionTreeClassifier(
    criterion='gini',
    min_samples_split=5,
    class_weight="balanced",
    random_state=0,
)
evaluation(
    final_model_DecisionTreeClassifier,
    train_X_s, train_Y_s,
    test_X_s, None,
    scoring="recall",
    plot=True,
)


In [None]:
# Grid search (4-fold CV, recall) over criterion, forest size and the minimum
# number of samples required to split a node.
model_RandomForestClassifier = RandomForestClassifier(random_state=0, class_weight="balanced")
param = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[3, 5, 10, 20, 50, 100, 200],
    min_samples_split=[5, 10, 20, 40, 50, 80, 100],
)

grid_RandomForestClassifier = GridSearchCV(
    model_RandomForestClassifier,
    param,
    scoring="recall",
    cv=4,
    n_jobs=-1,
)
resultat_grid_RandomForestClassifier = grid_RandomForestClassifier.fit(train_X_s, train_Y_s)
resultat_grid_RandomForestClassifier.best_params_

In [None]:
# Random forest refitted with fixed hyper-parameters.
# NOTE(review): hard-coded values - confirm they match the grid search output.
final_model_RandomForestClassifier = RandomForestClassifier(
    criterion="gini",
    n_estimators=20,
    min_samples_split=100,
    class_weight="balanced",
    random_state=0,
)
evaluation(
    final_model_RandomForestClassifier,
    train_X_s, train_Y_s,
    test_X_s, None,
    scoring="recall",
    plot=True,
)

In [None]:
####################################################
####################################################

In [None]:
######################################################## ADD A COLUMN IN DATASET ################################################################
train_set = train_data[['PassengerId','Pclass','Sex', 'Age','Survived']]
test_set = test_data[['PassengerId','Pclass','Sex', 'Age']]


In [None]:
train_set[train_set['Age'].isna()]

In [None]:
for i in ['Pclass', 'Sex', 'Survived']: 
    print(train_set[train_set['Age'].isna()][i].value_counts()); print('\n')

In [None]:
##############################    ANALYSIS NAN    #############################

plt.figure(figsize=(10,5))
sns.countplot( x="Sex" , hue="Survived" , data=train_set[train_set['Age'].isna()])

In [None]:
# Passengers whose age is missing: survival counts per class, then split by sex.
missing_age = train_set[train_set['Age'].isna()]

plt.figure(figsize=(10,5))
sns.countplot( x="Pclass" , hue="Survived" , data=missing_age)

# catplot is figure-level: it creates its own figure, so opening one with
# plt.figure() beforehand would only produce an empty extra figure.
sns.catplot( x="Pclass" , hue="Survived", col='Sex' , data=missing_age, kind="count")

In [None]:
plt.plot(train_set['Age'].value_counts(dropna=True), '.')

In [None]:
# Relative frequency of each distinct age (missing values excluded), ordered by age.
plt.figure(figsize=(20,5))
plt.title('AGE')
train_data['Age'].value_counts(dropna = True, normalize=True).sort_index().plot.bar()

# Drop missing ages once and share the same 50 bin edges between the overlaid
# histograms; otherwise each subset is binned over its own range and the bars
# of the two histograms are not comparable.
known_age = train_data.dropna(subset=['Age'])
age_bins = np.histogram_bin_edges(known_age['Age'], bins=50)

# All passengers vs. passengers with Sex == 1.
plt.figure(figsize=(20,5))
plt.title('Sex')
plt.hist(known_age['Age'], bins=age_bins)
plt.hist(known_age.loc[known_age['Sex']==1, 'Age'], bins=age_bins)

# All passengers vs. survivors.
plt.figure(figsize=(20,5))
plt.title('Survived')
plt.hist(known_age['Age'], bins=age_bins)
plt.hist(known_age.loc[known_age['Survived']==1, 'Age'], bins=age_bins)

In [None]:
train_set['Age']

In [None]:

    ######################################################## SECOND MODEL ################################################################


In [None]:
def change_age(x):
    """Recode the `Age` column of `x` in place as a binary flag and return `x`.

    Missing ages are treated as 25. Ages below 19 become 0.0, ages of 19 and
    above become 1.0. Only the `Age` column is touched: missing values in any
    other column are left as they are.
    """
    age = x['Age'].fillna(25).astype(float)
    x['Age'] = (age >= 19).astype(float)
    return x

In [None]:
# Work on explicit copies: the recoding below writes into these frames, and
# doing that on a selection of train_data/test_data raises
# SettingWithCopyWarning and leaves it unclear which object is modified.
train_set = train_data[['PassengerId','Pclass','Sex', 'Age','Survived']].copy()
test_set = test_data[['PassengerId','Pclass','Sex', 'Age']].copy()

# Binary age flag (see change_age).
change_age(train_set)
change_age(test_set)

# Position of the label column (last column of train_set).
nb_var = train_set.shape[1]-1

# Features are the columns between PassengerId (excluded) and the label.
train_X_s = (train_set.iloc[:,1:nb_var]).to_numpy()
train_Y_s = np.asanyarray(train_set.iloc[:,nb_var])

test_X_s = (test_set.iloc[:,1:nb_var]).to_numpy()


In [None]:
# Decision-tree grid search (4-fold CV, recall), now also tuning max_features.
model_DecisionTreeClassifier = DecisionTreeClassifier(class_weight="balanced", random_state=0)
param = dict(
    criterion=["gini", "entropy"],
    max_features=[2, 3],
    min_samples_split=[5, 10, 20, 40, 50, 80, 100],
)

grid_DecisionTreeClassifier = GridSearchCV(
    model_DecisionTreeClassifier,
    param_grid=param,
    scoring="recall",
    cv=4,
    n_jobs=-1,
)
resultat_grid_DecisionTreeClassifier = grid_DecisionTreeClassifier.fit(train_X_s, train_Y_s)
resultat_grid_DecisionTreeClassifier.best_params_

In [None]:
# Decision tree refitted with fixed hyper-parameters.
# NOTE(review): hard-coded values - confirm they match the grid search output.
final_model_DecisionTreeClassifier = DecisionTreeClassifier(
    criterion='gini',
    max_features=2,
    min_samples_split=5,
    class_weight="balanced",
    random_state=0,
)

evaluation(
    final_model_DecisionTreeClassifier,
    train_X_s, train_Y_s,
    test_X_s, None,
    scoring="recall",
    plot=True,
)


In [None]:
# Random-forest grid search (4-fold CV, recall), now also tuning max_features.
model_RandomForestClassifier = RandomForestClassifier(random_state=0, class_weight="balanced")
param = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[3, 5, 10, 20, 50, 100, 200],
    max_features=[2, 3],
    min_samples_split=[5, 10, 20, 40, 50, 80, 100],
)

grid_RandomForestClassifier = GridSearchCV(
    model_RandomForestClassifier,
    param,
    scoring="recall",
    cv=4,
    n_jobs=-1,
)
resultat_grid_RandomForestClassifier = grid_RandomForestClassifier.fit(train_X_s, train_Y_s)
resultat_grid_RandomForestClassifier.best_params_

In [None]:
# Random forest refitted with fixed hyper-parameters.
# NOTE(review): hard-coded values - confirm they match the grid search output.
final_model_RandomForestClassifier = RandomForestClassifier(
    criterion="gini",
    n_estimators=10,
    max_features=2,
    min_samples_split=5,
    class_weight="balanced",
    random_state=0,
)
evaluation(
    final_model_RandomForestClassifier,
    train_X_s, train_Y_s,
    test_X_s, None,
    scoring="recall",
    plot=True,
)

In [None]:
pd.set_option("display.max_rows", 50)
train_data[train_data['Age']<10]

In [None]:
# Author's observation: most of the men who embarked at S died.
plt.figure(figsize=(10,5))
sns.countplot( x="Embarked" , hue="Survived" , data=train_data)

# catplot is figure-level: it creates its own figure, so opening one with
# plt.figure() beforehand would only produce an empty extra figure.
sns.catplot( x="Embarked" , hue="Survived", col='Sex' , data=train_data, kind="count")

In [None]:
plt.figure(figsize=(25,5))
sns.countplot( x="Cabin" , hue="Survived" , data=train_data)

In [None]:
plt.figure(figsize=(10,5))
sns.countplot( x="Sex" , hue="Survived" , data=train_set)

In [None]:
# Survival counts per passenger class, then the same split by sex.
plt.figure(figsize=(10,5))
sns.countplot( x="Pclass" , hue="Survived" , data=train_set)

# catplot is figure-level: it creates its own figure, so opening one with
# plt.figure() beforehand would only produce an empty extra figure.
sns.catplot( x="Pclass" , hue="Survived", col='Sex' , data=train_set, kind='count')

In [None]:
def change_age_v2(x):
    """Recode the `Age` column of `x` in place into six age bands and return `x`.

    Missing ages are first replaced by the sentinel 200; rows with `Sex == 0`
    that carry the sentinel are then given the age 18. The resulting ages are
    mapped to bands (stored as floats):

        age < 8        -> 0
        8  <= age < 19 -> 1
        19 <= age < 33 -> 2
        33 <= age < 48 -> 3
        48 <= age < 65 -> 4
        age >= 65      -> 5   (this includes the remaining sentinel rows)

    The column is reassigned as a whole instead of calling `fillna(inplace=True)`
    on `x.loc[:, 'Age']`, which is not guaranteed to write back into `x`.
    """
    age = x['Age'].fillna(200.0).astype(float)
    age = age.mask((x['Sex'] == 0) & (age == 200), 18.0)

    # np.select keeps the first matching condition, so the upper bounds are
    # listed in increasing order; anything not below 65 falls into band 5.
    upper_bounds = [age < 8, age < 19, age < 33, age < 48, age < 65]
    x['Age'] = np.select(upper_bounds, [0.0, 1.0, 2.0, 3.0, 4.0], default=5.0)

    return x

In [None]:
# Work on explicit copies: the recoding below writes into these frames, and
# doing that on a selection of train_data/test_data raises
# SettingWithCopyWarning and leaves it unclear which object is modified.
train_set = train_data[['PassengerId','Pclass','Sex', 'Age','Survived']].copy()
test_set = test_data[['PassengerId','Pclass','Sex', 'Age']].copy()

# Replace raw ages by age bands (see change_age_v2).
change_age_v2(train_set)
change_age_v2(test_set)

# Position of the label column (last column of train_set).
nb_var = train_set.shape[1]-1

# Features are the columns between PassengerId (excluded) and the label.
train_X_s = (train_set.iloc[:,1:nb_var]).to_numpy()
train_Y_s = np.asanyarray(train_set.iloc[:,nb_var])

test_X_s = (test_set.iloc[:,1:nb_var]).to_numpy()


In [None]:
train_set['Age'].value_counts()

In [None]:
# Decision-tree grid search (4-fold CV, recall) on the age-band features.
model_DecisionTreeClassifier = DecisionTreeClassifier(class_weight="balanced", random_state=0)
param = dict(
    criterion=["gini", "entropy"],
    max_features=[2, 3],
    min_samples_split=[5, 10, 20, 40, 50, 100],
)

grid_DecisionTreeClassifier = GridSearchCV(
    model_DecisionTreeClassifier,
    param_grid=param,
    scoring="recall",
    cv=4,
    n_jobs=-1,
)
resultat_grid_DecisionTreeClassifier = grid_DecisionTreeClassifier.fit(train_X_s, train_Y_s)
resultat_grid_DecisionTreeClassifier.best_params_

In [None]:
# Decision tree refitted with fixed hyper-parameters.
# NOTE(review): hard-coded values - confirm they match the grid search output.
final_model_DecisionTreeClassifier = DecisionTreeClassifier(
    criterion='gini',
    max_features=3,
    min_samples_split=100,
    class_weight="balanced",
    random_state=0,
)

evaluation(
    final_model_DecisionTreeClassifier,
    train_X_s, train_Y_s,
    test_X_s, None,
    scoring="recall",
    plot=True,
)


In [None]:
# Random-forest grid search (4-fold CV, recall) on the age-band features.
model_RandomForestClassifier = RandomForestClassifier(random_state=0, class_weight="balanced")
param = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[3, 5, 10, 20, 50, 100, 200],
    max_features=[2, 3],
    min_samples_split=[5, 10, 20, 40, 50, 80, 100],
)

grid_RandomForestClassifier = GridSearchCV(
    model_RandomForestClassifier,
    param,
    scoring="recall",
    cv=4,
    n_jobs=-1,
)
resultat_grid_RandomForestClassifier = grid_RandomForestClassifier.fit(train_X_s, train_Y_s)
resultat_grid_RandomForestClassifier.best_params_

In [None]:
# Random forest refitted with fixed hyper-parameters.
# NOTE(review): hard-coded values - confirm they match the grid search output.
final_model_RandomForestClassifier = RandomForestClassifier(
    criterion="gini",
    n_estimators=10,
    max_features=2,
    min_samples_split=80,
    class_weight="balanced",
    random_state=0,
)
evaluation(
    final_model_RandomForestClassifier,
    train_X_s, train_Y_s,
    test_X_s, None,
    scoring="recall",
    plot=True,
)

In [None]:
# Survival counts per recoded age value, then the same split by sex.
plt.figure(figsize=(10,5))
sns.countplot( x="Age" , hue="Survived" , data=train_set)

# catplot is figure-level: it creates its own figure, so opening one with
# plt.figure() beforehand would only produce an empty extra figure.
sns.catplot( x="Age" , hue="Survived", col='Sex' , data=train_set, kind='count')

In [None]:

    ######################################################## THIRD MODEL ################################################################


In [72]:
def change_age_v2(x):
    """Recode the `Age` column of `x` in place into coarse age groups.

    This redefinition replaces the earlier `change_age_v2` of the notebook and
    uses different groups. Missing ages are first replaced by the sentinel 200;
    rows with `Sex == 0` that carry the sentinel are then given the age 25.
    The resulting ages are mapped to (stored as floats):

        age < 8         -> 0
        8  <= age < 40  -> 1
        40 <= age < 65  -> 2
        65 <= age < 77  -> 3
        age >= 77       -> 0   (this includes the remaining sentinel rows)

    Nothing is returned; `x` is modified in place. The column is reassigned as
    a whole instead of calling `fillna(inplace=True)` on `x.loc[:, 'Age']`,
    which is not guaranteed to write back into `x`.
    """
    age = x['Age'].fillna(200.0).astype(float)
    age = age.mask((x['Sex'] == 0) & (age == 200), 25.0)

    # np.select keeps the first matching condition, so the upper bounds are
    # listed in increasing order; ages of 77 and above fall back to group 0.
    upper_bounds = [age < 8, age < 40, age < 65, age < 77]
    x['Age'] = np.select(upper_bounds, [0.0, 1.0, 2.0, 3.0], default=0.0)
    

def change_Pclass(x):
    """Collapse `Pclass` in place: classes 1 and 2 become 0, class 3 becomes 1."""
    first_or_second = x['Pclass'].isin([1, 2])
    x.loc[first_or_second, 'Pclass'] = 0
    third = x['Pclass'].eq(3)
    x.loc[third, 'Pclass'] = 1

def change_SibSp(x):
    """Binarise `SibSp` in place: every value different from 0 becomes 1."""
    has_sibling_or_spouse = x['SibSp'].ne(0)
    x.loc[has_sibling_or_spouse, 'SibSp'] = 1
    
def change_Parch(x):
    """Binarise `Parch` in place: every value different from 0 becomes 1."""
    has_parent_or_child = x['Parch'].ne(0)
    x.loc[has_parent_or_child, 'Parch'] = 1

def change_Embarked(x):
    """Recode `Embarked` in place: 'S' becomes '0', 'C' and 'Q' become '1'.

    Missing values are treated as 'C' (and therefore end up as '1'). The codes
    are kept as strings, as before. Any other value is left unchanged.

    The column is reassigned as a whole instead of calling
    `fillna(inplace=True)` on `x.loc[:, 'Embarked']`, which is not guaranteed
    to write back into `x`.
    """
    embarked = x['Embarked'].fillna('C')
    x['Embarked'] = embarked.replace({'S': '0', 'C': '1', 'Q': '1'})
    


In [73]:
# Variant that also includes SibSp (currently disabled):
#train_set = train_data[['PassengerId','Pclass','Sex', 'Age','SibSp','Parch','Embarked','Survived']]
#test_set = test_data[['PassengerId','Pclass','Sex', 'Age','SibSp','Parch','Embarked']]

# Work on explicit copies: the recoding helpers below write into these frames,
# and doing that on a selection of train_data/test_data raises
# SettingWithCopyWarning and leaves it unclear which object is modified.
train_set = train_data[['PassengerId','Pclass','Sex', 'Age','Parch','Embarked','Survived']].copy()
test_set = test_data[['PassengerId','Pclass','Sex', 'Age','Parch','Embarked']].copy()


# Apply the same recoding to both frames.
change_age_v2(train_set)
change_age_v2(test_set)

change_Pclass(train_set)
change_Pclass(test_set)

#change_SibSp(train_set)
#change_SibSp(test_set)

change_Parch(train_set)
change_Parch(test_set)

change_Embarked(train_set)
change_Embarked(test_set)


# Position of the label column (last column of train_set).
nb_var = train_set.shape[1]-1

# Features are the columns between PassengerId (excluded) and the label.
train_X_s = (train_set.iloc[:,1:nb_var]).to_numpy()
train_Y_s = np.asanyarray(train_set.iloc[:,nb_var])

test_X_s = (test_set.iloc[:,1:nb_var]).to_numpy()

In [74]:
train_set['Pclass'].value_counts(), train_set['Parch'].value_counts(), train_set['Age'].value_counts(dropna=False), train_set['Embarked'].value_counts(dropna=False),train_set['Sex'].value_counts()

In [75]:
# Decision-tree grid search (4-fold CV, recall) on the five recoded features.
model_DecisionTreeClassifier = DecisionTreeClassifier(class_weight="balanced", random_state=0)
param = dict(
    criterion=["gini", "entropy"],
    max_features=[2, 3, 4, 5],
    min_samples_split=[5, 10, 20, 40, 50, 100],
)

grid_DecisionTreeClassifier = GridSearchCV(
    model_DecisionTreeClassifier,
    param_grid=param,
    scoring="recall",
    cv=4,
    n_jobs=-1,
)
resultat_grid_DecisionTreeClassifier = grid_DecisionTreeClassifier.fit(train_X_s, train_Y_s)
resultat_grid_DecisionTreeClassifier.best_params_

In [76]:
# Decision tree refitted with fixed hyper-parameters.
# NOTE(review): hard-coded values - confirm they match the grid search output.
final_model_DecisionTreeClassifier = DecisionTreeClassifier(
    criterion='gini',
    max_features=3,
    min_samples_split=100,
    class_weight="balanced",
    random_state=0,
)

evaluation(
    final_model_DecisionTreeClassifier,
    train_X_s, train_Y_s,
    test_X_s, None,
    scoring="recall",
    plot=True,
)


In [77]:
# Random-forest grid search (4-fold CV, recall) on the five recoded features.
model_RandomForestClassifier = RandomForestClassifier(random_state=0, class_weight="balanced")
param = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[3, 5, 10, 20, 50, 100, 200],
    max_features=[2, 3, 4, 5],
    min_samples_split=[5, 10, 20, 40, 50, 80, 100],
)

grid_RandomForestClassifier = GridSearchCV(
    model_RandomForestClassifier,
    param,
    scoring="recall",
    cv=4,
    n_jobs=-1,
)
resultat_grid_RandomForestClassifier = grid_RandomForestClassifier.fit(train_X_s, train_Y_s)
resultat_grid_RandomForestClassifier.best_params_

In [78]:
# Random forest refitted with fixed hyper-parameters; its test-set predictions
# are kept in `y_pred`.
# NOTE(review): hard-coded values - confirm they match the grid search output.
final_model_RandomForestClassifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=5,
    max_features=2,
    min_samples_split=80,
    class_weight="balanced",
    random_state=0,
)
y_pred = evaluation(
    final_model_RandomForestClassifier,
    train_X_s, train_Y_s,
    test_X_s, None,
    scoring="recall",
    plot=True,
)

In [79]:
#On conserve les noms de variable à part
liste_variables = list(train_set.columns[1:nb_var])
importances = final_model_RandomForestClassifier.feature_importances_
indices = np.argsort(importances)
# style du graphique 
plt.style.use('fivethirtyeight')
%matplotlib inline
plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [liste_variables[i] for i in indices])
plt.xlabel('Relative Importance')

In [None]:
# AdaBoost over class-weighted decision trees; the grid tunes both the inner
# tree (keys prefixed with `base_estimator__`) and the boosting parameters.
# NOTE(review): recent scikit-learn releases renamed `base_estimator` to
# `estimator` (the nested grid keys then become `estimator__...`); confirm
# which name the installed version expects before running this cell.
model_AdaBoost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight = "balanced"), random_state =0 )
param = {'base_estimator__criterion': ['gini','entropy'], 'base_estimator__min_samples_split' : [5, 10, 20, 40, 50,80, 100],
         'base_estimator__max_features':[2,3,4,5] , 'n_estimators' : [3,5,10,20,50,100],'learning_rate':[0.01,0.1,1] }

# 4-fold cross-validated search optimising recall, using all available cores.
grid_AdaBoost = GridSearchCV(model_AdaBoost, param, cv = 4, scoring = "recall", n_jobs=-1)
resultat_grid_AdaBoost = grid_AdaBoost.fit(train_X_s, train_Y_s)
resultat_grid_AdaBoost.best_params_

In [None]:
# AdaBoost (class-weighted trees) refitted with fixed hyper-parameters.
# NOTE(review): the values are hard-coded rather than read from the grid
# search's `best_params_` - confirm they match its output.
# NOTE(review): `base_estimator` was renamed `estimator` in recent
# scikit-learn releases; confirm against the installed version.
final_model_AdaBoost = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(class_weight="balanced", criterion="gini", max_features=3, min_samples_split=100),n_estimators = 5, learning_rate=0.01, random_state=0 ) 
evaluation(final_model_AdaBoost, train_X_s, train_Y_s, test_X_s, None, scoring="recall" , plot =True)

In [None]:
# Same random-forest grid search as before, this time without class weighting.
model_RandomForestClassifier = RandomForestClassifier(random_state=0)
param = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[3, 5, 10, 20, 50, 100, 200],
    max_features=[2, 3, 4, 5],
    min_samples_split=[5, 10, 20, 40, 50, 80, 100],
)

grid_RandomForestClassifier = GridSearchCV(
    model_RandomForestClassifier,
    param,
    scoring="recall",
    cv=4,
    n_jobs=-1,
)
resultat_grid_RandomForestClassifier = grid_RandomForestClassifier.fit(train_X_s, train_Y_s)
resultat_grid_RandomForestClassifier.best_params_

In [None]:
# Unweighted random forest refitted with fixed hyper-parameters.
# NOTE(review): this overwrites both `final_model_RandomForestClassifier` and
# `y_pred` from the class-weighted forest above, so later cells use whichever
# of the two cells ran last.
final_model_RandomForestClassifier = RandomForestClassifier(
    criterion="gini",
    n_estimators=20,
    max_features=2,
    min_samples_split=100,
    random_state=0,
)
y_pred = evaluation(
    final_model_RandomForestClassifier,
    train_X_s, train_Y_s,
    test_X_s, None,
    scoring="recall",
    plot=True,
)

In [None]:
# AdaBoost over unweighted decision trees; the grid tunes both the inner tree
# (keys prefixed with `base_estimator__`) and the boosting parameters.
# NOTE(review): recent scikit-learn releases renamed `base_estimator` to
# `estimator` (the nested grid keys then become `estimator__...`); confirm
# which name the installed version expects before running this cell.
model_AdaBoost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state =0 )
param = {'base_estimator__criterion': ['gini','entropy'], 'base_estimator__min_samples_split' : [5, 10, 20, 40, 50,80, 100],
         'base_estimator__max_features':[2,3,4,5] , 'n_estimators' : [3,5,10,20,50,100], 'learning_rate':[0.01,0.1,1] }

# 4-fold cross-validated search optimising recall, using all available cores.
grid_AdaBoost = GridSearchCV(model_AdaBoost, param, cv = 4, scoring = "recall", n_jobs=-1)
resultat_grid_AdaBoost = grid_AdaBoost.fit(train_X_s, train_Y_s)
resultat_grid_AdaBoost.best_params_

In [None]:
# AdaBoost (unweighted trees) refitted with fixed hyper-parameters.
# NOTE(review): the values are hard-coded rather than read from the grid
# search's `best_params_` - confirm they match its output.
# NOTE(review): `base_estimator` was renamed `estimator` in recent
# scikit-learn releases; confirm against the installed version.
final_model_AdaBoost = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(criterion="gini", max_features=3, min_samples_split=80),n_estimators = 5, learning_rate=1, random_state=0 ) 
evaluation(final_model_AdaBoost, train_X_s, train_Y_s, test_X_s, None, scoring="recall" , plot =True)

In [80]:
# Attach the random forest's predictions on the training features to a copy
# of the training data, so that errors can be inspected row by row.
y_pred_train = final_model_RandomForestClassifier.predict(train_X_s)
data_pred = train_data.assign(y_pred_train=y_pred_train)
data_pred

In [83]:
# Rows where the training prediction disagrees with the true label.
is_misclassified = data_pred['Survived'].ne(data_pred['y_pred_train'])
diff = data_pred[is_misclassified]
diff

In [84]:
# Misclassified training rows: counts per class, then the same split by sex.
plt.figure(figsize=(10,5))
sns.countplot( x="Pclass" , hue="Survived" , data=diff)

# catplot is figure-level: it creates its own figure, so opening one with
# plt.figure() beforehand would only produce an empty extra figure.
sns.catplot( x="Pclass" , hue="Survived", col='Sex' , data=diff, kind='count')

In [85]:
# Survival counts per recoded age value, then the same split by sex.
plt.figure(figsize=(10,5))
sns.countplot( x="Age" , hue="Survived" , data=train_set)

# catplot is figure-level: it creates its own figure, so opening one with
# plt.figure() beforehand would only produce an empty extra figure.
sns.catplot( x="Age" , hue="Survived", col='Sex' , data=train_set, kind='count')

In [None]:
train_data

In [None]:
train_data.isna().sum(), test_data.isna().sum()

In [None]:
y_pred

In [None]:
# Two-column submission file: the test passenger ids next to the predictions
# currently held in `y_pred`.
submission_columns = {
    'PassengerId': test_data.PassengerId,
    'Survived': y_pred,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
output

# 