# **Titanic data Modelling**

* This notebook focuses on applying algorithms to the train and test data.
* The EDA part is located in another notebook.
* Algorithms I used:
    1. XGBoost
    2. SVC

# Import required libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

plt.style.use('ggplot')

# Import data

In [None]:
# Read the raw Kaggle CSV files, then work on copies so that df_train / df_test
# keep the untouched originals (df_test is reused later for the submission).
df_train, df_test = (
    pd.read_csv('/kaggle/input/titanic/train.csv'),
    pd.read_csv('/kaggle/input/titanic/test.csv'),
)
train, test = df_train.copy(), df_test.copy()

**Features details**
* survival : Survival	0 = No, 1 = Yes
* pclass : Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
* sex	: Sex	
* Age	: Age in years	
* sibsp	: # of siblings / spouses aboard the Titanic	
* parch	: # of parents / children aboard the Titanic	
* ticket	: Ticket number	
* fare	: Passenger fare	
* cabin	: Cabin number	
* embarked	: Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

**Variable Notes**
* pclass: A proxy for socio-economic status (SES)
* 1st = Upper
* 2nd = Middle
* 3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

sibsp: The dataset defines family relations in this way
* Sibling = brother, sister, stepbrother, stepsister
* Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
* Parent = mother, father
* Child = daughter, son, stepdaughter, stepson
* Some children travelled only with a nanny, therefore parch=0 for them.

In [None]:
# Print a structural summary and descriptive statistics for both frames.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would add a stray "None" line.
for label, frame in (('Train data', train), ('Test data', test)):
    print(label, '\n')
    frame.info()
    print('\n')
    print(frame.describe(), '\n')

print('Train data : {}'.format(train.shape))
print('Test data : {}'.format(test.shape))

In [None]:
train

In [None]:
test

# Data Manipulation and Cleaning

In [None]:
def col_counts(col):
    '''Show the value counts of ``col`` for the train frame, then the test frame.

    Reads the notebook-level ``train`` and ``test`` DataFrames. When a frame
    does not contain ``col`` a short notice is printed for it instead.
    Nothing is returned; all results are printed.
    '''
    if col not in train.columns:
        print('Train\'s data does not have {} column.'.format(col))
    else:
        train_counts = train[col].value_counts()
        print('Train\'s {} : '.format(col))
        print(train_counts, '\n')

    if col not in test.columns:
        print('Test\'s data does not have {} column.'.format(col))
    else:
        test_counts = test[col].value_counts()
        print('Test\'s {} : '.format(col))
        print(test_counts)

**Drop the PassengerId and Ticket columns**

In [None]:
def drop_col(data, column):
    '''Remove one column (or a list of columns) from ``data`` in place.

    The frame passed in is mutated directly; because the drop is done in
    place, the return value is always None.
    '''
    return data.drop(columns=column, inplace=True)

# Remove the PassengerId and Ticket columns from both working frames (in place).
for frame in (train, test):
    drop_col(frame, ['PassengerId', 'Ticket'])

**Survived**

In [None]:
col_counts('Survived')

**Name**

**Extract the title from the name of passenger**

In [None]:
def _title_from_name(full_name):
    '''Return the text after the first comma of ``full_name``, up to the first period, stripped.'''
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()

train['Title'] = train.Name.apply(_title_from_name)
test['Title'] = test.Name.apply(_title_from_name)

def title_type(row):
    '''Collapse a raw passenger title into a coarser label.

    Titles on the low-count list become 'Rare', 'Miss' is mapped to 'Ms',
    and every other value is returned unchanged.
    '''
    # titles that are low in counts
    rare_titles = ('Don', 'Mme', 'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt',
                   'the Countess', 'Jonkheer', 'Dona', 'Dr', 'Rev')
    if row in rare_titles:
        return 'Rare'
    return 'Ms' if row == 'Miss' else row
    
# Replace the raw titles with their grouped label, then discard the Name
# column they were extracted from (both frames are modified in place).
for frame in (train, test):
    frame['Title'] = frame['Title'].apply(title_type)
    drop_col(frame, 'Name')

In [None]:
col_counts('Title')

**Sex**

In [None]:
# Capitalise the two sex labels; Series.map turns any value that is not a key
# of the mapping into NaN.
sex_labels = {'male':'Male','female':'Female'}
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_labels)

In [None]:
col_counts('Sex')

**Family**

In [None]:
# Family size = siblings/spouses + parents/children + 1; the two source
# columns are dropped once they have been combined.
for frame in (train, test):
    frame['Family'] = frame['SibSp'] + frame['Parch'] + 1
    drop_col(frame, ['SibSp', 'Parch'])

In [None]:
col_counts('Family')

In [None]:
# Bucket the family size into four named groups using the same edges for both
# frames (pd.cut intervals are closed on the right by default).
family_bins = [0,1,4,7,11]
family_labels = ['Single', 'Small', 'Medium', 'Large']
train['Family_type'] = pd.cut(train['Family'], bins=family_bins, labels=family_labels)
test['Family_type'] = pd.cut(test['Family'], bins=family_bins, labels=family_labels)

In [None]:
col_counts('Family_type')

**Age**

Divide into 3 stages: Child, Adult, Elderly

In [None]:
def age_diff(row):
    '''Bucket an age in years into a life-stage label.

    Parameters
    ----------
    row : float
        A single age value; NaN when the age is missing.

    Returns
    -------
    str or float
        'Child' for ages below 18, 'Adult' for ages from 18 up to (but not
        including) 60, and 'Elderly' for 60 and above. A missing age is
        returned as NaN so that it stays missing instead of being counted
        as 'Elderly'.
    '''
    if pd.isna(row):
        # NaN fails every comparison below, so without this guard a missing
        # age would fall through to the final 'Elderly' branch.
        return np.nan
    if row < 18:
        return 'Child'
    if row < 60:
        return 'Adult'
    return 'Elderly'

# Derive the categorical age stage for both frames from the numeric Age column.
for frame in (train, test):
    frame['Age_cat'] = frame['Age'].apply(age_diff)

In [None]:
col_counts('Age_cat')

**Cabin**

In [None]:
# The first character of the cabin code is kept as the floor/deck label.
# Series.str[0] leaves missing cabins as NaN, so there is no need to turn NaN
# into the string 'nan' and then translate its leading 'n' back to NaN.
for frame in (train, test):
    frame['Cabin_floor'] = frame['Cabin'].str[0]
    # the raw cabin code is not needed once the label has been extracted
    drop_col(frame, 'Cabin')

In [None]:
col_counts('Cabin_floor')

# Modelling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Random seed passed as random_state to train_test_split and to the estimators below.
seed = 225

In [None]:
# Separate the feature matrix from the Survived target.
X = train.drop(columns='Survived')
y = train['Survived']

**XG with Age as categorical**

In [None]:
# define columns for numerical and categorical
num_cols = ['Fare']

cat_cols = ['Pclass', 'Sex','Embarked','Title','Family_type','Age_cat']

# pipeline for preprocessing of numerical and categorical data:
# categorical columns are mode-imputed then one-hot encoded,
# numerical columns are median-imputed
cat_transformer = Pipeline(steps = [
    ('Cat_Imputer', SimpleImputer(strategy = 'most_frequent')),
    ('OneHotEncoder', OneHotEncoder(handle_unknown = 'ignore')),
])
num_transformer = Pipeline(steps = [('Num_Imputer', SimpleImputer(strategy = 'median'))])

preprocessor = ColumnTransformer(transformers = [
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

# pipeline for modeling; the step name 'XG' is the prefix of the grid keys below
titanic_pipeline = Pipeline(steps = [
    ('Preprocessor', preprocessor),
    ('XG', XGBClassifier(random_state = seed)),
])

# hold out 30% of the labelled data for evaluation
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = seed)

params = {
    'XG__learning_rate' : [0.1,0.2],
    'XG__gamma' : [0.001,0.01,1,10],
    'XG__max_depth' : [4,6,8,10],
    'XG__n_estimators' : [400,500],
}

searcher = GridSearchCV(titanic_pipeline,params, cv = 3, verbose = 1, n_jobs = -1 )

searcher.fit(X_train,y_train)

print('Best params : {}'.format(searcher.best_params_))
print('Best score : {:.2f}'.format(searcher.best_score_))

y_pred_train = searcher.predict(X_train)
y_pred_test = searcher.predict(X_test)
# ROC AUC is a ranking metric, so it is computed from the positive-class
# probability instead of the already-thresholded 0/1 predictions.
y_score_test = searcher.predict_proba(X_test)[:, 1]

print('XGBoost\'s train score : {:.3f}'.format(accuracy_score(y_train,y_pred_train)))
print('XGBoost\'s test score : {:.3f}'.format(accuracy_score(y_test,y_pred_test)))
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test,y_pred_test))
print('XGBoost\'s roc score : {:.3f}'.format(roc_auc_score(y_test,y_score_test)))

**XG with Age as numerical**

In [None]:
# define columns for numerical and categorical
num_cols = ['Fare','Age']

cat_cols = ['Pclass', 'Sex','Embarked','Title','Family_type']

# pipeline for preprocessing of numerical and categorical data:
# categorical columns are mode-imputed then one-hot encoded,
# numerical columns are median-imputed
cat_transformer = Pipeline(steps = [
    ('Cat_Imputer', SimpleImputer(strategy = 'most_frequent')),
    ('OneHotEncoder', OneHotEncoder(handle_unknown = 'ignore')),
])
num_transformer = Pipeline(steps = [('Num_Imputer', SimpleImputer(strategy = 'median'))])

preprocessor = ColumnTransformer(transformers = [
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

# pipeline for modeling; the learning rate is fixed here and left out of the grid
titanic_pipeline = Pipeline(steps = [
    ('Preprocessor', preprocessor),
    ('XG', XGBClassifier(random_state = seed, learning_rate = 0.1)),
])

# hold out 30% of the labelled data for evaluation
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = seed)

params = {
    'XG__gamma' : [0.001,0.01,1,10,100,1000],
    'XG__max_depth' : [2,4,6,8,10],
    'XG__n_estimators' : [400,500],
}

# kept under its own name because this search is the one used for the
# final predictions on the Kaggle test frame
searcher_xg = GridSearchCV(titanic_pipeline,params, cv = 3, verbose = 1, n_jobs = -1 )

searcher_xg.fit(X_train,y_train)

print('Best params : {}'.format(searcher_xg.best_params_))
print('Best score : {:.2f}'.format(searcher_xg.best_score_))

y_pred_train = searcher_xg.predict(X_train)
y_pred_test = searcher_xg.predict(X_test)
# ROC AUC is a ranking metric, so it is computed from the positive-class
# probability instead of the already-thresholded 0/1 predictions.
y_score_test = searcher_xg.predict_proba(X_test)[:, 1]

print('XGBoost\'s train score : {:.3f}'.format(accuracy_score(y_train,y_pred_train)))
print('XGBoost\'s test score : {:.3f}'.format(accuracy_score(y_test,y_pred_test)))
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test,y_pred_test))
print('XGBoost\'s roc score : {:.3f}'.format(roc_auc_score(y_test,y_score_test)))

**SVC with Age as categorical**

In [None]:
# define columns for numerical and categorical
num_cols = ['Fare']

cat_cols = ['Pclass', 'Sex','Embarked','Title','Family_type','Age_cat']

# categorical columns are mode-imputed then one-hot encoded; the numerical
# column is median-imputed and additionally scaled with RobustScaler
cat_transformer = Pipeline(steps = [
    ('Cat_Imputer', SimpleImputer(strategy = 'most_frequent')),
    ('OneHotEncoder', OneHotEncoder(handle_unknown = 'ignore')),
])
num_transformer = Pipeline(steps = [
    ('Num_Imputer', SimpleImputer(strategy = 'median')),
    ('Scaler', RobustScaler()),
])

preprocessor = ColumnTransformer(transformers = [
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

# pipeline for modeling; the step name 'SVC' is the prefix of the grid keys below
titanic_pipeline = Pipeline(steps = [
    ('Preprocessor', preprocessor),
    ('SVC', SVC(random_state = seed)),
])

# hold out 30% of the labelled data for evaluation
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = seed)

parameters = {'SVC__C':[0.1, 1, 10,100], 'SVC__gamma':[ 0.001, 0.01, 0.1,1,10]}
searcher = GridSearchCV(titanic_pipeline, parameters, cv = 5, n_jobs = -1, verbose = 1)

searcher.fit(X_train,y_train)

print('Best params : {}'.format(searcher.best_params_))
print('Best score : {:.2f}'.format(searcher.best_score_))

y_pred_train = searcher.predict(X_train)
y_pred_test = searcher.predict(X_test)
# ROC AUC is a ranking metric, so it is computed from the continuous decision
# values instead of the 0/1 predictions. decision_function is used because the
# SVC above is not created with probability=True.
y_score_test = searcher.decision_function(X_test)

print('SVC\'s train score : {:.3f}'.format(accuracy_score(y_train,y_pred_train)))
print('SVC\'s test score : {:.3f}'.format(accuracy_score(y_test,y_pred_test)))
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test,y_pred_test))
print('SVC\'s roc score : {:.3f}'.format(roc_auc_score(y_test,y_score_test)))

# Applying model on test data

In [None]:
# Predict on the cleaned Kaggle test frame with searcher_xg, the grid search
# fitted in the "XG with Age as numerical" cell.
output = searcher_xg.predict(test)

In [None]:
# Attach the predictions to the original test frame and keep only the two
# columns that are written to the submission file.
df_test = df_test.assign(Survived=output)[['PassengerId', 'Survived']]
print(df_test.shape)

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
df_test.to_csv('submission_2.csv', index=False)