In [41]:
from warnings import filterwarnings
filterwarnings('ignore')

In [42]:
import pandas as pd
df = pd.read_csv('train_titanic.csv')

In [43]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [45]:
s = df.isna().sum()
s[s>0]

Age         177
Cabin       687
Embarked      2
dtype: int64

### create X and Y 

In [46]:
X = df.drop(columns=['PassengerId','Name','Ticket','Survived'],axis=1)
Y = df[['Survived']]

In [47]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.925,,S
3,1,female,35.0,1,0,53.1,C123,S
4,3,male,35.0,0,0,8.05,,S


In [48]:
Y.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


### Separate Cat and con

In [49]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [50]:
cat

['Sex', 'Cabin', 'Embarked']

In [51]:
con

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [52]:
cat1 = ['Sex','Embarked']
cat2 =['Cabin']

In [53]:
cat1

['Sex', 'Embarked']

In [54]:
cat2

['Cabin']

### Build a pipeline

In [55]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [56]:
# Numeric Pipeline
num_pipe = Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                           ('scaler',StandardScaler())])
# categorical Pipeline 1
cat_pipe1 = Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                             ('OHE', OneHotEncoder(handle_unknown='ignore') )])

#catgeorical pipeline 2
cat_pipe2 = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant', fill_value='unknown')),
                            ('OHE', OneHotEncoder(handle_unknown='ignore'))])
pre = ColumnTransformer([('num', num_pipe, con),
                        ('cat1',cat_pipe1,cat1),
                        ('cat2',cat_pipe2,cat2)])


In [57]:
X_pre = pre.fit_transform(X).toarray()
X_pre

array([[ 0.82737724, -0.56573646,  0.43279337, ...,  0.        ,
         0.        ,  1.        ],
       [-1.56610693,  0.66386103,  0.43279337, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.82737724, -0.25833709, -0.4745452 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.82737724, -0.1046374 ,  0.43279337, ...,  0.        ,
         0.        ,  1.        ],
       [-1.56610693, -0.25833709, -0.4745452 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.82737724,  0.20276197, -0.4745452 , ...,  0.        ,
         0.        ,  1.        ]])

In [58]:
cols = pre.get_feature_names_out()
cols

array(['num__Pclass', 'num__Age', 'num__SibSp', 'num__Parch', 'num__Fare',
       'cat1__Sex_female', 'cat1__Sex_male', 'cat1__Embarked_C',
       'cat1__Embarked_Q', 'cat1__Embarked_S', 'cat2__Cabin_A10',
       'cat2__Cabin_A14', 'cat2__Cabin_A16', 'cat2__Cabin_A19',
       'cat2__Cabin_A20', 'cat2__Cabin_A23', 'cat2__Cabin_A24',
       'cat2__Cabin_A26', 'cat2__Cabin_A31', 'cat2__Cabin_A32',
       'cat2__Cabin_A34', 'cat2__Cabin_A36', 'cat2__Cabin_A5',
       'cat2__Cabin_A6', 'cat2__Cabin_A7', 'cat2__Cabin_B101',
       'cat2__Cabin_B102', 'cat2__Cabin_B18', 'cat2__Cabin_B19',
       'cat2__Cabin_B20', 'cat2__Cabin_B22', 'cat2__Cabin_B28',
       'cat2__Cabin_B3', 'cat2__Cabin_B30', 'cat2__Cabin_B35',
       'cat2__Cabin_B37', 'cat2__Cabin_B38', 'cat2__Cabin_B39',
       'cat2__Cabin_B4', 'cat2__Cabin_B41', 'cat2__Cabin_B42',
       'cat2__Cabin_B49', 'cat2__Cabin_B5', 'cat2__Cabin_B50',
       'cat2__Cabin_B51 B53 B55', 'cat2__Cabin_B57 B59 B63 B66',
       'cat2__Cabin_B58 B60',

In [59]:
X_pre = pd.DataFrame(X_pre,columns=cols)
X_pre.head()

Unnamed: 0,num__Pclass,num__Age,num__SibSp,num__Parch,num__Fare,cat1__Sex_female,cat1__Sex_male,cat1__Embarked_C,cat1__Embarked_Q,cat1__Embarked_S,...,cat2__Cabin_F E69,cat2__Cabin_F G63,cat2__Cabin_F G73,cat2__Cabin_F2,cat2__Cabin_F33,cat2__Cabin_F38,cat2__Cabin_F4,cat2__Cabin_G6,cat2__Cabin_T,cat2__Cabin_unknown
0,0.827377,-0.565736,0.432793,-0.473674,-0.502445,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.566107,0.663861,0.432793,-0.473674,0.786845,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.827377,-0.258337,-0.474545,-0.473674,-0.488854,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.566107,0.433312,0.432793,-0.473674,0.42073,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.827377,0.433312,-0.474545,-0.473674,-0.486337,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [60]:
X_pre.shape

(891, 158)

### Train-Test Split

In [61]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X_pre, Y, test_size=0.2, random_state=21)

In [62]:
xtrain.shape

(712, 158)

In [63]:
xtest.shape

(179, 158)

In [64]:
ytest.shape

(179, 1)

In [65]:
ytrain.shape

(712, 1)

### Evaluate the Algorithms

In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [67]:
dct = {'LogisticRegression': LogisticRegression(),
       'DecisionTree':DecisionTreeClassifier(),
       'RandomForest':RandomForestClassifier(),
       'GradientBoosting':GradientBoostingClassifier(),
       'SVM': SVC()
       
}

In [68]:
dct.items()

dict_items([('LogisticRegression', LogisticRegression()), ('DecisionTree', DecisionTreeClassifier()), ('RandomForest', RandomForestClassifier()), ('GradientBoosting', GradientBoostingClassifier()), ('SVM', SVC())])

In [69]:
dct.keys()

dict_keys(['LogisticRegression', 'DecisionTree', 'RandomForest', 'GradientBoosting', 'SVM'])

In [70]:
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
tr = []
ts = []
tr_cv = []
for name, model in dct.items():
    
    ## fit the model on training data
    model.fit(xtrain,ytrain)
    ypred_tr = model.predict(xtrain)
    ypred_ts = model.predict(xtest)
    
    # Calculate F1 score for testing
    f1_tr = f1_score(ytrain,ypred_tr)

    ## Calculate F1 score validated results
    scores = cross_val_score(model,xtrain,ytrain,cv=5,scoring='f1')
    f1_tr_cv = scores.mean()

    # Calculate F1 score in Testing
    f1_ts = f1_score(ytest,ypred_ts)

    # Append all values in the list

    tr.append(f1_tr)
    tr_cv.append(f1_tr_cv)
    ts.append(f1_ts)

    # Print the results
    print(f'Model Name: {name}\n')
    print(f'Training Score: {f1_tr:.4f}\n')
    print(f'Training CV:{f1_tr_cv:.4f}\n')
    print(f'Testing Score: {f1_ts:.4f}\n')
    print('\n======================================\n')




Model Name: LogisticRegression

Training Score: 0.7672

Training CV:0.7131

Testing Score: 0.7746



Model Name: DecisionTree

Training Score: 0.9849

Training CV:0.6965

Testing Score: 0.7518





Model Name: RandomForest

Training Score: 0.9849

Training CV:0.7187

Testing Score: 0.7826



Model Name: GradientBoosting

Training Score: 0.8715

Training CV:0.7342

Testing Score: 0.7794



Model Name: SVM

Training Score: 0.7619

Training CV:0.7502

Testing Score: 0.7852





## Create a Dataframe for model evaluation 

In [71]:
dct_eval = { 'Name': list(dct.keys()),
            'TrainF1':tr,
            'trainF1CV':tr_cv,
            'TestF1':ts }

In [72]:
df_eval = pd.DataFrame(dct_eval)
df_eval

Unnamed: 0,Name,TrainF1,trainF1CV,TestF1
0,LogisticRegression,0.767176,0.713137,0.774648
1,DecisionTree,0.984906,0.696517,0.751773
2,RandomForest,0.984906,0.718672,0.782609
3,GradientBoosting,0.871486,0.734179,0.779412
4,SVM,0.761905,0.750163,0.785185


In [73]:
df_eval.sort_values(by='TestF1', ascending= False)

Unnamed: 0,Name,TrainF1,trainF1CV,TestF1
4,SVM,0.761905,0.750163,0.785185
2,RandomForest,0.984906,0.718672,0.782609
3,GradientBoosting,0.871486,0.734179,0.779412
0,LogisticRegression,0.767176,0.713137,0.774648
1,DecisionTree,0.984906,0.696517,0.751773


### As per the score selecting Gradient Boosting Model

In [74]:
params = {'learning_rate':[0.001, 0.01, 0.05, 0.1],
          'n_estimators':[10,50,100,200],
          'max_depth':[3,4,5,6,7,8,9,10],
          'min_samples_split':[6,7,8,9,10]}

In [75]:
from sklearn.model_selection import RandomizedSearchCV
GB = GradientBoostingClassifier(random_state=21)
rscv = RandomizedSearchCV(GB, param_distributions= params, cv =5, scoring= 'f1' )
rscv.fit(xtrain,ytrain)