In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
train.head()
train.shape

(891, 12)

In [4]:
test.shape


(418, 11)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
train['Age']=train['Age'].fillna(train['Age'].median())
test['Age']=test['Age'].fillna(test['Age'].median())

train['Fare']=train['Fare'].fillna(train['Fare'].median())
test['Fare']=test['Fare'].fillna(test['Fare'].median())




In [8]:
train=train.drop(['Cabin','Ticket'],axis=1)
test=test.drop(['Cabin','Ticket'],axis=1)

train=train.dropna()
test=test.dropna()

In [9]:
print(train.shape)
print(test.shape)

(891, 10)
(418, 9)


In [10]:
titles_ignore =['Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer','Dona'] 

# Extracting the Titles from Name....
# Step01: Split the Names into Surname and First Name
newnames_train= train["Name"].apply(lambda x:x.split(", ")[-1])
newnames_test= test["Name"].apply(lambda x:x.split(", ")[-1])



# Extracting the Titles from Name....
# Step02: Split the Names into Title and First Name
titles_train = newnames_train.apply(lambda x: x.split(". ")[0])

titles_test = newnames_test.apply(lambda x: x.split(". ")[0])


# titles that are to be kept - Mr, Mrs, Miss, Master & Others
titles_ignore =['Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer','Dona'] 

# replacing titles_ignore with Others
titles_train.replace(to_replace=titles_ignore, value="Others", inplace=True)
titles_test.replace(to_replace=titles_ignore, value="Others", inplace=True)


train['Name']=titles_train
test['Name']=titles_test

In [11]:
train['Family']=train['SibSp']+train['Parch']+1
test['Family']=test['SibSp']+test['Parch']+1

In [12]:
train[['Survived','Pclass']]=train[['Survived','Pclass']].astype('str')
test['Pclass']=test['Pclass'].astype('str')



In [13]:
# Size of the Family

def size_family(x):
    if (x==1):
        return('Single')
    elif(x>1) and (x<=4):
        return('Small')
    elif(x>4):
        return('Large')
    
# Applying the Function to the Family Variable:: Train Set
train["Family Size"]=train["Family"].apply(size_family)
test["Family Size"]=test["Family"].apply(size_family)

In [14]:
target='Survived'
IDcol='PassengerId'
predictors= [x for x in train.columns if x not in [target, IDcol]]


In [15]:
train1=pd.get_dummies(train[predictors])
test=pd.get_dummies(test[predictors])

train=pd.concat([train1,train['Survived']],axis=1)



In [16]:
train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Family,Pclass_1,Pclass_2,Pclass_3,Name_Master,Name_Miss,...,Name_Others,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Family Size_Large,Family Size_Single,Family Size_Small,Survived
0,22.0,1,0,7.25,2,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
1,38.0,1,0,71.2833,2,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
2,26.0,0,0,7.925,1,0,0,1,0,1,...,0,1,0,0,0,1,0,1,0,1
3,35.0,1,0,53.1,2,1,0,0,0,0,...,0,1,0,0,0,1,0,0,1,1
4,35.0,0,0,8.05,1,0,0,1,0,0,...,0,0,1,0,0,1,0,1,0,0


In [17]:
test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Family,Pclass_1,Pclass_2,Pclass_3,Name_Master,Name_Miss,...,Name_Mrs,Name_Others,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Family Size_Large,Family Size_Single,Family Size_Small
0,34.5,0,0,7.8292,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0
1,47.0,1,0,7.0,2,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1
2,62.0,0,0,9.6875,1,0,1,0,0,0,...,0,0,0,1,0,1,0,0,1,0
3,27.0,0,0,8.6625,1,0,0,1,0,0,...,0,0,0,1,0,0,1,0,1,0
4,22.0,1,1,12.2875,3,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1


In [18]:
print(train.shape)
print(test.shape)



(891, 22)
(418, 21)


In [19]:
scale_cols=['Age','Fare']


In [20]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

sc.fit(train[scale_cols])
train[scale_cols]=pd.DataFrame(sc.transform(train[scale_cols]),index=train[scale_cols].index)

test[scale_cols]=pd.DataFrame(sc.transform(test[scale_cols]),index=test[scale_cols].index)



In [21]:
test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Family,Pclass_1,Pclass_2,Pclass_3,Name_Master,Name_Miss,...,Name_Mrs,Name_Others,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Family Size_Large,Family Size_Single,Family Size_Small
0,0.394887,0,0,-0.490783,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0
1,1.35551,1,0,-0.507479,2,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1
2,2.508257,0,0,-0.453367,1,0,1,0,0,0,...,0,0,0,1,0,1,0,0,1,0
3,-0.181487,0,0,-0.474005,1,0,0,1,0,0,...,0,0,0,1,0,0,1,0,1,0
4,-0.565736,1,1,-0.401017,3,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1


In [22]:
# train.to_csv('train_featured.csv')
# test.to_csv('test_featured.csv')

In [23]:
from sklearn.model_selection import train_test_split

In [24]:

predictors1=[x for x in train.columns if x not in ['Survived']]
predictors1

['Age',
 'SibSp',
 'Parch',
 'Fare',
 'Family',
 'Pclass_1',
 'Pclass_2',
 'Pclass_3',
 'Name_Master',
 'Name_Miss',
 'Name_Mr',
 'Name_Mrs',
 'Name_Others',
 'Sex_female',
 'Sex_male',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S',
 'Family Size_Large',
 'Family Size_Single',
 'Family Size_Small']

In [25]:
x=train[predictors1]
y=train[target]

xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=101)

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [27]:
Lr=LogisticRegression()

In [28]:
xtrain.shape

(712, 21)

In [29]:
ytrain.shape

(712,)

In [42]:
## code to find the best model for the dataset

## Before using this template you should find the hyperparameters for the KNN 
## and Decision and random Forest the add those parameters in the model below 

## After selecting the best model we can use KFold CV to check for the bias error and variance error

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from prettytable import PrettyTable
from sklearn import metrics

report= PrettyTable()
report.field_names=['Model name','Accuracy_score','Precision_score','Recall_score','F1_score']


regressor=['LogisticRegression','KNN','DecisionTreeClassifier','RandomForestClassifier']
accuracy=[]
precision=[]
recall=[]
f1_score=[]

for regressor in regressor:
    if regressor=='LogisticRegression':
        model1=LogisticRegression()
        model1.fit(xtrain,ytrain)
        log_pred=pd.DataFrame(model1.predict(xtest))
        #Evaluation metrics
        report.add_row([regressor,
                    metrics.accuracy_score(ytest,log_pred),
                    metrics.precision_score(ytest,log_pred,average='weighted'),
                    metrics.recall_score(ytest,log_pred,average='weighted'),
                    metrics.f1_score(ytest,log_pred,average='weighted')])
        
    elif regressor=='KNN': 
        model2=KNeighborsClassifier(n_neighbors=4,p=35)
        model2.fit(xtrain,ytrain)
        knn_pred=model2.predict(xtest)
        #Evaluation metrics
        report.add_row([regressor,
                    metrics.accuracy_score(ytest,knn_pred),
                    metrics.precision_score(ytest,knn_pred,average='weighted'),
                    metrics.recall_score(ytest,knn_pred,average='weighted'),
                    metrics.f1_score(ytest,knn_pred,average='weighted')])
    elif regressor=='DecisionTreeClassifier':
        model3=DecisionTreeClassifier(min_samples_leaf=6,max_leaf_nodes=8,max_depth=8,criterion='entropy')
        model3.fit(xtrain,ytrain)
        dec_pred=model3.predict(xtest)
        #Evaluation metrics
        report.add_row([regressor,
                    metrics.accuracy_score(ytest,dec_pred),
                    metrics.precision_score(ytest,dec_pred,average='weighted'),
                    metrics.recall_score(ytest,dec_pred,average='weighted'),
                    metrics.f1_score(ytest,dec_pred,average='weighted')])
        
    elif regressor=='RandomForestClassifier':
        model4=RandomForestClassifier(criterion='entropy',n_estimators=45,max_features=8)
        model4.fit(xtrain,ytrain)
        random_pred=model4.predict(xtest)
        #Evaluation metrics
        report.add_row([regressor,
                    metrics.accuracy_score(ytest,random_pred),
                    metrics.precision_score(ytest,random_pred,average='weighted'),
                    metrics.recall_score(ytest,random_pred,average='weighted'),
                    metrics.f1_score(ytest,random_pred,average='weighted')])
print(report)

+------------------------+--------------------+--------------------+--------------------+--------------------+
|       Model name       |   Accuracy_score   |  Precision_score   |    Recall_score    |      F1_score      |
+------------------------+--------------------+--------------------+--------------------+--------------------+
|   LogisticRegression   | 0.8268156424581006 | 0.8313257138092458 | 0.8268156424581006 | 0.8244928938770828 |
|          KNN           | 0.7821229050279329 | 0.8146119642528262 | 0.7821229050279329 | 0.770691013852729  |
| DecisionTreeClassifier | 0.8379888268156425 | 0.8451360459593332 | 0.8379888268156425 | 0.8353135573215832 |
| RandomForestClassifier | 0.8044692737430168 | 0.806631042020889  | 0.8044692737430168 | 0.8023972420271179 |
+------------------------+--------------------+--------------------+--------------------+--------------------+


In [46]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
model_ada=AdaBoostClassifier()

In [44]:
ada_pred=model_ada.fit(xtrain,ytrain).predict(xtest)

In [45]:
print(metrics.classification_report(ytest,ada_pred))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83        99
           1       0.83      0.71      0.77        80

   micro avg       0.80      0.80      0.80       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.80      0.80       179



In [57]:
from sklearn.model_selection import RandomizedSearchCV

In [58]:
ada_param={'base_estimator':[model1,model3,model4],'n_estimators':[50,100,200,300]}

In [59]:
grid=RandomizedSearchCV(model_ada,ada_param,cv=3)

grid.fit(xtrain,ytrain)


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=300, random_state=None),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'base_estimator': [LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=Fal..., random_state=None, verbose=0,
            warm_start=False)], 'n_estimators': [50, 100, 200, 300]},
          pre_dispatch='2*n_jobs', random_state

In [60]:
grid.best_params_

{'n_estimators': 300,
 'base_estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='warn',
           tol=0.0001, verbose=0, warm_start=False)}

In [61]:
model_ada=AdaBoostClassifier(base_estimator=LogisticRegression(),n_estimators=300)

ada_pred=model_ada.fit(xtrain,ytrain).predict(xtest)

print(metrics.classification_report(ytest,ada_pred))

              precision    recall  f1-score   support

           0       0.80      0.91      0.85        99
           1       0.87      0.72      0.79        80

   micro avg       0.83      0.83      0.83       179
   macro avg       0.83      0.82      0.82       179
weighted avg       0.83      0.83      0.82       179



In [62]:
from sklearn.svm import SVC

In [63]:
sv=SVC()

sv.fit(xtrain,ytrain)

sv_pred=sv.predict(xtest)

print(metrics.classification_report(ytest,sv_pred))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86        99
           1       0.89      0.71      0.79        80

   micro avg       0.83      0.83      0.83       179
   macro avg       0.85      0.82      0.83       179
weighted avg       0.84      0.83      0.83       179



In [64]:
from sklearn.ensemble import GradientBoostingClassifier

In [69]:
model_gbc=AdaBoostClassifier()

gbc_pred=model_gbc.fit(xtrain,ytrain).predict(xtest)

print(metrics.classification_report(ytest,gbc_pred))
print(model_gbc.score(xtest,ytest))


              precision    recall  f1-score   support

           0       0.79      0.88      0.83        99
           1       0.83      0.71      0.77        80

   micro avg       0.80      0.80      0.80       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.80      0.80       179

0.8044692737430168


In [70]:
# XG Boost
import xgboost as xgb

xg = xgb.XGBClassifier(max_depth=2, learning_rate=0.01) # 0.78947

# Fitting the Model
xgb_pred = xg.fit(xtrain,ytrain).predict(xtest)
print(metrics.classification_report(ytest,xgb_pred))
print(xg.score(xtest,ytest))


              precision    recall  f1-score   support

           0       0.79      0.93      0.86        99
           1       0.89      0.70      0.78        80

   micro avg       0.83      0.83      0.83       179
   macro avg       0.84      0.81      0.82       179
weighted avg       0.84      0.83      0.82       179

0.8268156424581006
