# <font color=blue>Assignments for "Cross Validation"</font>

In this assignment, you are going to measure the performance of the model you created with the Titanic dataset in the previous lesson. To complete this assignment, send a link to a Jupyter notebook containing solutions to the following tasks.

- Evaluate your model's performance with cross validation and using different metrics.
- Determine the model with the most appropriate parameters by hyperparameter tuning.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action="ignore")

pd.options.display.max_columns = None

In [3]:
titanic =  pd.read_csv("C:/Users/Elif/data/titanic.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
titanic.groupby(["Embarked"])["PassengerId"].count()

Embarked
C    130
Q     28
S    554
Name: PassengerId, dtype: int64

In [10]:
titanic["EmbarkedC"]=titanic["Embarked"].replace("C",1)
titanic["EmbarkedC"].replace("S",0,inplace=True)
titanic["EmbarkedC"].replace("Q",0,inplace=True)
titanic["EmbarkedS"]=titanic["Embarked"].replace("S",1)
titanic["EmbarkedS"].replace("Q",0,inplace=True)
titanic["EmbarkedS"].replace("C",0,inplace=True)
titanic["EmbarkedQ"]=titanic["Embarked"].replace("Q",1)
titanic["EmbarkedQ"].replace("S",0,inplace=True)
titanic["EmbarkedQ"].replace("C",0,inplace=True)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_,EmbarkedC,EmbarkedS,EmbarkedQ
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,0,1,0


In [8]:
titanic.dropna(subset=['Age', 'Embarked'],inplace=True)

In [13]:
titanic["is_male"] = pd.get_dummies(titanic.Sex, drop_first=True)

In [17]:
X = titanic[["Pclass","Sex_","Age","Parch","is_male","Fare"]]
y = titanic["Survived"] 

In [18]:
log_reg = LogisticRegression()
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.20, random_state=111)
log_reg.fit(X_train, y_train)

LogisticRegression()

In [19]:
train_accuracy = log_reg.score(X_train, y_train)
test_accuracy = log_reg.score(X_test, y_test)
print('Accuracy of test data ',test_accuracy)


Accuracy of test data  0.7832167832167832


In [27]:
from sklearn.model_selection import cross_validate, cross_val_score


In [28]:
log_reg_model = LogisticRegression()

cv = cross_validate(estimator=log_reg_model,
                    X=X,
                    y=y,
                    cv=10,
                    return_train_score=True,
                   )

print('Train Scores :', cv['train_score'], sep = '\n')
print("-"*50)
print('Test Scores     :', cv['test_score'], sep = '\n')

Train Scores :
[0.79375    0.7953125  0.7925117  0.78627145 0.79407176 0.7925117
 0.79563183 0.79875195 0.78783151 0.79407176]
--------------------------------------------------
Test Scores     :
[0.77777778 0.73611111 0.78873239 0.87323944 0.76056338 0.77464789
 0.8028169  0.76056338 0.81690141 0.78873239]


In [29]:
print('Mean score of train set : ', cv['train_score'].mean())
print('Mean score of test set  : ', cv['test_score'].mean())

Mean score of train set :  0.7930716166146647
Mean score of test set  :  0.788008607198748


In [31]:
cv = cross_validate(estimator=log_reg_model,
                    X=X,
                    y=y,
                    cv=10,
                    return_train_score=True,
                    scoring = ['accuracy', 'precision', 'r2','recall']
                   )

In [32]:
print('Train Set Mean Accuracy  : {:.2f}  '.format(cv['train_accuracy'].mean()))
print('Train Set Mean R-square  : {:.2f}  '.format(cv['train_r2'].mean()))
print('Train Set Mean Precision : {:.2f}'.format(cv['train_precision'].mean()))
print('Train Set Mean recall : {:.2f}\n'.format(cv['train_recall'].mean()))

print('Test Set Mean Accuracy   : {:.2f}  '.format(cv['test_accuracy'].mean()))
print('Test Set Mean R-square   : {:.2f}  '.format(cv['test_r2'].mean()))
print('Test Set Mean Precision  : {:.2f}  '.format(cv['test_precision'].mean()))
print('Test Set Mean recall  : {:.2f}  '.format(cv['test_recall'].mean()))

Train Set Mean Accuracy  : 0.79  
Train Set Mean R-square  : 0.14  
Train Set Mean Precision : 0.76
Train Set Mean recall : 0.71

Test Set Mean Accuracy   : 0.79  
Test Set Mean R-square   : 0.12  
Test Set Mean Precision  : 0.76  
Test Set Mean recall  : 0.70  


2-Determine the model with the most appropriate parameters by hyperparameter tuning.

**Grid Search**

In [33]:
parameters = {"C": [10 ** x for x in range (-5, 5, 1)],
              "penalty": ['l1', 'l2']
             }

In [34]:
from sklearn.model_selection import GridSearchCV

log_reg = LogisticRegression()
grid_cv = GridSearchCV(estimator=log_reg,
                       param_grid = parameters,
                       cv = 10
                      )

grid_cv.fit(X, y)

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                               1000, 10000],
                         'penalty': ['l1', 'l2']})

In [35]:
print("Best Parameters : ", grid_cv.best_params_)
print("Best Score      : ", grid_cv.best_score_)

Best Parameters :  {'C': 1, 'penalty': 'l2'}
Best Score      :  0.788008607198748


In [36]:
results = grid_cv.cv_results_

df = pd.DataFrame(results)

In [37]:
df = df[['param_penalty','param_C', 'mean_test_score']]
df = df.sort_values(by='mean_test_score', ascending = False)
df

Unnamed: 0,param_penalty,param_C,mean_test_score
11,l2,1.0,0.788009
13,l2,10.0,0.788009
15,l2,100.0,0.788009
17,l2,1000.0,0.788009
19,l2,10000.0,0.788009
9,l2,0.1,0.78662
7,l2,0.01,0.754499
5,l2,0.001,0.65894
3,l2,0.0001,0.654695
1,l2,1e-05,0.644855


**Random Search**

In [38]:

from sklearn.model_selection import RandomizedSearchCV

rs_cv = RandomizedSearchCV(estimator=log_reg,
                           param_distributions = parameters,
                           cv = 10,
                           n_iter = 10,
                           random_state = 111,
                           scoring = 'precision'
                      )

rs_cv.fit(X, y)

RandomizedSearchCV(cv=10, estimator=LogisticRegression(),
                   param_distributions={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1,
                                              1, 10, 100, 1000, 10000],
                                        'penalty': ['l1', 'l2']},
                   random_state=111, scoring='precision')

In [39]:
print("Best parameters      : ", rs_cv.best_params_, "\n")
print("Best precision value : ", rs_cv.best_score_)

Best parameters      :  {'penalty': 'l2', 'C': 10} 

Best precision value :  0.764537903538328
