In [1168]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

In [1169]:
titanicData = pd.read_csv('http://bit.ly/kaggletrain', index_col = None)
titanicData

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [1170]:
titanicData.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [1171]:
titanicData.shape

(891, 12)

In [1172]:
titanicData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [1173]:
cols = ["Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Embarked", "Fare"]
titanicData_X = titanicData[cols]
titanicData_y = titanicData["Survived"]
titanicData_X

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Embarked,Fare
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,S,7.2500
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C,71.2833
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,S,7.9250
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,S,53.1000
4,3,"Allen, Mr. William Henry",male,35.0,0,0,S,8.0500
...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,S,13.0000
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,S,30.0000
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,S,23.4500
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,C,30.0000


In [1174]:
titanicData_y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [1175]:
titanicData_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Embarked  889 non-null    object 
 7   Fare      891 non-null    float64
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


### Required transformations
- Pclass: None
- Name: CountVectorizer
- Sex: Categorical to Numerical
- Age: Missing value, To numerical
- Embarked: To numerical
- Fare: None

### Steps
1- "Pclass", "Name", "Sex", "SibSp", "Parch", "Fare"
2- "Pclass", "Name", "Sex", "SibSp", "Parch", "Fare", "Age"
3- "Pclass", "Name", "Sex", "SibSp", "Parch", "Fare", "Age", "Embarked"

In [1176]:
step1Cols = ["Pclass", "Name", "Sex", "SibSp", "Parch", "Fare"]
step2Cols = ["Pclass", "Name", "Sex", "SibSp", "Parch", "Fare","Age"]
step3Cols = ["Pclass", "Name", "Sex", "SibSp", "Parch", "Fare", "Age", "Embarked"]

### Step 1

In [1177]:
knn = KNeighborsClassifier(n_neighbors = 8)
X_train, X_test, y_train, y_test = train_test_split(
    titanicData_X[step1Cols], titanicData_y,
    random_state = 40, test_size = 0.2
)

In [1178]:
X_train.shape

(712, 6)

In [1179]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 661 to 326
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  712 non-null    int64  
 1   Name    712 non-null    object 
 2   Sex     712 non-null    object 
 3   SibSp   712 non-null    int64  
 4   Parch   712 non-null    int64  
 5   Fare    712 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.9+ KB


In [1180]:
y_train.shape

(712,)

###

In [1181]:
countVectorizer = CountVectorizer()
oneHotEncoder = OneHotEncoder()

In [1182]:
step1_columnTransformer = make_column_transformer(
    (countVectorizer, "Name"),
    (oneHotEncoder, ["Sex"]),
    remainder = "passthrough",
    n_jobs = -1
)

In [1183]:
step1_pipeLine = make_pipeline(step1_columnTransformer, knn)

In [1184]:
k_range = list(range(1, 31))
params = {"kneighborsclassifier__n_neighbors": k_range}

In [1185]:
gridSearchCV = GridSearchCV(
    step1_pipeLine, params, cv = 5, scoring = "accuracy", n_jobs = -1
)

In [1186]:
gridSearchCV.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=-1,
                                                          remainder='passthrough',
                                                          transformers=[('countvectorizer',
                                                                         CountVectorizer(),
                                                                         'Name'),
                                                                        ('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         ['Sex'])])),
                                       ('kneighborsclassifier',
                                        KNeighborsClassifier(n_neighbors=8))]),
             n_jobs=-1,
             param_grid={'kneighborsclassifier__n_neighbors': [1, 2,

In [1187]:
gridSearchCV.best_score_

0.7626514330739682

In [1188]:
predicted = gridSearchCV.predict(X_test)

In [1189]:
print("Step 1 Accuracy:", accuracy_score(y_test, predicted))

Step 1 Accuracy: 0.8324022346368715


### End - Step 1

### Step 2

In [1190]:
knn = KNeighborsClassifier(n_neighbors = 8)
X_train, X_test, y_train, y_test = train_test_split(
    titanicData_X[step2Cols], titanicData_y,
    random_state = 40, test_size = 0.2
)

In [1191]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 661 to 326
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  712 non-null    int64  
 1   Name    712 non-null    object 
 2   Sex     712 non-null    object 
 3   SibSp   712 non-null    int64  
 4   Parch   712 non-null    int64  
 5   Fare    712 non-null    float64
 6   Age     573 non-null    float64
dtypes: float64(2), int64(3), object(2)
memory usage: 44.5+ KB


In [1192]:
X_train.isna().sum()

Pclass      0
Name        0
Sex         0
SibSp       0
Parch       0
Fare        0
Age       139
dtype: int64

In [1193]:
step2_simpleImputer = SimpleImputer()

In [1194]:
step2_columnTransformer = make_column_transformer(
    (countVectorizer, "Name"),
    (oneHotEncoder, ["Sex"]),
    (step2_simpleImputer, ["Age"]),
    remainder = "passthrough",
    n_jobs = -1
)

In [1195]:
step2_pipeLine = make_pipeline(
    step2_columnTransformer, knn
)

In [1196]:
step2_gridSearchCV = GridSearchCV(
    step2_pipeLine, params, cv = 5, scoring = "accuracy", n_jobs = -1
)

In [1197]:
step2_gridSearchCV.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=-1,
                                                          remainder='passthrough',
                                                          transformers=[('countvectorizer',
                                                                         CountVectorizer(),
                                                                         'Name'),
                                                                        ('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         ['Sex']),
                                                                        ('simpleimputer',
                                                                         SimpleImputer(),
                                                           

In [1198]:
step2_gridSearchCV.best_score_

0.719078104993598

In [1199]:
predicted = step2_gridSearchCV.predict(X_test)
print("Step 2 Accuracy:", accuracy_score(y_test, predicted))

Step 2 Accuracy: 0.7318435754189944


### End - Step2

In [1200]:
knn = KNeighborsClassifier(n_neighbors = 8)
X_train, X_test, y_train, y_test = train_test_split(
    titanicData_X[step3Cols], titanicData_y,
    random_state = 40, test_size = 0.2
)

In [1201]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 661 to 326
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Name      712 non-null    object 
 2   Sex       712 non-null    object 
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Age       573 non-null    float64
 7   Embarked  711 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 50.1+ KB


In [1202]:
X_train["Embarked"].value_counts()

S    514
C    132
Q     65
Name: Embarked, dtype: int64

In [1203]:
ageImputer = SimpleImputer(strategy = "median")
embarkedOneHotEncoder = OneHotEncoder()
embarkedImputer = SimpleImputer(
    strategy = "constant",
    fill_value = "missing"
)
step3TransfomerPipeline = make_pipeline(embarkedImputer, embarkedOneHotEncoder)
#ageTransformer.fit_transform(X_train[["Embarked"]])

In [1204]:
embarkedOneHotEncoder.fit_transform(X_train[["Sex"]])
embarkedOneHotEncoder.categories_

[array(['female', 'male'], dtype=object)]

In [1205]:
step3TransfomerPipeline.fit_transform(X_train[["Embarked"]])

<712x4 sparse matrix of type '<class 'numpy.float64'>'
	with 712 stored elements in Compressed Sparse Row format>

In [1206]:
ageImputer.fit_transform(X_train[["Age"]])
ageImputer.statistics_

array([28.])

In [1207]:
step3_columnTransformer = make_column_transformer(
    (countVectorizer, "Name"),
    (oneHotEncoder, ["Sex"]),
    (ageImputer, ["Age"]),
    (step3TransfomerPipeline, ["Embarked"]),
    remainder = "passthrough",
    n_jobs = -1
)
step3_columnTransformer.fit_transform(X_train)

<712x1264 sparse matrix of type '<class 'numpy.float64'>'
	with 6781 stored elements in Compressed Sparse Row format>

In [1208]:
step3_pipeline = make_pipeline(
    step3_columnTransformer,
    knn
)
step3_pipeline.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('countvectorizer',
                                                  CountVectorizer(), 'Name'),
                                                 ('onehotencoder',
                                                  OneHotEncoder(), ['Sex']),
                                                 ('simpleimputer',
                                                  SimpleImputer(strategy='median'),
                                                  ['Age']),
                                                 ('pipeline',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                    

In [1209]:
step3_gridSearchCV = GridSearchCV(
    step3_pipeline,
    params,
    cv = 5,
    scoring = "accuracy",
    n_jobs = -1
)
step3_gridSearchCV.fit(X_train, y_train)

 nan nan nan nan nan nan nan nan nan nan nan nan]


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=-1,
                                                          remainder='passthrough',
                                                          transformers=[('countvectorizer',
                                                                         CountVectorizer(),
                                                                         'Name'),
                                                                        ('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         ['Sex']),
                                                                        ('simpleimputer',
                                                                         SimpleImputer(strategy='median'),
                                          

In [1210]:
pd.DataFrame(step3_gridSearchCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.061264,0.012969,0.053511,0.012224,1,{'kneighborsclassifier__n_neighbors': 1},0.608392,0.699301,0.739437,,0.676056,,,1
1,0.052259,0.003893,0.03644,0.01041,2,{'kneighborsclassifier__n_neighbors': 2},0.664336,0.664336,0.697183,,0.676056,,,28
2,0.045045,0.002364,0.043886,0.015146,3,{'kneighborsclassifier__n_neighbors': 3},0.706294,0.685315,0.767606,,0.690141,,,27
3,0.04684,0.003517,0.03913,0.011406,4,{'kneighborsclassifier__n_neighbors': 4},0.72028,0.706294,0.71831,,0.690141,,,26
4,0.044826,0.001031,0.038408,0.006891,5,{'kneighborsclassifier__n_neighbors': 5},0.713287,0.706294,0.704225,,0.71831,,,25
5,0.051381,0.003634,0.046582,0.012422,6,{'kneighborsclassifier__n_neighbors': 6},0.713287,0.685315,0.683099,,0.71831,,,24
6,0.055429,0.010084,0.043696,0.011674,7,{'kneighborsclassifier__n_neighbors': 7},0.699301,0.657343,0.697183,,0.732394,,,23
7,0.06559,0.012313,0.041979,0.011497,8,{'kneighborsclassifier__n_neighbors': 8},0.699301,0.664336,0.704225,,0.725352,,,22
8,0.0512,0.001886,0.044134,0.012312,9,{'kneighborsclassifier__n_neighbors': 9},0.72028,0.685315,0.697183,,0.711268,,,21
9,0.051579,0.006011,0.039278,0.0125,10,{'kneighborsclassifier__n_neighbors': 10},0.727273,0.72028,0.711268,,0.697183,,,20
