In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [42]:
data = pd.read_csv('data/titanic.csv')
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


In [7]:
data.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  887 non-null    int64  
 1   Pclass    887 non-null    int64  
 2   Name      887 non-null    object 
 3   Sex       887 non-null    object 
 4   Age       887 non-null    float64
 5   Fare      887 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 41.7+ KB


In [43]:
data['name_title'] = data['Name'].apply(lambda x: x.split('.')[0].strip() if '.' in x else x.strip())
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,name_title
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500,Mr
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833,Mrs
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250,Miss
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000,Mrs
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500,Mr
...,...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000,Rev
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000,Miss
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500,Miss
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000,Mr


In [46]:
data.drop('Name', axis=1, inplace=True)
data

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,name_title
0,0,3,male,22.0,1,0,7.2500,Mr
1,1,1,female,38.0,1,0,71.2833,Mrs
2,1,3,female,26.0,0,0,7.9250,Miss
3,1,1,female,35.0,1,0,53.1000,Mrs
4,0,3,male,35.0,0,0,8.0500,Mr
...,...,...,...,...,...,...,...,...
882,0,2,male,27.0,0,0,13.0000,Rev
883,1,1,female,19.0,0,0,30.0000,Miss
884,0,3,female,7.0,1,2,23.4500,Miss
885,1,1,male,26.0,0,0,30.0000,Mr


In [23]:
df.dtypes

Survived        int64
Pclass          int64
Name           object
Sex            object
Age           float64
Fare          float64
name_title     object
dtype: object

In [26]:
data['Age'] = data['Age'].astype('int')

In [27]:
data.dtypes

Survived        int64
Pclass          int64
Name           object
Sex            object
Age             int32
Fare          float64
name_title     object
dtype: object

In [47]:
data['AgeGroup'] = pd.cut(data.Age, bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], labels=['0','1','2','3','4','5','6','7','8','9'])
data

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,name_title,AgeGroup
0,0,3,male,22.0,1,0,7.2500,Mr,2
1,1,1,female,38.0,1,0,71.2833,Mrs,3
2,1,3,female,26.0,0,0,7.9250,Miss,2
3,1,1,female,35.0,1,0,53.1000,Mrs,3
4,0,3,male,35.0,0,0,8.0500,Mr,3
...,...,...,...,...,...,...,...,...,...
882,0,2,male,27.0,0,0,13.0000,Rev,2
883,1,1,female,19.0,0,0,30.0000,Miss,1
884,0,3,female,7.0,1,2,23.4500,Miss,0
885,1,1,male,26.0,0,0,30.0000,Mr,2


In [56]:
def name_title(s):
    if s == 'Mr':
        return 1
    elif s == 'Mrs':
        return 2
    elif s == 'Miss':
        return 3
    else :
        return 4

In [53]:
data['Name_title'] = data['name_title'].apply(lambda x: name_title(x))
data

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,name_title,AgeGroup,Name_title
0,0,3,male,22.0,1,0,7.2500,Mr,2,1
1,1,1,female,38.0,1,0,71.2833,Mrs,3,2
2,1,3,female,26.0,0,0,7.9250,Miss,2,3
3,1,1,female,35.0,1,0,53.1000,Mrs,3,2
4,0,3,male,35.0,0,0,8.0500,Mr,3,1
...,...,...,...,...,...,...,...,...,...,...
882,0,2,male,27.0,0,0,13.0000,Rev,2,4
883,1,1,female,19.0,0,0,30.0000,Miss,1,3
884,0,3,female,7.0,1,2,23.4500,Miss,0,3
885,1,1,male,26.0,0,0,30.0000,Mr,2,1


In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Survived                 887 non-null    int64   
 1   Pclass                   887 non-null    int64   
 2   Sex                      887 non-null    object  
 3   Age                      887 non-null    float64 
 4   Siblings/Spouses Aboard  887 non-null    int64   
 5   Parents/Children Aboard  887 non-null    int64   
 6   Fare                     887 non-null    float64 
 7   name_title               887 non-null    object  
 8   AgeGroup                 887 non-null    category
 9   Name_title               887 non-null    int64   
dtypes: category(1), float64(2), int64(5), object(2)
memory usage: 63.7+ KB


In [57]:
data['Gender'] = data['Sex'].map({'female':1, 'male':0}).astype(int)
data

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,name_title,AgeGroup,Name_title,Gender
0,0,3,male,22.0,1,0,7.2500,Mr,2,1,0
1,1,1,female,38.0,1,0,71.2833,Mrs,3,2,1
2,1,3,female,26.0,0,0,7.9250,Miss,2,3,1
3,1,1,female,35.0,1,0,53.1000,Mrs,3,2,1
4,0,3,male,35.0,0,0,8.0500,Mr,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...
882,0,2,male,27.0,0,0,13.0000,Rev,2,4,0
883,1,1,female,19.0,0,0,30.0000,Miss,1,3,1
884,0,3,female,7.0,1,2,23.4500,Miss,0,3,1
885,1,1,male,26.0,0,0,30.0000,Mr,2,1,0


In [58]:
data.drop('Sex', axis=1, inplace=True)
data

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,name_title,AgeGroup,Name_title,Gender
0,0,3,22.0,1,0,7.2500,Mr,2,1,0
1,1,1,38.0,1,0,71.2833,Mrs,3,2,1
2,1,3,26.0,0,0,7.9250,Miss,2,3,1
3,1,1,35.0,1,0,53.1000,Mrs,3,2,1
4,0,3,35.0,0,0,8.0500,Mr,3,1,0
...,...,...,...,...,...,...,...,...,...,...
882,0,2,27.0,0,0,13.0000,Rev,2,4,0
883,1,1,19.0,0,0,30.0000,Miss,1,3,1
884,0,3,7.0,1,2,23.4500,Miss,0,3,1
885,1,1,26.0,0,0,30.0000,Mr,2,1,0


In [59]:
data.drop('name_title', axis=1, inplace=True)
data

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,AgeGroup,Name_title,Gender
0,0,3,22.0,1,0,7.2500,2,1,0
1,1,1,38.0,1,0,71.2833,3,2,1
2,1,3,26.0,0,0,7.9250,2,3,1
3,1,1,35.0,1,0,53.1000,3,2,1
4,0,3,35.0,0,0,8.0500,3,1,0
...,...,...,...,...,...,...,...,...,...
882,0,2,27.0,0,0,13.0000,2,4,0
883,1,1,19.0,0,0,30.0000,1,3,1
884,0,3,7.0,1,2,23.4500,0,3,1
885,1,1,26.0,0,0,30.0000,2,1,0


In [61]:
x_data = data.loc[:, 'Pclass':'Gender']
y_data = data.loc[:,'Survived']

In [62]:
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, shuffle=True, random_state=20
)

In [63]:
from sklearn.neighbors import KNeighborsClassifier

In [84]:
knn = KNeighborsClassifier(n_neighbors=7)

In [85]:
knn.fit(X_train, y_train)

  return f(*args, **kwargs)


KNeighborsClassifier(n_neighbors=7)

In [86]:
pred = knn.predict(X_test)
pred

  return f(*args, **kwargs)


array([0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0], dtype=int64)

In [87]:
from sklearn.metrics import accuracy_score

In [88]:
score = accuracy_score(y_test, pred)
score

0.7471910112359551

In [89]:
from sklearn.svm import SVC

In [90]:
svc = SVC(kernel='linear')

In [91]:
svc.fit(X_train, y_train)

SVC(kernel='linear')

In [92]:
pred2 = svc.predict(X_test)
pred2

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1], dtype=int64)

In [93]:
score2 = accuracy_score(y_test, pred2)
score2

0.7752808988764045