# Alexine Studios

In [1]:
!mkdir datasets

In [2]:
!wget -qq https://raw.githubusercontent.com/chiruharshith/Alexine_Studios/main/datasets/Titanic.csv -P datasets

### Import Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv("datasets/Titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
## SibSp = Siblings / Spouses
## Parch = Parents / Childrens

In [5]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
to_be_dropped = ['Ticket', 'Fare','PassengerId','Name','Cabin']

In [7]:
df.drop(to_be_dropped, axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [9]:
df.shape

(891, 7)

In [10]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      2
dtype: int64

In [12]:
avg_age = df['Age'].mean()
avg_age

29.69911764705882

In [13]:
df['Age'] = df['Age'].fillna(avg_age)

In [14]:
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    2
dtype: int64

In [15]:
mode_embarked = df['Embarked'].mode()
mode_embarked

0    S
dtype: object

In [16]:
df['Embarked'] = df['Embarked'].fillna("S")

In [17]:
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

**Label Encoding**

In [18]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [19]:
set(df['Sex'])

{'female', 'male'}

In [20]:
df['Sex'] = df['Sex'].replace('male', 1)

In [21]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,1,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,1,35.0,0,0,S


In [22]:
df['Sex'] = df['Sex'].replace('female', 2)

In [23]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,1,22.0,1,0,S
1,1,1,2,38.0,1,0,C
2,1,3,2,26.0,0,0,S
3,1,1,2,35.0,1,0,S
4,0,3,1,35.0,0,0,S


In [24]:
set(df['Sex'])

{1, 2}

In [25]:
# Male:1
# Female:2

In [26]:
# S: 1
# C: 2
# Q: 3

In [27]:
set(df["Embarked"])

{'C', 'Q', 'S'}

In [28]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,1,22.0,1,0,S
1,1,1,2,38.0,1,0,C
2,1,3,2,26.0,0,0,S
3,1,1,2,35.0,1,0,S
4,0,3,1,35.0,0,0,S


In [29]:
df["Embarked"].unique()

array(['S', 'C', 'Q'], dtype=object)

In [30]:
df['Embarked'] = df['Embarked'].replace(['S','C','Q'],[1,2,3])

In [31]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,1,22.0,1,0,1
1,1,1,2,38.0,1,0,2
2,1,3,2,26.0,0,0,1
3,1,1,2,35.0,1,0,1
4,0,3,1,35.0,0,0,1


In [32]:
features = df.iloc[:,1:]
labels = df['Survived']

In [33]:
features.shape

(891, 6)

In [34]:
labels.shape

(891,)

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)

In [36]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((623, 6), (268, 6), (623,), (268,))

In [40]:
from sklearn import metrics

def getAccuracy(clf):
    # TRAIN THE MODEL USING THE TRAIN SETS
    clf.fit(X_train, y_train)
    
    # MAKE PREDICTIONS ON THE TEST SET
    y_pred = clf.predict(X_test)
    
    # COMPARING THE ACTUAL RESPONSE (y_test) WITH THE PREDICTED VALUES (y_pred)
    print("Accuracy is: ",metrics.accuracy_score(y_test, y_pred)*100)
    
    return clf

In [41]:
from sklearn.neighbors import KNeighborsClassifier

In [43]:
trained_model1 = getAccuracy(KNeighborsClassifier())

Accuracy is:  79.47761194029852


**Change the k(n_neighbors) value and see the difference in the accuracy**

In [44]:
# K = 2
trained_model2 = getAccuracy(KNeighborsClassifier(n_neighbors=2))

Accuracy is:  76.11940298507463


In [45]:
# K = 3
trained_model3 = getAccuracy(KNeighborsClassifier(n_neighbors=3))

Accuracy is:  79.8507462686567


In [46]:
# K = 4
trained_model4 = getAccuracy(KNeighborsClassifier(n_neighbors=4))

Accuracy is:  77.98507462686567


In [47]:
# K = 5
trained_model5 = getAccuracy(KNeighborsClassifier(n_neighbors=5))

Accuracy is:  79.47761194029852


In [48]:
# K = 6
trained_model6 = getAccuracy(KNeighborsClassifier(n_neighbors=6))

Accuracy is:  76.86567164179104


**DecisionTree**

In [50]:
from sklearn.tree import DecisionTreeClassifier
dt_model = getAccuracy(DecisionTreeClassifier())

Accuracy is:  77.23880597014924


In [51]:
dt_mode1 = getAccuracy(DecisionTreeClassifier(max_depth=2))

Accuracy is:  79.1044776119403


In [52]:
dt_mode2 = getAccuracy(DecisionTreeClassifier(max_depth=3))

Accuracy is:  81.34328358208955


In [53]:
dt_mode3 = getAccuracy(DecisionTreeClassifier(max_depth=4))

Accuracy is:  81.34328358208955


In [54]:
dt_mode4 = getAccuracy(DecisionTreeClassifier(max_depth=5))

Accuracy is:  81.71641791044776
