### titanic-without-using-pipeline

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load seaborn's bundled Titanic passenger table into a DataFrame.
df = sns.load_dataset(name='titanic')

In [3]:
# Peek at the first five raw rows.
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
#dropping columns
# Rebind `df` to the trimmed frame instead of mutating it in place, and ignore
# labels that are already gone, so re-running this cell does not raise KeyError.
df = df.drop(
    columns=['class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'],
    errors='ignore',
)

In [5]:
# Confirm which columns remain after the drop.
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
#step 1-train/test/split
# Features are every column except the 'survived' target; 20% of the rows are
# held out for evaluation and the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('survived', axis=1),
    df['survived'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# First two feature rows of the training split.
X_train.head(n=2)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


In [8]:
# First five target labels of the training split.
y_train.head(n=5)

331    0
733    0
382    0
704    0
813    0
Name: survived, dtype: int64

In [9]:
# Count missing values per column to decide which ones need imputation.
df.isnull().sum(axis=0)

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [10]:
#Applying imputation

# 'age': fill gaps with the mean learned from the training split only, then
# reuse that fitted value on the test split.
si_age = SimpleImputer(strategy='mean')
X_train_age = si_age.fit_transform(X_train[['age']])
X_test_age = si_age.transform(X_test[['age']])

# 'embarked': fill gaps with the most frequent training category.
si_embarked = SimpleImputer(strategy='most_frequent')
X_train_embarked = si_embarked.fit_transform(X_train[['embarked']])
X_test_embarked = si_embarked.transform(X_test[['embarked']])


In [11]:
# Frequency of each embarkation port (missing values are not counted).
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [12]:
# Frequency of each value in the 'sex' column.
df.sex.value_counts()

male      577
female    314
Name: sex, dtype: int64

In [13]:
#one hot encoding sex and embarked
def _make_dense_ohe():
    """Return a OneHotEncoder that yields dense arrays and ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old name, so the new keyword is tried first and the old
    one is used only when the installed release rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


ohe_sex = _make_dense_ohe()
ohe_embarked = _make_dense_ohe()

X_train_sex = ohe_sex.fit_transform(X_train[['sex']])
X_test_sex = ohe_sex.transform(X_test[['sex']])

# Encode from the raw column run through the already-fitted imputer rather than
# from X_*_embarked: this cell overwrites those names, so reading them here
# would re-encode already-encoded data whenever the cell is executed twice.
X_train_embarked = ohe_embarked.fit_transform(si_embarked.transform(X_train[['embarked']]))
X_test_embarked = ohe_embarked.transform(si_embarked.transform(X_test[['embarked']]))


In [14]:
# X_train itself is untouched; the transformed pieces live in separate arrays.
X_train.head(n=2)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


In [15]:
# Inspect the dense one-hot array produced by ohe_sex for the training 'sex' column.
X_train_sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [16]:
# Keep only the training columns that are passed through unchanged; 'sex',
# 'age' and 'embarked' are supplied by their separately transformed arrays.
X_train_rem = X_train.drop(['sex', 'age', 'embarked'], axis=1)

In [17]:
# Same pass-through column selection for the test split.
X_test_rem = X_test.drop(['sex', 'age', 'embarked'], axis=1)

In [18]:
# Stitch the pieces back together column-wise. The order (pass-through columns,
# age, sex, embarked) must be identical for train and test.
X_train_transformed = np.concatenate(
    [X_train_rem, X_train_age, X_train_sex, X_train_embarked],
    axis=1,
)
X_test_transformed = np.concatenate(
    [X_test_rem, X_test_age, X_test_sex, X_test_embarked],
    axis=1,
)

In [21]:
# Sanity check: (rows, columns) of the assembled training matrix.
X_train_transformed.shape

(712, 10)

In [22]:
# Sanity check: the test matrix should have the same column count as the training matrix.
X_test_transformed.shape

(179, 10)

In [24]:
# Seed the tree so the fitted model (and the accuracy reported below) is the
# same on every Restart & Run All; 42 matches the seed used for the split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier()

In [25]:
# Predict survival labels for the held-out rows.
y_pred = clf.predict(X=X_test_transformed)

In [26]:
from sklearn.metrics import accuracy_score

# Fraction of held-out passengers whose predicted label matches the true one.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7988826815642458

In [27]:
import pickle

In [29]:
# Persist the fitted encoders and the classifier. Each file is opened in a
# `with` block so the handle is flushed and closed even if pickling fails.
# NOTE(review): si_age / si_embarked are not saved here; code that reloads
# these artifacts presumably needs them to reproduce the imputation - confirm.
for _artifact, _path in (
    (ohe_sex, 'Voting/ohe_sex.pk1'),
    (ohe_embarked, 'Voting/ohe_embarked.pk1'),
    (clf, 'Voting/clf.pk1'),
):
    with open(_path, 'wb') as _fh:
        pickle.dump(_artifact, _fh)