In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load seaborn's 'titanic' example dataset and preview its first five rows.
df = sns.load_dataset(name='titanic')
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
# Keep the target ('survived') plus the columns used for features and imputation.
# .copy() makes the selection an independent frame, so the later fill/drop/encode
# steps modify this frame itself rather than a slice pandas flags as a potential
# copy of the originally loaded table.
df = df[[
    'survived', 'pclass', 'sex', 'age', 'sibsp',
    'parch', 'fare', 'embarked', 'who', 'deck',
]].copy()
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck
886,0,2,male,27.0,0,0,13.0,S,man,
887,1,1,female,19.0,0,0,30.0,S,woman,B
888,0,3,female,,1,2,23.45,S,woman,
889,1,1,male,26.0,0,0,30.0,C,man,C
890,0,3,male,32.0,0,0,7.75,Q,man,


In [6]:
# Number of missing values in each column.
df.isnull().sum(axis=0)

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
who           0
deck        688
dtype: int64

In [9]:
# Impute missing ages with the mean age of passengers whose `who` is 'man' or
# 'woman', rounded to one decimal place.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# `df.age` accessor is a chained in-place write that pandas does not guarantee
# will reach `df` (under Copy-on-Write it never does).
adult_mean_age = df.loc[df['who'].isin(['man', 'woman']), 'age'].mean().round(1)
df['age'] = df['age'].fillna(adult_mean_age)

In [11]:
# Fill the missing `embarked` values with 'S'.
# NOTE(review): 'S' is presumably the most frequent port - confirm with value_counts().
# Assigned back to the column instead of fillna(inplace=True) on the `df.embarked`
# accessor, which is a chained in-place write that may not update `df`.
df['embarked'] = df['embarked'].fillna('S')

In [12]:
# 'who' was only needed for the age imputation and 'deck' is mostly missing;
# remove both before modelling.
df = df.drop(columns=['who', 'deck'])

In [13]:
# Total count of missing cells across the whole frame (expected to be zero here).
df.isnull().to_numpy().sum()

0

In [14]:
# LabelEncoder maps each distinct label of a column to an integer code.
from sklearn.preprocessing import LabelEncoder
# Single encoder instance; later cells call fit_transform on it.
le = LabelEncoder()

In [15]:
# Encode each string column with its own LabelEncoder. Refitting one shared
# encoder for the second column discards the first column's mapping, so the
# 'sex' codes could no longer be inspected (classes_) or inverted.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
# Column-key assignment is used instead of attribute assignment (df.sex = ...).
df['sex'] = sex_encoder.fit_transform(df['sex'])
df['embarked'] = embarked_encoder.fit_transform(df['embarked'])
# Keep the shared name bound to the encoder fitted last (embarked), which is the
# state the original cell left it in.
le = embarked_encoder
df.tail(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
888,0,3,0,32.8,1,2,23.45,2
889,1,1,1,26.0,0,0,30.0,0
890,0,3,1,32.0,0,0,7.75,1


In [18]:

# Column 0 is the target; every remaining column is a feature.
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()

In [19]:
from sklearn.preprocessing import StandardScaler
df_std = StandardScaler().fit_transform(X)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_std, y, stratify=y, test_size=0.2, random_state=2023
)

In [21]:
# Hard Voting
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [22]:
# Base learners for the voting ensembles. The two seeded models share the same
# random_state; KNN takes no seed.
knn = KNeighborsClassifier(n_neighbors=5)
svc = SVC(random_state=2023)
lrc = LogisticRegression(random_state=2023)

In [23]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the three base learners (the predicted class is
# decided by majority vote of the members' predicted labels).
voc = VotingClassifier(
    voting='hard',
    estimators=[
        ('LRC', lrc),
        ('SVC', svc),
        ('KNN', knn),
    ],
)

In [24]:
# Train the hard-voting ensemble and report its score on the test split.
voc.fit(X_train, y_train).score(X_test, y_test)

0.776536312849162

In [25]:
# Fit every base learner on its own and collect its test score, in the order
# (LRC, SVC, KNN), for comparison with the ensemble.
tuple(
    model.fit(X_train, y_train).score(X_test, y_test)
    for model in (lrc, svc, knn)
)

(0.7486033519553073, 0.7877094972067039, 0.7821229050279329)

In [26]:
# Soft Voting
# probability=True is what makes predict_proba available on the SVC, which the
# soft-voting ensemble built next relies on.
svc2 = SVC(random_state=2023, probability=True).fit(X_train, y_train)
# Class probabilities for the first three test rows.
svc2.predict_proba(X_test[0:3])

array([[0.85366144, 0.14633856],
       [0.51052503, 0.48947497],
       [0.84773444, 0.15226556]])

In [28]:
# Soft-voting ensemble: same members as before, but with the probability-enabled
# SVC (svc2) in place of svc.
voc2 = VotingClassifier(
    voting='soft',
    estimators=[
        ('LRC', lrc),
        ('SVC', svc2),
        ('KNN', knn),
    ],
)
voc2.fit(X_train, y_train).score(X_test, y_test)

0.8044692737430168

In [29]:
# Current regularisation parameter C of the logistic regression and the SVC.
tuple(model.C for model in (lrc, svc))

(1.0, 1.0)

In [30]:
from sklearn.model_selection import GridSearchCV

# First pass: search C for the LRC and SVC members of the soft voter over a
# wide grid, using 5-fold cross-validated accuracy.
params = dict(
    LRC__C=[0.1, 1, 10],
    SVC__C=[0.1, 1, 10],
)
grid_voc2 = GridSearchCV(
    estimator=voc2, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 1, 'SVC__C': 0.1}

In [31]:
# Second pass: a different set of candidate C values for both members,
# again scored by 5-fold cross-validated accuracy.
params = dict(
    LRC__C=[0.5, 1, 3],
    SVC__C=[0.01, 0.1, 0.5],
)
grid_voc2 = GridSearchCV(
    estimator=voc2, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 0.5, 'SVC__C': 0.5}

In [32]:
# Third pass: the same three candidate C values for both members; this is the
# search whose result is evaluated on the test split afterwards.
params = dict(
    LRC__C=[0.1, 0.5, 1],
    SVC__C=[0.1, 0.5, 1],
)
grid_voc2 = GridSearchCV(
    estimator=voc2, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 0.1, 'SVC__C': 0.5}

In [33]:
# Test-set accuracy of the refitted best model from the last grid search
# (the search was configured with scoring='accuracy').
grid_voc2.score(X_test, y_test)

0.8044692737430168

In [35]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Seeded forest with otherwise default settings; list every hyper-parameter value.
rfc = RandomForestClassifier(random_state=2023)
rfc.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2023,
 'verbose': 0,
 'warm_start': False}

In [36]:
# Train the forest and report its score on the test split.
rfc.fit(X_train, y_train).score(X_test, y_test)

0.776536312849162

In [37]:
# Per-class probabilities the forest assigns to the first five test rows.
rfc.predict_proba(X_test[0:5])

array([[0.51566667, 0.48433333],
       [0.3       , 0.7       ],
       [0.99      , 0.01      ],
       [0.96      , 0.04      ],
       [0.93      , 0.07      ]])