## Preprocessing

In [20]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

In [4]:
# Loading DataSet
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.shape

(891, 12)

In [6]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Droping the Columns with Null values and non useable columns.
df.drop(['Age' , 'Cabin' , 'Embarked' , 'Name', 'Ticket'] , axis = 1 , inplace = True)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Fare
0,1,0,3,male,1,0,7.25
1,2,1,1,female,1,0,71.2833
2,3,1,3,female,0,0,7.925


In [8]:
#Label Encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df.Sex = encoder.fit_transform(df.Sex)

In [9]:
#Splitting The DataSet into trian and Test Sets
from sklearn.model_selection import train_test_split
X = df.drop(['Survived'] , axis = 1)
y = df.Survived
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2)

In [10]:
print(X_train.shape , y_test.shape)

(712, 6) (179,)


## Cross Validation

In [63]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
CV = cross_val_score(LogisticRegression(max_iter=800) , X , y)
CV

array([0.81005587, 0.80337079, 0.78089888, 0.76966292, 0.80898876])

## HyperParameter Tuning

### GridSearchCV

In [64]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [69]:
parameter = {'n_estimators' : [5 , 20 ,30 , 50 , 80 , 100]}

In [70]:
from sklearn.model_selection import GridSearchCV
classifier = GridSearchCV(model , parameter , cv = 5)

In [71]:
classifier.fit(X ,y)

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'n_estimators': [5, 20, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,30
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [75]:
resultGCV = pd.DataFrame(classifier.cv_results_)
resultGCV.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.039379,0.029937,0.003317,0.003903,5,{'n_estimators': 5},0.648045,0.764045,0.825843,0.803371,0.803371,0.768935,0.063637,5
1,0.088374,0.020645,0.004341,0.003663,20,{'n_estimators': 20},0.659218,0.792135,0.808989,0.792135,0.803371,0.771169,0.056355,3
2,0.134125,0.0612,0.008295,0.007379,30,{'n_estimators': 30},0.698324,0.797753,0.825843,0.814607,0.803371,0.787979,0.045854,1
3,0.179177,0.033014,0.007016,0.006232,50,{'n_estimators': 50},0.715084,0.797753,0.831461,0.786517,0.808989,0.787961,0.039353,2
4,0.272991,0.029726,0.007569,0.00747,80,{'n_estimators': 80},0.625698,0.786517,0.814607,0.808989,0.808989,0.76896,0.072278,4


In [77]:
# Heighest Accuracy 
print((resultGCV['mean_test_score'].max())*100 , ' %')

78.79794112108468  %


In [78]:
# Heighest Accuracy Parameters
print(classifier.best_params_)

{'n_estimators': 30}


### RandomizedSearchCV

In [89]:
parameter = {'n_estimators' : [5 , 10 , 20 ,30 ,35 , 40 , 45 , 50 , 80 , 100 , 150 , 200 , 250 , 300 , 500 , 1000]}

In [90]:
from sklearn.model_selection import RandomizedSearchCV
classifier = RandomizedSearchCV(model , parameter , cv = 5)

In [91]:
classifier.fit(X ,y)

0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'n_estimators': [5, 10, ...]}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,45
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [92]:
resultGCV = pd.DataFrame(classifier.cv_results_)
resultGCV.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.923286,0.09555,0.032683,0.009874,250,{'n_estimators': 250},0.614525,0.769663,0.831461,0.803371,0.814607,0.766725,0.07874,8
1,0.271041,0.025783,0.010241,0.005929,80,{'n_estimators': 80},0.703911,0.786517,0.831461,0.792135,0.808989,0.784602,0.043271,3
2,0.036109,0.037445,0.001215,0.00162,5,{'n_estimators': 5},0.659218,0.775281,0.769663,0.780899,0.814607,0.759933,0.052731,10
3,0.149704,0.007392,0.004796,0.006067,45,{'n_estimators': 45},0.703911,0.803371,0.837079,0.808989,0.825843,0.795838,0.0475,1
4,0.124112,0.034518,0.006026,0.005936,35,{'n_estimators': 35},0.715084,0.780899,0.825843,0.797753,0.820225,0.787961,0.039832,2


In [93]:
# Heighest Accuracy 
print((resultGCV['mean_test_score'].max())*100 , ' %')

79.58383026803088  %


In [94]:
# Heighest Accuracy Parameters
print(classifier.best_params_)

{'n_estimators': 45}


## Decision Trees

In [12]:
from sklearn.tree import DecisionTreeClassifier
Tree = DecisionTreeClassifier()
Tree.fit(X_train , y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [15]:
from sklearn.metrics import accuracy_score
y_predict = Tree.predict(X_test)
accuracy_score(y_test, y_predict)

0.7877094972067039

## Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
RNF = RandomForestClassifier(n_estimators = 50)

In [18]:
RNF.fit(X_train , y_train)

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
y_predict = RNF.predict(X_test)
accuracy_score(y_test , y_predict)

0.776536312849162

## Gradient Boost

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

In [22]:
model = GradientBoostingClassifier()
model.fit(X_train , y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [25]:
y_pred = model.predict(X_test)
accuracy_score(y_pred , y_test)

0.7877094972067039

## XG Boost

In [27]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.0/150.0 MB 6.3 MB/s eta 0:00:24
   ---------------------------------------- 1.6/150.0 MB 3.6 MB/s eta 0:00:41
    --------------------------------------- 2.1/150.0 MB 3.3 MB/s eta 0:00:46
    --------------------------------------- 3.1/150.0 MB 3.7 MB/s eta 0:00:40
   - -------------------------------------- 3.9/150.0 MB 3.7 MB/s eta 0:00:40
   - -------------------------------------- 4.7/150.0 MB 3.8 MB/s eta 0:00:39
   - -------------------------------------- 5.5/150.0 MB 3.8 MB/s eta 0:00:38
   - -------------------------------------- 6.3/150.0 MB 3.8 MB/s eta 0:00:38
   - -------------------------------------- 7.1/150.0 MB 3.8 MB/s eta 0:00:38
 

In [28]:
from xgboost import XGBClassifier

In [29]:
XGB = XGBClassifier()
XGB.fit(X_train , y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [30]:
y_pred = XGB.predict(X_test)
accuracy_score(y_test , y_pred)

0.776536312849162

## Catboost

In [31]:
pip install catboost

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl (102.4 MB)
   ---------------------------------------- 0.0/102.4 MB ? eta -:--:--
   ---------------------------------------- 1.0/102.4 MB 6.3 MB/s eta 0:00:17
    --------------------------------------- 1.6/102.4 MB 3.6 MB/s eta 0:00:28
    --------------------------------------- 2.1/102.4 MB 4.1 MB/s eta 0:00:25
   - -------------------------------------- 3.1/102.4 MB 3.8 MB/s eta 0:00:27
   - -------------------------------------- 3.7/102.4 MB 3.4 MB/s eta 0:00:30
   -- ------------------------------------- 5.2/102.4 MB 3.8 MB/s eta 0:00:26
   -- ------------------------------------- 6.6/102.4 MB 3.8 M

In [32]:
from catboost import CatBoostClassifier

In [45]:
model = CatBoostClassifier()

In [46]:
model.fit(X_train , y_train)

Learning rate set to 0.008911
0:	learn: 0.6870136	total: 3.56ms	remaining: 3.56s
1:	learn: 0.6812200	total: 6.7ms	remaining: 3.34s
2:	learn: 0.6753715	total: 10.2ms	remaining: 3.39s
3:	learn: 0.6701606	total: 14.6ms	remaining: 3.62s
4:	learn: 0.6648123	total: 20.5ms	remaining: 4.07s
5:	learn: 0.6595875	total: 23.5ms	remaining: 3.89s
6:	learn: 0.6550905	total: 25.3ms	remaining: 3.59s
7:	learn: 0.6504464	total: 38.1ms	remaining: 4.73s
8:	learn: 0.6449847	total: 103ms	remaining: 11.4s
9:	learn: 0.6398612	total: 118ms	remaining: 11.7s
10:	learn: 0.6354415	total: 149ms	remaining: 13.4s
11:	learn: 0.6304517	total: 187ms	remaining: 15.4s
12:	learn: 0.6266462	total: 195ms	remaining: 14.8s
13:	learn: 0.6228026	total: 197ms	remaining: 13.9s
14:	learn: 0.6192540	total: 199ms	remaining: 13s
15:	learn: 0.6149805	total: 208ms	remaining: 12.8s
16:	learn: 0.6113403	total: 230ms	remaining: 13.3s
17:	learn: 0.6078649	total: 232ms	remaining: 12.7s
18:	learn: 0.6036216	total: 235ms	remaining: 12.1s
19:	le

<catboost.core.CatBoostClassifier at 0x2b77c8e3110>

In [35]:
y_pred = model.predict(X_test)

In [36]:
accuracy_score(y_test , y_pred)

0.7821229050279329

## Adaboost

In [37]:
from sklearn.ensemble import AdaBoostClassifier

In [38]:
model = AdaBoostClassifier()

In [40]:
model.fit(X_train , y_train)

0,1,2
,estimator,
,n_estimators,50
,learning_rate,1.0
,algorithm,'deprecated'
,random_state,


In [41]:
y_pred = model.predict(X_test)

In [42]:
accuracy_score(y_pred , y_test)

0.770949720670391

## LightGBM

In [47]:
pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------- ----------- 1.0/1.5 MB 5.6 MB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.5 MB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.2 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [48]:
from lightgbm import LGBMClassifier

In [49]:
model = LGBMClassifier()

In [50]:
model.fit(X_train , y_train)

[LightGBM] [Info] Number of positive: 271, number of negative: 441
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 373
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.380618 -> initscore=-0.486926
[LightGBM] [Info] Start training from score -0.486926


[WinError 2] The system cannot find the file specified
  File "C:\ProgramData\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\ProgramData\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [51]:
y_pred = model.predict(X_test)

In [52]:
accuracy_score(y_pred , y_test)

0.7653631284916201

## KNN

In [53]:
from sklearn.neighbors import KNeighborsClassifier

In [54]:
model = KNeighborsClassifier()

In [55]:
model.fit(X_train , y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [56]:
y_pred = model.predict(X_test)

In [57]:
accuracy_score(y_pred , y_test)

0.6145251396648045