In [83]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score 



In [84]:
titanic = sns.load_dataset("titanic")
titanic.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [85]:
#DT, rf, gb  survived == 1 alive
X = titanic[["pclass", "sex", "age", "sibsp", "parch"]]
y = titanic["survived"]
X = pd.get_dummies(titanic, columns= ["sex"], drop_first=True)


In [86]:
X["age"].fillna(X["age"].median(), inplace = True)
print(X.head())

   survived  pclass   age  sibsp  parch     fare embarked  class    who  \
0         0       3  22.0      1      0   7.2500        S  Third    man   
1         1       1  38.0      1      0  71.2833        C  First  woman   
2         1       3  26.0      0      0   7.9250        S  Third  woman   
3         1       1  35.0      1      0  53.1000        S  First  woman   
4         0       3  35.0      0      0   8.0500        S  Third    man   

   adult_male deck  embark_town alive  alone  sex_male  
0        True  NaN  Southampton    no  False      True  
1       False    C    Cherbourg   yes  False     False  
2       False  NaN  Southampton   yes   True     False  
3       False    C  Southampton   yes  False     False  
4        True  NaN  Southampton    no   True      True  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X["age"].fillna(X["age"].median(), inplace = True)


In [87]:
print("Data Types:")
print(X.dtypes)
print("Preprocessed Data:")
print(X.head())

Data Types:
survived          int64
pclass            int64
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
sex_male           bool
dtype: object
Preprocessed Data:
   survived  pclass   age  sibsp  parch     fare embarked  class    who  \
0         0       3  22.0      1      0   7.2500        S  Third    man   
1         1       1  38.0      1      0  71.2833        C  First  woman   
2         1       3  26.0      0      0   7.9250        S  Third  woman   
3         1       1  35.0      1      0  53.1000        S  First  woman   
4         0       3  35.0      0      0   8.0500        S  Third    man   

   adult_male deck  embark_town alive  alone  sex_male  
0        True  NaN  Southampton    no  False      True  
1       False    C    Cherbour

In [88]:
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size = 0.2
)

In [89]:
print("X_train shape before scaling:", train_X.shape)  
print("X_test shape before scaling:", test_X.shape)    
print("y_train shape:", train_y.shape)                 
print("y_test shape:", test_y.shape)  

X_train shape before scaling: (712, 15)
X_test shape before scaling: (179, 15)
y_train shape: (712,)
y_test shape: (179,)


In [90]:
standardScaler = StandardScaler()

train_X = train_X.select_dtypes(include=['float64', 'int64'])
test_X = test_X.select_dtypes(include=['float64', 'int64'])

standardScaler.fit(train_X)

print(standardScaler.mean_)
print(standardScaler.scale_)

train_scaled = standardScaler.transform(train_X)
test_scaled = standardScaler.transform(test_X)


[ 0.38764045  2.30477528 29.01287921  0.50702247  0.38342697 31.75434115]
[ 0.48721179  0.83704919 12.9130898   1.0358496   0.8144796  47.83182304]


In [91]:
param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'max_depth': [3, 5, 7]
}

In [92]:

dt_model = DecisionTreeClassifier()
rf_model = RandomForestClassifier()
gb_model = GradientBoostingClassifier()

dt_model.fit(train_scaled, train_y)
rf_model.fit(train_scaled, train_y)
gb_model.fit(train_scaled, train_y)

dt_predictions = dt_model.predict(test_scaled)
rf_predictions = rf_model.predict(test_scaled)
gb_predictions = gb_model.predict(test_scaled)

dt_accuracy = accuracy_score(test_y, dt_predictions)
rf_accuracy = accuracy_score(test_y, rf_predictions)
gb_accuracy = accuracy_score(test_y, gb_predictions)

In [94]:
print(f"\nDT ACC: {dt_accuracy:.4f}")
print(f"RF ACC: {rf_accuracy:.4f}")
print(f"GB ACC: {gb_accuracy:.4f}")


DT ACC: 1.0000
RF ACC: 1.0000
GB ACC: 1.0000
