In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier

In [2]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
df = df.drop(columns=['Cabin', 'Name', 'Ticket', 'Embarked'])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.25
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.925
3,4,1,1,female,35.0,1,0,53.1
4,5,0,3,male,35.0,0,0,8.05


In [5]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,0,22.0,1,0,7.25
1,2,1,1,1,38.0,1,0,71.2833
2,3,1,3,1,26.0,0,0,7.925
3,4,1,1,1,35.0,1,0,53.1
4,5,0,3,0,35.0,0,0,8.05


In [6]:
x = df.drop(columns=['Survived'])
y = df['Survived']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [7]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train, y_train)

logreg_preds = logreg.predict(x_test)
logreg_accuracy = accuracy_score(y_test, logreg_preds)

print("Logistic Regression Accuracy:", logreg_accuracy * 100)

Logistic Regression Accuracy: 80.97014925373134


In [8]:
lr = LogisticRegression(max_iter=1000)
lr_param_dist = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}
lr_search = RandomizedSearchCV(lr, lr_param_dist, n_iter=5, cv=3, random_state=42)
lr_search.fit(x_train, y_train)
best_lr = lr_search.best_estimator_

In [9]:
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)

gb_preds = gb.predict(x_test)
gb_accuracy = accuracy_score(y_test, gb_preds)

print("Gradient Boosting Accuracy:", gb_accuracy * 100)

Gradient Boosting Accuracy: 81.71641791044776


In [10]:
gb = GradientBoostingClassifier()
gb_param_dist = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}
gb_search = RandomizedSearchCV(gb, gb_param_dist, n_iter=5, cv=3, random_state=42)
gb_search.fit(x_train, y_train)
best_gb = gb_search.best_estimator_
print("best gb:", best_gb)

best gb: GradientBoostingClassifier(n_estimators=50)


In [11]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

preds = model.predict(x_test)
accuracy = accuracy_score(y_test, preds)

print(accuracy * 100)

80.59701492537313


In [12]:
rf = RandomForestClassifier()
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(x_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   

0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'bootstrap': [True, False], 'max_depth': [None, 10, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], ...}"
,n_iter,50
,scoring,
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [13]:
print("Best Hyperparameters:", random_search.best_params_)
best_model = random_search.best_estimator_
best_preds = best_model.predict(x_test)
best_accuracy = accuracy_score(y_test, best_preds)
print("Best Model Accuracy:", best_accuracy * 100)

Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': None, 'bootstrap': False}
Best Model Accuracy: 80.97014925373134


In [14]:
estimators = [
    ('lr', best_lr),
    ('rf', RandomForestClassifier()),  # Or use best_rf if you want
    ('gb', best_gb)
]

# Create the stacking classifier
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression()
)

# Fit on training data
stacking_clf.fit(x_train, y_train)

# Predict on test data
stacking_preds = stacking_clf.predict(x_test)

# Evaluate accuracy
stacking_accuracy = accuracy_score(y_test, stacking_preds)
print("Stacking Classifier Accuracy:", stacking_accuracy * 100)

Stacking Classifier Accuracy: 82.08955223880598
