In [2]:
#Project 1
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier


In [2]:
url = "https://github.com/datasciencedojo/datasets/raw/master/titanic.csv"
df = pd.read_csv(url)
df.head()
df.info()
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
# Handling the missing values
imputer = SimpleImputer(strategy='median')
df['Age'] = imputer.fit_transform(df[['Age']])
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])
X = df.drop('Survived', axis=1)
y = df['Survived']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [4]:
def evaluate_models(models, X_train, X_test, y_train, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        print(f'{name} Accuracy: {accuracy_score(y_test, predictions):.4f}')
        print(confusion_matrix(y_test, predictions))
        print(classification_report(y_test, predictions))
        print('-'*50)
# My Different ML models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Classifier': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}
evaluate_models(models, X_train, X_test, y_train, y_test)


Logistic Regression Accuracy: 0.8045
[[90 15]
 [20 54]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179

--------------------------------------------------
K-Nearest Neighbors Accuracy: 0.7989
[[89 16]
 [20 54]]
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

--------------------------------------------------
Support Vector Classifier Accuracy: 0.8156
[[93 12]
 [21 53]]
              precision    recall  f1-score   support

           0   

In [5]:
# Cross-validation STEP
for name, model in models.items():
    cv_scores = cross_val_score(model, X_scaled, y, cv=5)
    print(f'{name} Cross-Validation Accuracy: {np.mean(cv_scores):.4f}')
# Using RFE FOR feature selection
lr = LogisticRegression()
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X_train, y_train)
print("Selected Features:")
print(list(X.columns[rfe.support_]))

Logistic Regression Cross-Validation Accuracy: 0.7845
K-Nearest Neighbors Cross-Validation Accuracy: 0.8047
Support Vector Classifier Cross-Validation Accuracy: 0.8272
Random Forest Cross-Validation Accuracy: 0.8092
Gradient Boosting Cross-Validation Accuracy: 0.8216
Selected Features:
['Pclass', 'Sex', 'Age', 'SibSp', 'Embarked']


In [6]:
param_grid = {  #hyperparamter tuning step
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5]
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters for RandomForestClassifier:")
print(grid_search.best_params_)
best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)
print(f'Tuned RandomForestClassifier Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Best parameters for RandomForestClassifier:
{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}
Tuned RandomForestClassifier Accuracy: 0.8212
[[96  9]
 [23 51]]
              precision    recall  f1-score   support

           0       0.81      0.91      0.86       105
           1       0.85      0.69      0.76        74

    accuracy                           0.82       179
   macro avg       0.83      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179



In [None]:
#I have taken only the random forest for hyperparamter tuning 
#Project 1 