In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import seaborn as sns
import pandas as pd

# Load data
df = sns.load_dataset('titanic')

# Check for missing values
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


In [12]:
# Impute missing values using KNNImputer
imputer = KNNImputer(n_neighbors=5)
df[['age']] = imputer.fit_transform(df[['age']])
df[['fare']] = imputer.fit_transform(df[['fare']])

# Use LabelEncoder for categorical variables
le = LabelEncoder()
df['embark_town'] = le.fit_transform(df['embark_town'])
df['embarked'] = le.fit_transform(df['embarked'])
df['sex'] = le.fit_transform(df['sex'])

# encode the categorical variables using for loop where object and categoy datatypes are given
for col in df.columns:
    if df[col].dtype == 'object' or df[col].dtype.name == 'category':
        df[col] = LabelEncoder().fit_transform(df[col])

# Drop unnecessary columns
df = df.drop(['deck'], axis=1)

# Split data into training and testing sets
X = df.drop('survived', axis=1)
y = df['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#######################################################################
# Define pipeline with ---> RandomForestClassifier
pipeline_rf = Pipeline([
    ('pca', PCA(n_components=5)),
    ('clf', RandomForestClassifier())
])

# Define pipeline with ---> LogisticRegression
pipeline_logreg = Pipeline([
    ('pca', PCA(n_components=5)),
    ('logreg', LogisticRegression())
])

# Define hyperparameter tuning space for RandomForestClassifier
param_grid_rf = {
    'clf__n_estimators': [10, 50, 100, 200],
    'clf__max_depth': [None, 5, 10],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 5, 10]
}

# Define hyperparameter tuning space for LogisticRegression
param_grid_logreg = {
    'logreg__C': [0.1, 1, 10],
    'logreg__penalty': ['l2']
}
#######################################################################

In [13]:
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("X_train:", X_train.head())
print("y_train:", y_train.head())

Shape of X_train: (712, 13)
Shape of y_train: (712,)
X_train:      pclass  sex   age  sibsp  parch     fare  embarked  class  who  \
331       1    1  45.5      0      0  28.5000         2      0    1   
733       2    1  23.0      0      0  13.0000         2      1    1   
382       3    1  32.0      0      0   7.9250         2      2    1   
704       3    1  26.0      1      0   7.8542         2      2    1   
813       3    0   6.0      4      2  31.2750         2      2    0   

     adult_male  embark_town  alive  alone  
331        True            2      0   True  
733        True            2      0   True  
382        True            2      0   True  
704        True            2      0  False  
813       False            2      0  False  
y_train: 331    0
733    0
382    0
704    0
813    0
Name: survived, dtype: int64


In [14]:
X_train.T.apply(lambda x: x.unique(), axis=1)

pclass                                                 [1, 2, 3]
sex                                                       [1, 0]
age            [45.5, 23.0, 32.0, 26.0, 6.0, 24.0, 45.0, 29.0...
sibsp                                      [0, 1, 4, 3, 2, 8, 5]
parch                                      [0, 2, 1, 6, 4, 3, 5]
fare           [28.5, 13.0, 7.925, 7.8542, 31.275, 247.5208, ...
embarked                                            [2, 0, 1, 3]
class                                                  [0, 1, 2]
who                                                    [1, 0, 2]
adult_male                                         [True, False]
embark_town                                         [2, 0, 1, 3]
alive                                                     [0, 1]
alone                                              [True, False]
dtype: object

In [15]:
# Perform randomized search with cross-validation for RandomForestClassifier
random_search_rf = RandomizedSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='accuracy', n_iter=10)
random_search_rf.fit(X_train, y_train)

In [17]:
# Perform randomized search with cross-validation for LogisticRegression
random_search_logreg = GridSearchCV(pipeline_logreg, param_grid_logreg, cv=5 )
random_search_logreg.fit(X_train, y_train)

In [18]:
# Print best parameters and best score for RandomForestClassifier
print("Best Parameters (RF): ", random_search_rf.best_params_)
print("Best Score (RF): ", random_search_rf.best_score_)

# Print best parameters and best score for LogisticRegression
print("Best Parameters (LogReg): ", random_search_logreg.best_params_)
print("Best Score (LogReg): ", random_search_logreg.best_score_)

Best Parameters (RF):  {'clf__n_estimators': 200, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 1, 'clf__max_depth': None}
Best Score (RF):  0.8693686595095047
Best Parameters (LogReg):  {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Score (LogReg):  0.7668964837978922
