### Work Flow
Data ---> Data Preprocessing ---> Feature Engineering ---> Data Analysis ---> Model Selection & Evaluation

In [1]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Data

In [2]:
# loading the data
titanic = pd.read_csv('train.csv')

# checking the size of the dataframe
print(f"Shape of dataframe: {titanic.shape}")

# print the first five rows of the dataframe
titanic.head()

Shape of dataframe: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Data Preprocessing

###### perform some basic data preprocessing

In [3]:
# finding some information on the dataframe
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# drop unnecessary columns
titanic = titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [5]:
# check for missing values
titanic.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [6]:
# fill missing age values with the median value
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

In [7]:
# fill missing embarked values with mode
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

### Feature Engineering

###### performing feature engineering

In [8]:
# Add a new feature for family size
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1

In [9]:
# Add a new feature for alone/not alone
titanic['IsAlone'] = 0

In [10]:
# Set IsAlone to 1 if family size is 1
titanic.loc[titanic['IsAlone'] == 1, 'IsAlone'] = 1

In [11]:
# converting categorical columns
titanic = titanic.replace({'Sex':{'male':0, 'female':1}, 'Embarked':{'S':0, 'C':1, 'Q':2}})

### Model Selection

In [12]:
### Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

In [13]:
# split the data into features (X) and target (y)
X = titanic.drop(['Survived', 'Parch', 'SibSp', 'Fare'], axis=1)
y = titanic['Survived']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# peform feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Model 1
# Normal Random Forest Classifier

# train the random forest classifier
model_1 = RandomForestClassifier(n_estimators=100, random_state=42)
model_1.fit(X_train, y_train)

# make predictions on the test set
y_pred = model_1.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

# using the confusion matrix
print(confusion_matrix(y_test, y_pred))

Accuracy: 82.68156424581005
[[92 13]
 [18 56]]


In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Model 2
# Random Forest Classifer 1 with hyperparameter tuning

# define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# create the random forest classifier
model_2 = RandomForestClassifier(random_state=42)

# perform grid search with corss-validation
grid_search = GridSearchCV(estimator=model_2, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

# use the best parameters to train the final model
final_model = RandomForestClassifier(random_state=42, **best_params)
final_model.fit(X_train, y_train)

# make predictions on the test set using the final model
y_pred = final_model.predict(X_test)

# Evaluate the final model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

# using the confusion matrix
print(confusion_matrix(y_test, y_pred))

Best Parameters: {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.8300305328474341
Accuracy: 81.56424581005587
[[95 10]
 [23 51]]


In [17]:
# Model 3
# Random Forest Classifer 2 with hyperparameter tuning

# define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# create the random forest classifier
model = RandomForestClassifier(random_state=42)

# perform grid search with corss-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

# use the best parameters to train the final model
final_model = RandomForestClassifier(random_state=42, **best_params)
final_model.fit(X_train_scaled, y_train)

# make predictions on the test set using the final model
y_pred = final_model.predict(X_test_scaled)

# Evaluate the final model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

# using the confusion matrix
print(confusion_matrix(y_test, y_pred))

Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.8286220821432089
Accuracy: 81.56424581005587
[[95 10]
 [23 51]]


In [18]:
# Model 4
# XGBoost Classifer with hyperparameter tuning
from xgboost import XGBClassifier

# define the parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# create the XGBoost model
model = XGBClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

# use the best parameters to train the final model
final_model = XGBClassifier(**best_params)
final_model.fit(X_train_scaled, y_train)

# make predictions on the test set using the final model
y_pred = final_model.predict(X_test_scaled)

# Evaluate the final model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

# using the confusion matrix
print(confusion_matrix(y_test, y_pred))

Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best Score: 0.8328277356446371
Accuracy: 82.68156424581005
[[96  9]
 [22 52]]


In [19]:
# Model 5
# Logistic Regression 1 

# load the model
model = LogisticRegression()

# training the logistic regression model with training data
model.fit(X_train, y_train)

# make predictions on training data
predictions_train = model.predict(X_train)

# accuracy score on training data
accuracy_train = accuracy_score(y_train, predictions_train)
print(f"Accuracy score of our model on training data: {accuracy_train*100}")

# make predictions on testing data
predictions_test = model.predict(X_test)

# accuracy score on training data
accuracy_test = accuracy_score(y_test, predictions_test)
print(f"Accuracy score of our model on test data: {accuracy_test*100}")

# using confusion matrix on training data
confusion_matrix(y_train, predictions_train)

# using confusion matrix on testing data
confusion_matrix(y_test, predictions_test)

Accuracy score of our model on training data: 80.33707865168539
Accuracy score of our model on test data: 81.56424581005587


array([[92, 13],
       [20, 54]], dtype=int64)

In [20]:
# Model 6
# Logistic Regression 2 with hyperparameter tuning

# define the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none']
}

# create the logistic regression model
model = LogisticRegression()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

# use the best parameters to train the final model
final_model = LogisticRegression(**best_params)
final_model.fit(X_train_scaled, y_train)

# make predictions on the test set using the final model
y_pred = final_model.predict(X_test_scaled)

# Evaluate the final model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

# using the confusion matrix
print(confusion_matrix(y_test, y_pred))

Best Parameters: {'C': 0.01, 'penalty': 'l2'}
Best Score: 0.8089727174234216
Accuracy: 80.44692737430168
[[93 12]
 [23 51]]


In [21]:
# Model 7
# Naive Bayes with hyperparameter tuning
from sklearn.naive_bayes import GaussianNB

# for naive bayes, there are no hyperparameters to tune 

# Use the best parameters (not applicable for Gaussian Naive Bayes) to train the final model
final_model = GaussianNB()
final_model.fit(X_train_scaled, y_train)

# Make predictions on the test set using the final model
y_pred = final_model.predict(X_test_scaled)

# Evaluate the final model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

# using the confusion matrix
print(confusion_matrix(y_test, y_pred))

Accuracy: 76.53631284916202
[[84 21]
 [21 53]]
