In [None]:
# imports...
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# read the Titanic CSV file into a DataFrame
# (the split into X_train, X_test, y_train, y_test is done later, separately for each model)
df = pd.read_csv('titanic.csv')

# Modeling 1: only 2 features

In [None]:
# Build the frame for the first model: remove every column this model does not use,
# then discard the rows where Age or Pclass is missing.
# `df` itself is left untouched because drop() and dropna() both return new frames.
df1 = (
    df
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Sex', 'SibSp', 'Parch', 'Fare'])
    .dropna(subset=['Age', 'Pclass'])
)
df1.head()

In [None]:
# Separate the target column from the features and hold out 25% of the rows for testing
# (fixed random_state so the split is reproducible).
X = df1.drop(columns='Survived')
y = df1['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Give every split a fresh 0..n-1 index. The targets are reset together with the
# features so that X and y stay aligned by index label, not only by row position.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Fit a Random Forest (tree depth capped at 2, fixed seed) on the training split;
# fit() returns the fitted estimator, so construction and training are chained.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(X_train, y_train)

# Predict the labels of the held-out test rows
y_pred = clf.predict(X_test)

In [None]:
# Accuracy and F1 are different metrics, so each one is reported under its own label.
print(f"Accuracy score: {100*accuracy_score(y_test, y_pred):.2f}%")
print(f"F1 score: {100*f1_score(y_test, y_pred):.2f}%")

# Modeling 2: adding more features

In [None]:
# Separate the target column from the features and hold out 25% of the rows for testing
# (fixed random_state so the split is reproducible).
X = df.drop(columns='Survived')
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Give every split a fresh 0..n-1 index. The targets are reset together with the
# features so that X and y stay aligned by index label, not only by row position.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Train data preparation

# dropping columns that are not used as features
X_train = X_train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# replacing null values in Age (mean of the training split) and Embarked (placeholder category)
# bracket assignment is used: attribute-style assignment would not create/replace a column
# if the name were ever missing, it would only set a Python attribute on the frame
avg_age = X_train['Age'].mean()
X_train['Age'] = X_train['Age'].fillna(avg_age)
X_train['Embarked'] = X_train['Embarked'].fillna('unknown')

# encoding categorical data using One Hot Encoding
# the encoder is fitted on the training split only; `enc` and `avg_age` are reused on the test split
enc = OneHotEncoder(handle_unknown='ignore')
encoding = pd.DataFrame(
    enc.fit_transform(X_train[['Sex', 'Embarked']]).toarray(),
    columns=enc.get_feature_names_out(),
    index=X_train.index,  # join() aligns on index labels, so reuse X_train's own index
)
X_train = X_train.join(encoding).drop(columns=['Sex', 'Embarked'])

X_train.head()

In [None]:
# Test data preparation

# dropping columns that are not used as features
X_test = X_test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# replacing null values in Age and Embarked
# avg_age comes from the training split, so no statistic is computed on the test rows
X_test['Age'] = X_test['Age'].fillna(avg_age)
X_test['Embarked'] = X_test['Embarked'].fillna('unknown')

# encoding categorical data using One Hot Encoding
# the encoder fitted on the training split is applied as-is (transform only, no re-fit)
encoding = pd.DataFrame(
    enc.transform(X_test[['Sex', 'Embarked']]).toarray(),
    columns=enc.get_feature_names_out(),
    index=X_test.index,  # join() aligns on index labels, so reuse X_test's own index
)
X_test = X_test.join(encoding).drop(columns=['Sex', 'Embarked'])
X_test.head()

In [None]:
# Same Random Forest settings as before (max_depth=2, random_state=0),
# now trained on the enlarged feature set; construction and fit are chained.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(X_train, y_train)

# Predicted labels for the test split
y_pred = clf.predict(X_test)

In [None]:
# Accuracy and F1 are different metrics, so each one is reported under its own label.
print(f"Accuracy score: {100*accuracy_score(y_test, y_pred):.2f}%")
print(f"F1 score: {100*f1_score(y_test, y_pred):.2f}%")

# Modeling 3: Fine-tuning

In [None]:
## n_estimators
# Train one forest per candidate number of trees (other settings left at their defaults)
# and score each of them on the test split.
for est in [1, 10, 50, 100, 150, 200, 300]:
    clf = RandomForestClassifier(n_estimators=est, random_state=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # both metrics are shown, each under its own name
    print(
        f"Accuracy (n_estimators={est}): {100*accuracy_score(y_test, y_pred):.2f}%"
        f" | F1: {100*f1_score(y_test, y_pred):.2f}%"
    )

In [None]:
## max_depth
# Train one 100-tree forest per candidate maximum depth and score each on the test split.
for md in [1, 2, 10, 15, 20]:
    clf = RandomForestClassifier(n_estimators=100, max_depth=md, random_state=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # the label reports the parameter actually being swept (max_depth, via `md`);
    # both metrics are shown, each under its own name
    print(
        f"Accuracy (max_depth={md}): {100*accuracy_score(y_test, y_pred):.2f}%"
        f" | F1: {100*f1_score(y_test, y_pred):.2f}%"
    )

In [None]:
# import GridSearchCV library
from sklearn.model_selection import GridSearchCV

In [None]:
# first we need to define the list of parameters and the list of values we want to test
# the candidate values are the same ones tried one at a time in the two sweeps above
parameters = {
    'n_estimators': [1, 10, 50, 100, 150, 200, 300],
    'max_depth': [1, 2, 10, 15, 20],
}

# init random forest object (fixed seed so every candidate is reproducible)
rf = RandomForestClassifier(random_state=0)
# init grid search object: every parameter combination is evaluated with 5-fold
# cross-validation on the training data; F1 is used as the selection metric to match
# the f1_score used for evaluation elsewhere in this notebook
gs = GridSearchCV(estimator=rf, param_grid=parameters, scoring='f1', cv=5)
# fit grid search object using train data only (the test split is kept for the final check)
gs.fit(X_train, y_train)

In [None]:
# keep a handle on the estimator selected by the grid search ...
final_model = gs.best_estimator_

# ... and show the parameter combination it was selected with
print(f"Best parameters: {gs.best_params_}")

In [None]:
# making prediction on the prepared test set using the best model
y_pred = final_model.predict(X_test)

In [None]:
# scoring the tuned model on test data; accuracy and F1 are reported under their own labels
print(f"Accuracy (best params): {100*accuracy_score(y_test, y_pred):.2f}%")
print(f"F1 score (best params): {100*f1_score(y_test, y_pred):.2f}%")