In [1]:
# imports...
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# read csv file and split the data into X_train, X_test, y_train, y_test
df = pd.read_csv('titanic(2).csv')

# Modeling 1: only 2 features

In [3]:
# dropping irrelevant columns and rows with null values
df1 = df.copy()
df1 = df1.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Sex', 'SibSp', 'Parch', 'Fare'], axis=1)
df1 = df1.dropna(axis=0, how='any', subset=['Age', 'Pclass'])
df1.head()

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0
2,1,3,26.0
3,1,1,35.0
4,0,3,35.0


In [4]:
X = df1.drop('Survived', axis=1)
y = df1['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train = X_train.reset_index().drop('index', axis=1) # resetting the index
X_test = X_test.reset_index().drop('index', axis=1) # resetting the index
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(535, 2) (179, 2) (535,) (179,)


In [5]:
# Training of the Machine Learning model (Random Forest)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# making prediction on the test set
y_pred = clf.predict(X_test)

In [6]:
print(f"Accuracy score: {100*f1_score(y_test, y_pred):.2f}%")

Accuracy score: 48.70%


# Modeling 2: adding more features

In [7]:
X = df.drop('Survived', axis=1)
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train = X_train.reset_index().drop('index', axis=1) # resetting the index
X_test = X_test.reset_index().drop('index', axis=1) # resetting the index
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(668, 11) (223, 11) (668,) (223,)


In [8]:
# Train data preparation

# dropping irrelevant columns
X_train =  X_train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# replacing null values in Age and Embarked
avg_age = X_train.Age.mean()
X_train.Age =  X_train.Age.fillna(avg_age)
X_train.Embarked = X_train.Embarked.fillna('unknown')

# enconding categorical data using One Hot Encoding
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[['Sex', 'Embarked']])
encoding = pd.DataFrame(enc.transform(X_train[['Sex', 'Embarked']]).toarray(), columns=enc.get_feature_names_out())
X_train = X_train.join(encoding)
X_train = X_train.drop(['Sex', 'Embarked'], axis=1)

X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_unknown
0,1,29.421343,0,0,30.5,0.0,1.0,0.0,0.0,1.0,0.0
1,3,25.0,0,0,7.05,0.0,1.0,0.0,0.0,1.0,0.0
2,2,24.0,0,2,14.5,1.0,0.0,0.0,0.0,1.0,0.0
3,3,22.0,0,0,7.5208,0.0,1.0,0.0,0.0,1.0,0.0
4,1,0.92,1,2,151.55,0.0,1.0,0.0,0.0,1.0,0.0


In [9]:
# Test data preparation

# drop irrelevant columns
X_test = X_test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# replacing null values in Age and Embarked
X_test.Age =  X_test.Age.fillna(avg_age)
X_test.Embarked = X_test.Embarked.fillna('unknown')

# enconding categorical data using One Hot Encoding
encoding = pd.DataFrame(enc.transform(X_test[['Sex', 'Embarked']]).toarray(), columns=enc.get_feature_names_out())
X_test = X_test.join(encoding)
X_test = X_test.drop(['Sex', 'Embarked'], axis=1)
X_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_unknown
0,3,29.421343,1,1,15.2458,0.0,1.0,1.0,0.0,0.0,0.0
1,2,31.0,0,0,10.5,0.0,1.0,0.0,0.0,1.0,0.0
2,3,20.0,0,0,7.925,0.0,1.0,0.0,0.0,1.0,0.0
3,2,6.0,0,1,33.0,1.0,0.0,0.0,0.0,1.0,0.0
4,3,14.0,1,0,11.2417,1.0,0.0,1.0,0.0,0.0,0.0


In [10]:
# Training of the Machine Learning model (Random Forest)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# making prediction on the test set
y_pred = clf.predict(X_test)

In [11]:
print(f"Accuracy score: {100*f1_score(y_test, y_pred):.2f}%")

Accuracy score: 72.41%


# Modeling 3 : Fine-tuning

In [12]:
## n_estimators
for est in [1, 10, 50, 100, 150, 200, 300]:
    clf = RandomForestClassifier(n_estimators=est, random_state=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print(f"Accuracy (n_estimators={est}): {100*f1_score(y_test, y_pred):.2f}%")

Accuracy (n_estimators=1): 70.45%
Accuracy (n_estimators=10): 72.51%
Accuracy (n_estimators=50): 75.00%
Accuracy (n_estimators=100): 75.00%
Accuracy (n_estimators=150): 74.16%
Accuracy (n_estimators=200): 75.00%
Accuracy (n_estimators=300): 74.29%


In [13]:
## max_depth
for md in [1, 2, 10, 15, 20]:
    clf = RandomForestClassifier(n_estimators=100, max_depth=md, random_state=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print(f"Accuracy (n_estimators={est}): {100*f1_score(y_test, y_pred):.2f}%")

Accuracy (n_estimators=300): 71.43%
Accuracy (n_estimators=300): 72.41%
Accuracy (n_estimators=300): 76.30%
Accuracy (n_estimators=300): 75.43%
Accuracy (n_estimators=300): 75.00%


In [14]:
# import GridSearchCV library
from sklearn.model_selection import GridSearchCV

In [None]:
# first we need to define the list of parameters and list of values we want to test
# For each parameters, give a list of all values you want to test (same list used in previous exercise)
parameters = {
    'n_estimators':[1, 10, 50, 100, 150, 200, 300],
    'max_depth':[1, 2, 10, 15, 20]
}

# init random forest object
rf = RandomForestClassifier(random_state=0)
# init grid search object
gs = GridSearchCV(rf, param_grid  = parameters, cv=10, scoring='accuracy')

# fit grid search object using train data
gs.fit(X_train, y_train)

In [33]:
# printing the best set of parameters found by grid search
print(f"Best parameters: {gs.best_params_}")

# getting the trained model with best performance
final_model = gs.best_estimator_

Best parameters: {'max_depth': 10, 'n_estimators': 200}


In [26]:
# making prediction on test set using best model
y_pred = final_model.predict(X_test)

In [31]:
# calculating the accuracy of the model on test data


In [32]:
 print(f"Accuracy (n_estimators={est}): {100*f1_score(y_test, y_pred):.2f}%")

Accuracy (n_estimators=300): 75.14%
