In [236]:
from typing import Tuple
from sklearn.model_selection import train_test_split

import pandas as pd

# Input CSV files; the paths are relative, so they resolve against the
# notebook's current working directory.
TRAIN_PATH = "kaggle/input/titanic/train.csv"
TEST_PATH = "kaggle/input/titanic/test.csv"
GENDER_SUBMISSION_PATH = "kaggle/input/titanic/gender_submission.csv"
# Fixed seed value so that stochastic steps give comparable results on re-run.
RANDOM_STATE = 137

In [237]:
from typing import Union


def one_hot(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Replace ``column_name`` with one indicator column per distinct value.

    Every indicator column is named ``f'{column_name}_{value}'``. The input
    frame is left untouched; the returned frame has the encoded column
    removed and the indicator columns appended on the right.

    Args:
        df: Frame that contains ``column_name``.
        column_name: Name of the column to encode.

    Returns:
        A new frame with exactly the same rows and index as ``df``.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
        ValueError: If a generated indicator name collides with a column
            that is already present in ``df``.
    """
    # Prefix the values up front so the dummy columns are created with their
    # final names. A float NaN formats as "nan" and so gets its own column.
    labels = pd.Series(
        [f'{column_name}_{value}' for value in df[column_name]],
        index=df.index,
    )
    indicators = pd.get_dummies(labels)
    remaining = df.drop(columns=column_name)

    # Fail loudly on a name clash instead of producing duplicate column names.
    clashing = indicators.columns.intersection(remaining.columns)
    if len(clashing) > 0:
        raise ValueError(f'columns overlap: {list(clashing)}')

    # Both frames carry the identical index, so place them side by side.
    # `DataFrame.join` would match index labels instead, and on a non-unique
    # index that pairs every duplicate with every other one, multiplying rows.
    return pd.concat([remaining, indicators], axis=1)


def transform_name(names: pd.Series) -> pd.Series:
    """Return ``names`` unchanged.

    Currently an identity placeholder: no transformation is applied and the
    very same Series object is handed back.
    """
    return names

def print_nan_rows(df: pd.DataFrame) -> None:
    """Print every row of ``df`` that has a null value in at least one column."""
    incomplete = df.isnull().any(axis=1)
    print(df[incomplete])

def drop_non_impormative_columns(data: pd.DataFrame) -> pd.DataFrame:
    """Return ``data`` without the columns this notebook treats as non-informative.

    The removed columns are ``Ticket``, ``Name``, ``Age``, ``Cabin``,
    ``Embarked`` and ``Fare``. The input frame is not modified.

    Raises:
        KeyError: If any of those columns is missing from ``data``.
    """
    # The misspelling in the function name is kept so existing callers still work.
    unused = ['Ticket', 'Name', 'Age', 'Cabin', 'Embarked', 'Fare']
    return data.drop(columns=unused)

def transform_data(data: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw passenger frame into the feature frame used for modelling.

    First removes the columns dropped by ``drop_non_impormative_columns``,
    then one-hot encodes ``Pclass``, ``Sex``, ``SibSp`` and ``Parch`` in that
    order. Any other column is passed through unchanged.
    """
    encoded = drop_non_impormative_columns(data)

    # Encoding order determines the order of the appended indicator columns.
    for categorical in ('Pclass', 'Sex', 'SibSp', 'Parch'):
        encoded = one_hot(encoded, categorical)

    return encoded


def get_data_target(data: pd.DataFrame) -> Tuple[pd.DataFrame, Union[pd.Series, None]]:
    """Split a frame into features and the ``Survived`` target.

    Returns:
        ``(features, target)``. When ``data`` has a ``Survived`` column it is
        removed from the features and returned as the target; otherwise the
        frame is returned as-is together with ``None``.
    """
    # Unlabelled data: nothing to split off.
    if 'Survived' not in data.columns:
        return data, None

    features = data.drop(columns=['Survived'])
    return features, data['Survived']

# Load both splits indexed by PassengerId.
train = pd.read_csv(TRAIN_PATH).set_index('PassengerId')
test = pd.read_csv(TEST_PATH).set_index('PassengerId')

train = transform_data(train)
test = transform_data(test)

train_x, train_y = get_data_target(train)
test_x, _ = get_data_target(test)

# Each split is one-hot encoded on its own, so a category level that occurs
# in only one split yields a column the other split lacks. Force the test
# features onto exactly the training columns, in the training order: levels
# unseen in training are dropped, and levels absent from the test split are
# added as all-zero indicators. Dropping extras alone would still leave the
# model with missing or mis-ordered features at predict time.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)


In [238]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; every combination (5 * 5 * 3 = 75) is cross-validated.
parameters = {
    'n_estimators': [10, 100, 150, 250, 500],
    'max_depth': [1, 2, 3, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# `scoring` is passed as a list, so `refit` must name the scorer used to pick
# the final model. The forest is seeded from the notebook-wide RANDOM_STATE
# constant rather than a second copy of the literal.
clf = GridSearchCV(
    RandomForestClassifier(random_state=RANDOM_STATE),
    parameters,
    scoring=['accuracy'],
    refit='accuracy'
)
clf.fit(train_x, train_y)

GridSearchCV(estimator=RandomForestClassifier(random_state=137),
             param_grid={'max_depth': [1, 2, 3, 5, 10],
                         'min_samples_leaf': [1, 2, 4],
                         'n_estimators': [10, 100, 150, 250, 500]},
             refit='accuracy', scoring=['accuracy'])

In [239]:
# Cross-validation results ordered from best to worst accuracy rank.
results = pd.DataFrame(clf.cv_results_).sort_values(
    by=['rank_test_accuracy'], ascending=True
)

# Print each candidate's parameters followed by its mean CV accuracy.
for candidate, mean_accuracy in zip(results['params'], results['mean_test_accuracy']):
    print(candidate)
    print(mean_accuracy)

{'max_depth': 5, 'min_samples_leaf': 4, 'n_estimators': 10}
0.8024794425961961
{'max_depth': 3, 'min_samples_leaf': 1, 'n_estimators': 250}
0.8002134203753688
{'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 500}
0.7980164459230431
{'max_depth': 3, 'min_samples_leaf': 4, 'n_estimators': 250}
0.7979599522942691
{'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 10}
0.796899127487289
{'max_depth': 3, 'min_samples_leaf': 2, 'n_estimators': 250}
0.7968489109283786
{'max_depth': 5, 'min_samples_leaf': 4, 'n_estimators': 500}
0.7957692549118072
{'max_depth': 5, 'min_samples_leaf': 4, 'n_estimators': 250}
0.7957692549118072
{'max_depth': 5, 'min_samples_leaf': 4, 'n_estimators': 150}
0.7957629778419433
{'max_depth': 3, 'min_samples_leaf': 2, 'n_estimators': 10}
0.7957253154227607
{'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 100}
0.7946519364760529
{'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
0.7946519364760529
{'max_depth': 10, 'min_samples_leaf': 2, 'n_

In [240]:
from sklearn.metrics import accuracy_score

# NOTE(review): gender_submission.csv looks like Kaggle's example submission
# (a gender-only baseline), not the held-out ground truth — confirm. If so,
# the number printed below is the agreement rate with that baseline and must
# not be read as the model's real test accuracy.
#
# Labels are looked up by PassengerId (the index of test_x) instead of relying
# on both CSV files listing passengers in the same row order; a passenger
# missing from the reference file raises KeyError rather than being silently
# compared against someone else's label.
true = (
    pd.read_csv(GENDER_SUBMISSION_PATH)
    .set_index('PassengerId')
    .loc[test_x.index, 'Survived']
    .to_numpy()
)
predict = clf.predict(test_x)

print(accuracy_score(true, predict))

0.9569377990430622
