In [149]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

In [157]:
# Load the raw training and test splits from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Both frames share the same feature engineering, so keep them in one list.
combine = [train_df, test_df]

# Pull the honorific out of each name: the run of letters that follows a space
# and is terminated by a period. The pattern is a raw string so that `\.`
# reaches the regex engine as an escaped dot instead of being an invalid
# Python string escape.
for dataset in combine:
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Sanity check: how the extracted titles split across the two sexes.
pd.crosstab(train_df['Title'], train_df['Sex'])


Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [151]:
# Per-column most frequent value of the training frame; the first row of the
# mode table is used as the fill value for missing entries.
modes = train_df.mode(axis=0, numeric_only=False, dropna=True).iloc[0]

def proc_data(df, fill_values=None):
    """Impute missing values and add derived columns to ``df`` in place.

    Steps, in order:
      1. Missing ``Fare`` values become 0.
      2. Every remaining missing value is filled from ``fill_values``
         (matched by column name).
      3. ``LogFare`` is added as ``log1p(Fare)``.
      4. ``Embarked`` and ``Sex`` are converted to pandas categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``Fare``, ``Embarked`` and ``Sex`` columns.
        It is modified in place.
    fill_values : pandas.Series or dict, optional
        Per-column replacement for missing values. When omitted, the
        notebook-level ``modes`` series is used, which keeps existing
        ``proc_data(df)`` calls working unchanged.

    Returns
    -------
    None
        The result is written back into ``df``.
    """
    if fill_values is None:
        # Fall back to the notebook-global modes computed from the training data.
        fill_values = modes

    # Fare is handled first so a missing fare becomes 0 rather than the mode.
    df['Fare'] = df['Fare'].fillna(0)
    df.fillna(fill_values, inplace=True)

    # log1p compresses the long right tail of the fare distribution.
    df['LogFare'] = np.log1p(df['Fare'])

    # Categoricals store each label together with an integer code.
    df['Embarked'] = pd.Categorical(df['Embarked'])
    df['Sex'] = pd.Categorical(df['Sex'])


In [152]:
# Impute missing values and add LogFare on both splits (mutates the frames in place).
proc_data(train_df)
proc_data(test_df)

# Regex alternation of the recognised honorifics. The dots are escaped so that
# each alternative matches the literal title (e.g. "Col.") only; an unescaped
# "." matches any character, which made 'Col.' also match surnames starting
# with "Cole", "Coll", etc.
titles = r'Mr\.|Miss\.|Mrs\.|Master\.|Major\.|Countess|Capt\.|Lady\.|Col\.|Dr\.|Rev\.'

# Columns that are removed before the frames are handed to the model.
unused_cols = ['PassengerId', 'Cabin', 'Ticket', 'Embarked', 'Name']

# Keep the target and the submission ids before their columns are dropped.
# NOTE: this cell rebinds train_df/test_df, so it must be run exactly once
# after the data has been (re)loaded.
y_train = train_df['Survived']
test_passengerId = test_df["PassengerId"]

# Title becomes a 0/1 flag: 1 when the name carries one of the honorifics above.
train_df['Title'] = train_df['Name'].str.contains(titles, regex=True).astype(int)
test_df['Title'] = test_df['Name'].str.contains(titles, regex=True).astype(int)

# The training frame additionally loses the target column.
train_df = train_df.drop(columns=unused_cols + ['Survived'])
test_df = test_df.drop(columns=unused_cols)

# Encode Sex as 1 for 'male' and 0 otherwise.
train_df['Sex'] = (train_df['Sex'] == 'male').astype(int)
test_df['Sex'] = (test_df['Sex'] == 'male').astype(int)

X_train = train_df
X_test = test_df

X_test


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,LogFare,Title
0,3,1,34.5,0,0,7.8292,2.178064,1
1,3,0,47.0,1,0,7.0000,2.079442,1
2,2,1,62.0,0,0,9.6875,2.369075,1
3,3,1,27.0,0,0,8.6625,2.268252,1
4,3,0,22.0,1,1,12.2875,2.586824,1
...,...,...,...,...,...,...,...,...
413,3,1,24.0,0,0,8.0500,2.202765,1
414,1,0,39.0,0,0,108.9000,4.699571,0
415,3,1,38.5,0,0,7.2500,2.110213,1
416,3,1,24.0,0,0,8.0500,2.202765,1


In [153]:
# Fixed seed shared by the forest and the search so that the sampled
# hyperparameters, the fitted trees and therefore the selected model are the
# same on every Restart & Run All.
RANDOM_STATE = 42

# Search space: scipy's randint draws integers from [low, high).
param_dist = {'n_estimators': randint(50, 500),
              'max_depth': randint(1, 20)}

# Create a random forest classifier
rf = RandomForestClassifier(random_state=RANDOM_STATE)

# Use random search to find the best hyperparameters:
# 5 sampled candidates, each scored with 5-fold cross-validation.
rand_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    random_state=RANDOM_STATE,
)

# Fit the random search object to the data
rand_search.fit(X_train, y_train)

In [154]:
# Report which sampled hyperparameter combination the search selected.
print('Best hyperparameters:',  rand_search.best_params_)

# Keep a handle on the estimator the search exposes as its best one.
best_rf = rand_search.best_estimator_

Best hyperparameters: {'max_depth': 8, 'n_estimators': 124}


In [155]:
# Predict survival for the test passengers with the selected forest.
y_pred = best_rf.predict(X_test)

# Assemble the two-column submission (id first, prediction second) and
# write it to disk without the index column.
output = pd.DataFrame({'PassengerId': test_passengerId}).assign(Survived=y_pred)
output.to_csv(path_or_buf='submission.csv', index=False)