In [229]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

In [230]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

combine = [train_df, test_df]

for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    
pd.crosstab(train_df['Title'], train_df['Sex'])


Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [231]:
modes = train_df.mode().iloc[0] # Fill out the empty data

def proc_data(df):
    df['Fare'] = df.Fare.fillna(0)
    df.fillna(modes, inplace=True)
    df['LogFare'] = np.log1p(df['Fare']) # Turns it from tailed to centred distribution
    df['Embarked'] = pd.Categorical(df.Embarked) # Assign hidden indices
    df['Sex'] = pd.Categorical(df.Sex) # Assign hidden indices


In [232]:
proc_data(train_df)
proc_data(test_df)

titles = 'Mr.|Miss.|Mrs.|Master.|Major.|Countess|Capt.|Lady.|Col.|Dr.|Rev.';

y_train = train_df['Survived']

test_passengerId = test_df["PassengerId"]

# Check if the 'Name' column contains 'Mr', 'Miss', or 'Mrs'
train_df['Title'] = train_df['Name'].str.contains(titles)
test_df['Title'] = test_df['Name'].str.contains(titles)

train_df = train_df.drop('PassengerId', axis=1)
train_df = train_df.drop('Cabin', axis=1)
train_df = train_df.drop('Ticket', axis=1)
train_df = train_df.drop('Survived', axis=1)
train_df = train_df.drop('Name', axis=1)

test_df = test_df.drop('PassengerId', axis=1)
test_df = test_df.drop('Cabin', axis=1)
test_df = test_df.drop('Ticket', axis=1)
test_df = test_df.drop('Name', axis=1)

train_df['Sex'] = (train_df['Sex'] == 'male').astype(int)
test_df['Sex'] = (test_df['Sex'] == 'male').astype(int)

train_df['Pclass'] = (train_df['Pclass'] < 3).astype(int)
test_df['Pclass'] = (test_df['Pclass'] < 3).astype(int)

train_df['female_S'] = ((train_df['Sex'] == 0) & (train_df['Embarked'] == 'S')).astype(int)
train_df['female_Q'] = ((train_df['Sex'] == 0) & (train_df['Embarked'] == 'Q')).astype(int)
train_df['male_C'] = ((train_df['Sex'] == 1) & (train_df['Embarked'] == 'C')).astype(int)

test_df['female_S'] = ((test_df['Sex'] == 0) & (test_df['Embarked'] == 'S')).astype(int)
test_df['female_Q'] = ((test_df['Sex'] == 0) & (test_df['Embarked'] == 'Q')).astype(int)
test_df['male_C'] = ((test_df['Sex'] == 1) & (test_df['Embarked'] == 'C')).astype(int)

train_df = train_df.drop('Embarked', axis=1)
test_df = test_df.drop('Embarked', axis=1)

# Convert the boolean values to integers (0 or 1)
train_df['Title'] = train_df['Title'].astype(int)
test_df['Title'] = test_df['Title'].astype(int)

X_train = train_df
X_test = test_df

X_train.describe()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title,LogFare,female_S,female_Q,male_C
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.448934,0.647587,28.56697,0.523008,0.381594,32.204208,0.994388,2.962246,0.230079,0.040404,0.106622
std,0.497665,0.47799,13.199572,1.102743,0.806057,49.693429,0.074743,0.969048,0.421119,0.197016,0.308805
min,0.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,22.0,0.0,0.0,7.9104,1.0,2.187218,0.0,0.0,0.0
50%,0.0,1.0,24.0,0.0,0.0,14.4542,1.0,2.737881,0.0,0.0,0.0
75%,1.0,1.0,35.0,1.0,0.0,31.0,1.0,3.465736,0.0,0.0,0.0
max,1.0,1.0,80.0,8.0,6.0,512.3292,1.0,6.240917,1.0,1.0,1.0


In [233]:
param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

# Create a random forest classifier
rf = RandomForestClassifier()

# Use random search to find the best hyperparameters
rand_search = RandomizedSearchCV(rf, param_distributions = param_dist, n_iter=5, cv=5)

# Fit the random search object to the data
rand_search.fit(X_train, y_train)

In [234]:
# Create a variable for the best model
best_rf = rand_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'max_depth': 8, 'n_estimators': 202}


In [235]:
# Generate predictions with the best model
y_pred = best_rf.predict(X_test)

output = pd.DataFrame({'PassengerId': test_passengerId, 'Survived': y_pred})
output.to_csv('submission.csv', index=False)