In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [11]:
# Using ordinal encoding
train_df = pd.read_csv("train.csv") # get the original dataframe, not the one where I used ordinal encoding
test_df = pd.read_csv("test.csv")
from sklearn.preprocessing import OneHotEncoder
features = ['Sex','Parch','Age','Pclass','Embarked', 'Fare'] # not using SibSp this time, doesn't make a difference in accuracy

# Deal with missing values
train_avg = train_df.Age.mean()
train_df.Age.fillna(train_avg, inplace=True)

test_avg = test_df.Age.mean()
test_df.Age.fillna(test_avg, inplace=True)
test_df.Fare.fillna(test_df.Fare.mean(), inplace=True)

X = train_df[features]
y = train_df['Survived']
X_train, X_valid, y_train, y_val = train_test_split(X, y, random_state = 0)


In [12]:
# Use one-hot encoder on columns sex and embarked
object_cols = ['Sex', 'Embarked']

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

X_train[object_cols]

Unnamed: 0,Sex,Embarked
105,male,S
68,female,S
253,male,S
320,male,S
706,female,S
...,...,...
835,female,C
192,female,S
629,male,Q
559,female,S


In [14]:
forest_model = RandomForestRegressor()
forest_model.fit(OH_X_train, y_train)
predictions = forest_model.predict(OH_X_valid)
predictions = [round(num, 0) for num in predictions]
predictions

[0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0

In [15]:
def accuracy(actual, pred):
    return 1 - abs(actual - pred).mean()

print(accuracy(y_val, predictions))
# predictions are a little better than the decision tree regressor
# to do for next day: XGB 

0.8161434977578476
