import pandas as pd
import numpy as np

# Load the raw training set from the working directory.
train_data = pd.read_csv("train.csv")


# Keep only rows with a known age, a fare different from 0.0 and a known
# port of embarkation. Work on an independent copy so that later edits never
# touch `train_data`.
rows_to_keep = (
    train_data['Age'].notna()
    & train_data['Fare'].ne(0.0)
    & train_data['Embarked'].notna()
)
clean_training_data = train_data.loc[rows_to_keep].copy()

# The 'Cabin' column is not used from here on.
clean_training_data = clean_training_data.drop(columns='Cabin')


# We are removing the 'Name' column because names are unique for each data
# entry and hence do not give us any predictive power.
clean_training_data_v1 = clean_training_data.drop(columns='Name')

# Remove the 'Ticket' column here.
clean_training_data_v2 = clean_training_data_v1.drop(columns='Ticket')
clean_training_data_v2.head()

# Finally remove the 'Fare' column as well.
clean_training_data_v3 = clean_training_data_v2.drop(columns='Fare')


# Using h2o
import h2o
# Start (or connect to) an H2O cluster. Per the H2O documentation,
# nthreads=-1 uses all available cores and an integer max_mem_size is
# interpreted as gigabytes.
h2o.init(nthreads=-1, max_mem_size=8)

# Hand the cleaned pandas frame over to H2O.
clean_h2o_data = h2o.H2OFrame(clean_training_data_v3)


# Make sure that the response column is a factor so that H2O treats the task
# as classification rather than regression.
clean_h2o_data['Survived'] = clean_h2o_data['Survived'].asfactor()

# Split to 75% training and validation, 25% for test. The ratio previously
# passed was 0.74, which did not match this documented intent. The seed is
# fixed to ensure reproducibility.
# NOTE(review): per the H2O docs split_frame assigns rows probabilistically,
# so the resulting sizes are only approximately 75% / 25%.
splits = clean_h2o_data.split_frame(ratios=[0.75], seed=1)
train_validation, test = splits


# Predictor and response columns handed to the model below.
x_columns = 'Sex'
y_column = 'Survived'


# Import H2O RF:
from h2o.estimators.random_forest import H2ORandomForestEstimator


# Random forest with a fixed seed, trained on the training/validation split
# only; the `test` split is not used here.
rf_fit6 = H2ORandomForestEstimator(seed=1, model_id='rf_fit6')
rf_fit6.train(
    training_frame=train_validation,
    y=y_column,
    x=x_columns,
)

# AUC taken from the metrics stored on the model during training.
print(rf_fit6.auc())

# model_performance() without a frame: again the metrics stored on the model.
stored_performance = rf_fit6.model_performance()
print(stored_performance.auc())

# model_performance(frame): the finished model is scored on the given frame,
# here the same data it was trained on.
rescored_performance = rf_fit6.model_performance(train_validation)
print(rescored_performance.auc())


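# ### Why does the auc() value differ when I pass in the training data?
# NOTE(review): likely because H2O's random forest stores out-of-bag (OOB)
# training metrics on the model, whereas model_performance(frame) scores every
# row with the full forest -- confirm against the H2O DRF documentation.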