# Train an H2O random forest on a cleaned copy of train.csv and print three
# AUC figures for comparison.
import pandas as pd
import numpy as np
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

train_data = pd.read_csv("train.csv")

# Row filter: keep entries that have an age, a non-zero fare and an
# embarkation value. Copy the result so the in-place drop below works on an
# independent frame rather than on a slice of train_data.
has_age = train_data['Age'].notnull()
has_nonzero_fare = train_data['Fare'] != 0.0
has_embarked = train_data['Embarked'].notnull()
clean_training_data = train_data[has_age & has_nonzero_fare & has_embarked].copy()

# 'Cabin' is removed in place from the filtered copy.
clean_training_data.drop(columns=['Cabin'], inplace=True)

# 'Name' is unique for each data entry, so it carries no predictive power.
clean_training_data_v1 = clean_training_data.drop(columns='Name')

# 'Ticket' is removed next.
clean_training_data_v2 = clean_training_data_v1.drop(columns='Ticket')
clean_training_data_v2.head()  # notebook-style preview; the result is discarded in a script

# 'Fare' is removed last (it was only needed for the row filter above).
clean_training_data_v3 = clean_training_data_v2.drop(columns='Fare')

# Start (or connect to) H2O and hand the cleaned frame over to it.
h2o.init(nthreads=-1, max_mem_size=8)
clean_h2o_data = h2o.H2OFrame(clean_training_data_v3)

# The response column must be a factor so the model is trained as a classifier.
clean_h2o_data['Survived'] = clean_h2o_data['Survived'].asfactor()

# Split into a training/validation part and a test part. The seed is fixed for
# reproducibility.
# NOTE(review): the original comment said 75% / 25%, but the ratio passed is
# 0.74 -- confirm which one is intended.
splits = clean_h2o_data.split_frame(ratios=[0.74], seed=1)
train_validation, test = splits

y_column = 'Survived'
x_columns = 'Sex'

# Random forest using 'Sex' as the only predictor.
rf_fit6 = H2ORandomForestEstimator(model_id='rf_fit6', seed=1)
rf_fit6.train(x=x_columns, y=y_column, training_frame=train_validation)

# Three AUC values: the model's stored AUC, the AUC of model_performance()
# called without a frame, and the AUC from scoring train_validation explicitly.
print(rf_fit6.auc())
print(rf_fit6.model_performance().auc())
print(rf_fit6.model_performance(train_validation).auc())

# Open question: why does the auc() value differ when the training data is
# passed in explicitly?
# Likely explanation (TODO confirm against the H2O DRF documentation): the
# metrics stored on a random forest model are computed from out-of-bag
# predictions, whereas model_performance(frame) re-scores every row of the
# frame with the full forest, so the two numbers need not agree.