In [22]:
import numpy as np
import pandas as pd
import csc665.features as ft
import csc665.metrics as mt
from csc665.ensemble import RandomForestRegressor as our_rfr
from sklearn.ensemble import RandomForestRegressor as sk_rfr

In [23]:
def preprocess_train(csv_df, target_col_name):
    """Prepare a training frame and separate the features from the target.

    The work is done on a copy of ``csv_df``: every missing value is replaced
    with 0, then all non-numeric columns are handed to
    ``ft.create_categories`` (presumably it encodes them as numbers in
    place -- confirm against csc665.features).

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Raw table, including the target column.
    target_col_name : str
        Label of the column to predict.

    Returns
    -------
    tuple
        ``(features, target)``: the processed frame without
        ``target_col_name``, and the ``.values`` of that column.

    NOTE(review): the category encoding is derived from this frame alone;
    verify that it agrees with the encoding applied to the test frame.
    """
    prepared = csv_df.copy().fillna(0)

    # Every column whose dtype is not numeric gets the categorical treatment.
    non_numeric_cols = list(prepared.select_dtypes(exclude='number').columns)
    ft.create_categories(prepared, non_numeric_cols)

    # Pull the target out, then keep the remaining columns as the features.
    target_values = prepared[target_col_name].values
    feature_frame = prepared.drop(columns=target_col_name)

    return feature_frame, target_values

In [24]:
def preprocess_test(csv_df):
    """Prepare a frame that has no target column so it can be fed to a model.

    The work is done on a copy of ``csv_df``: every missing value is replaced
    with 0, then all non-numeric columns are handed to
    ``ft.create_categories`` (presumably it encodes them as numbers in
    place -- confirm against csc665.features).

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Raw table to convert.

    Returns
    -------
    pandas.DataFrame
        The processed copy, with the same columns as the input.

    NOTE(review): the category encoding is derived from this frame alone;
    verify that it agrees with the encoding used for the training frame.
    """
    prepared = csv_df.copy().fillna(0)

    # Iterating a DataFrame yields its column labels.
    non_numeric_cols = [label for label in prepared.select_dtypes(exclude='number')]
    ft.create_categories(prepared, non_numeric_cols)

    return prepared

In [25]:
def rmse_log(y_predicted, y_true):
    """Return ``mt.rmse`` evaluated on the natural logs of both inputs.

    Parameters
    ----------
    y_predicted : array-like
        Predicted values.
    y_true : array-like
        Actual values.

    Returns
    -------
    Whatever ``mt.rmse`` returns for ``(log(y_predicted), log(y_true))``
    (presumably the root-mean-squared error -- confirm in csc665.metrics).
    """
    log_predicted = np.log(y_predicted)
    log_true = np.log(y_true)
    return mt.rmse(log_predicted, log_true)

In [26]:
# Load the raw training and test tables. The paths are relative, so both CSV
# files must sit in the notebook's working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [27]:
# Training data: fill NaN with 0, hand non-numeric columns to the category
# conversion, then split into the feature frame and the "SalePrice" target
# (y_train_df is a plain array of values, not a DataFrame).
x_train_df, y_train_df = preprocess_train(train_df, "SalePrice")

# Test data: same NaN filling and category conversion; nothing is split off.
# NOTE(review): the two frames are converted in separate calls -- confirm that
# ft.create_categories assigns the same codes to both.
x_test_df = preprocess_test(test_df)

# Our Random Forest Library

In [28]:
# Fit our own random forest on the full training set.
# NOTE(review): the positional arguments 10 and 0.4 are presumably the number
# of trees and a per-tree sample ratio -- confirm against csc665.ensemble.
ours = our_rfr(10, 0.4)
ours.fit(x_train_df, y_train_df)

In [29]:
# Predict and score on the training set. These are in-sample numbers (the
# same rows were passed to fit), so they do not measure generalisation.
ours_prediction_train = ours.predict(x_train_df)
# NOTE(review): mt.rsq is presumably R^2 taking (predicted, true) -- confirm.
ours_score_train = mt.rsq(ours_prediction_train, y_train_df)
# Error between log(prediction) and log(actual SalePrice).
ours_rmse_log_train = rmse_log(ours_prediction_train, y_train_df)
print("Score: ", ours_score_train)
print("Log RMSE: ", ours_rmse_log_train)

Score:  0.9318672646472856
Log RMSE:  0.11009203105972233


In [30]:
# Predict on the test set with our forest; these values are written to
# submission_665.csv in the submission cell.
ours_prediction_test = ours.predict(x_test_df)

# scikit-learn's Random Forest Library

In [31]:
# Fit scikit-learn's random forest on the same training data: 500 trees, a
# node needs at least 5 samples to be split, n_jobs=-1 uses all processors.
# NOTE(review): no random_state is passed, so the fitted forest and every
# score derived from it will differ from run to run.
sk = sk_rfr(n_estimators = 500, min_samples_split = 5, n_jobs=-1)
sk.fit(x_train_df, y_train_df)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [32]:
# Predict and score on the training set. These are in-sample numbers (the
# same rows were passed to fit), so they do not measure generalisation.
sk_prediction_train = sk.predict(x_train_df)
# For a scikit-learn regressor, score() is the R^2 of its own predictions.
sk_score_train = sk.score(x_train_df, y_train_df)
# Error between log(prediction) and log(actual SalePrice).
sk_rmse_log_train = rmse_log(sk_prediction_train, y_train_df)
print("Score: ", sk_score_train)
print("Log RMSE: ", sk_rmse_log_train)

Score:  0.9760254033416562
Log RMSE:  0.06561660839652415


In [33]:
# Predict on the test set with scikit-learn's forest; these values are
# written to submission_sk.csv in the submission cell.
sk_prediction_test = sk.predict(x_test_df)

In [34]:
# Show the predicted prices as the cell output (NumPy abbreviates long arrays).
sk_prediction_test

array([126613.10795902, 153435.51568748, 175874.86266335, ...,
       151841.71013059, 111676.84449812, 220770.9988653 ])

# Submission

In [35]:
# scikit-learn forest: pair each test-set Id with its predicted SalePrice and
# write the table to CSV without the index column.
sk_predictions = pd.DataFrame(
    {
        'Id': x_test_df['Id'],
        'SalePrice': sk_prediction_test,
    }
)
sk_predictions.to_csv('submission_sk.csv', index=False)

# Our forest: same layout, separate file.
our_predictions = pd.DataFrame(
    {
        'Id': x_test_df['Id'],
        'SalePrice': ours_prediction_test,
    }
)
our_predictions.to_csv('submission_665.csv', index=False)