In [None]:
import numpy as np
import pandas as pd
import csc665.features as ft
import csc665.metrics as mt
from csc665.ensemble import RandomForestRegressor as our_rfr
from sklearn.ensemble import RandomForestRegressor as sk_rfr

In [None]:
def preprocess_train(csv_df, target_col_name):
    """Clean a raw training frame and separate the features from the target.

    Missing values are replaced with 0 on a new frame (the caller's frame is
    left as it was), every non-numeric column is handed to
    ``ft.create_categories``, and the target column is then split off.

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Raw training data, including the target column.
    target_col_name : str
        Name of the column to return as the target.

    Returns
    -------
    tuple
        ``(features, target)``: the processed frame without the target
        column, and the target column's ``.values``.
    """
    # fillna hands back a new frame, so the input is not modified
    filled = csv_df.fillna(0)

    # names of every column whose dtype is not numeric
    non_numeric = filled.select_dtypes(exclude='number').columns.tolist()
    ft.create_categories(filled, non_numeric)

    # separate the target from the remaining feature columns
    target = filled[target_col_name].values
    features = filled.drop(columns=target_col_name)
    return features, target

In [None]:
def preprocess_test(csv_df):
    """Clean a raw test frame the same way the training features are cleaned.

    Missing values are replaced with 0 on a new frame (the caller's frame is
    left as it was) and every non-numeric column is handed to
    ``ft.create_categories``.

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Raw test data (no target column is removed here).

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    # fillna hands back a new frame, so the input is not modified
    filled = csv_df.fillna(0)

    # names of every column whose dtype is not numeric
    non_numeric = filled.select_dtypes(exclude='number').columns.tolist()
    ft.create_categories(filled, non_numeric)

    return filled

In [None]:
def rmse_log(y_predicted, y_true):
    """Return ``mt.rmse`` of the element-wise natural logs of both inputs.

    Parameters
    ----------
    y_predicted : array-like
        Predicted values.
    y_true : array-like
        Observed values.

    Returns
    -------
    Whatever ``mt.rmse`` returns for the two log-transformed arrays.
    """
    # take the natural log of each side first, then score in log space
    log_predicted = np.log(y_predicted)
    log_true = np.log(y_true)
    return mt.rmse(log_predicted, log_true)

In [None]:
# load the raw training and test tables from the working directory
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

In [None]:
# training data: fill NaN with 0, run the non-numeric columns through
# create_categories, then separate the features from the "SalePrice" target
x_train_df, y_train_df = preprocess_train(train_df, target_col_name="SalePrice")

# test data: same cleaning steps, but there is no target column to split off
x_test_df = preprocess_test(test_df)

# Our Random Forest Library

In [None]:
# fit our random forest regressor (csc665.ensemble.RandomForestRegressor)
# on the preprocessed training features and target
# NOTE(review): 10 and 0.4 are positional hyperparameters whose meaning is not
# visible in this notebook -- confirm against the csc665.ensemble signature
ours = our_rfr(10, 0.4)
ours.fit(x_train_df, y_train_df)

In [None]:
# predict and score (train set)
# Both metrics below are computed on the same rows the forest was fitted on,
# so they describe training fit, not performance on unseen data.
ours_prediction_train = ours.predict(x_train_df)
# mt.rsq is called with (predictions, targets)
# NOTE(review): confirm that argument order matches the csc665.metrics signature
ours_score_train = mt.rsq(ours_prediction_train, y_train_df)
ours_rmse_log_train = rmse_log(ours_prediction_train, y_train_df)
print("Log RMSE: ", ours_rmse_log_train)
print("RSQ: ", ours_score_train)

In [None]:
# predict (test set)
# x_test_df is the output of preprocess_test; these predictions are only
# written to the submission file, they are not scored in this notebook
ours_prediction_test = ours.predict(x_test_df)

# scikit-learn's Random Forest Library

In [None]:
# fit scikit-learn's random forest regressor (400 trees, all CPU cores)
# random_state pins the forest's random sampling so that Restart & Run All
# rebuilds the same model and therefore the same scores and submission file
sk = sk_rfr(n_estimators=400, n_jobs=-1, random_state=42)
sk.fit(x_train_df, y_train_df)

In [None]:
# predict and score (train set)
# Both metrics below are computed on the same rows the forest was fitted on,
# so they describe training fit, not performance on unseen data.
sk_prediction_train = sk.predict(x_train_df)
# the estimator's own score() is what gets reported as "RSQ" below
sk_score_train = sk.score(x_train_df, y_train_df)
sk_rmse_log_train = rmse_log(sk_prediction_train, y_train_df)
print("Log RMSE: ", sk_rmse_log_train)
print("RSQ: ", sk_score_train)

In [None]:
# predict (test set)
# x_test_df is the output of preprocess_test; these predictions are only
# written to the submission file, they are not scored in this notebook
sk_prediction_test = sk.predict(x_test_df)

# Submission

In [None]:
# write one submission CSV per model: the test-set Id column next to that
# model's predicted SalePrice, without the DataFrame index
sk_predictions = pd.DataFrame({
    'Id': x_test_df['Id'],
    'SalePrice': sk_prediction_test,
})
sk_predictions.to_csv('submission_sk.csv', index=False)

our_predictions = pd.DataFrame({
    'Id': x_test_df['Id'],
    'SalePrice': ours_prediction_test,
})
our_predictions.to_csv('submission_665.csv', index=False)

# Final Output

In [None]:
# Log RMSE: print the lower (better) training-set value of the two models.
# The scoring cells bind ours_rmse_log_train / ours_score_train; the previous
# our_* spellings were never defined and raised NameError on a fresh kernel.
if sk_rmse_log_train < ours_rmse_log_train:
    print("Log RMSE: ", sk_rmse_log_train)
else:
    print("Log RMSE: ", ours_rmse_log_train)

# RSQ
# NOTE(review): this branch compares the R^2 scores but prints scikit-learn's
# Log RMSE under a "Log RMSE" label -- confirm what this section should report.
if sk_score_train < ours_score_train:
        print("Log RMSE: ", sk_rmse_log_train)