In [64]:
import numpy as np
import pandas as pd
import csc665.features as ft
import csc665.metrics as mt
from csc665.ensemble import RandomForestRegressor as our_rfr
from sklearn.ensemble import RandomForestRegressor as sk_rfr

In [65]:
def preprocess_train(csv_df, target_col_name):
    
    # replace all NA values with zero
    csv_df_temp = csv_df.copy().fillna(0)
    
    # convert all strings to numbers
    string_columns = list(csv_df_temp.select_dtypes(exclude='number'))
    ft.create_categories(csv_df_temp, string_columns)
    
    # split the data frame into x and y
    csv_df_x = csv_df_temp.drop(target_col_name, axis=1)
    csv_df_y = csv_df_temp[target_col_name].values
    
    return csv_df_x, csv_df_y

In [66]:
def preprocess_test(csv_df):
    
    # replace all NA values with zero
    csv_df_temp = csv_df.copy().fillna(0)
    
    # convert all strings to numbers
    string_columns = list(csv_df_temp.select_dtypes(exclude='number'))
    ft.create_categories(csv_df_temp, string_columns)
    
    return csv_df_temp

In [67]:
def rmse_log(y_predicted, y_true):
    
    # calculate the RMSE of the natural log applied element-wise to y_predicted and y_true
    log_rmse = mt.rmse(np.log(y_predicted), np.log(y_true))
    
    return log_rmse

In [68]:
# read CSV files
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [69]:
# preprocess and split train set into x and y, replace all NaN elements with 0
x_train_df, y_train_df = preprocess_train(train_df, "SalePrice")

# convert all strings in test set to numbers, replace all NaN elements with 0
x_test_df = preprocess_test(test_df)

# Our Random Forest Library

In [70]:
# fit our random forest regressor
ours = our_rfr(10, 0.4)
ours.fit(x_train_df, y_train_df)

In [71]:
# predict and score (train set)
ours_prediction_train = ours.predict(x_train_df)
ours_score_train = mt.rsq(ours_prediction_train, y_train_df)
ours_rmse_log_train = rmse_log(ours_prediction_train, y_train_df)
print("Score: ", ours_score_train)
print("Log RMSE: ", ours_rmse_log_train)

Score:  0.9200425771722056
Log RMSE:  0.11410305302521441


In [72]:
# predict (test set)
ours_prediction_test = ours.predict(x_test_df)

# scikit-learn's Random Forest Library

In [73]:
# fit scikit-learn's random forest regressor
sk = sk_rfr(n_estimators = 400, n_jobs=-1)
sk.fit(x_train_df, y_train_df)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [74]:
# predict and score (train set)
sk_prediction_train = sk.predict(x_train_df)
sk_score_train = sk.score(x_train_df, y_train_df)
sk_rmse_log_train = rmse_log(sk_prediction_train, y_train_df)
print("Score: ", sk_score_train)
print("Log RMSE: ", sk_rmse_log_train)

Score:  0.9747474048316922
Log RMSE:  0.06863363587695807


In [75]:
# predict (test set)
sk_prediction_test = sk.predict(x_test_df)

array([125739.70423214, 155329.0080754 , 175972.13529401, ...,
       150925.53050794, 112610.79710588, 221298.77168346])

# Submission

In [77]:
sk_predictions = pd.DataFrame({'Id': x_test_df.Id, 'SalePrice': sk_prediction_test})
sk_predictions.to_csv('submission_sk.csv', index=False)
our_predictions = pd.DataFrame({'Id': x_test_df.Id, 'SalePrice': ours_prediction_test})
our_predictions.to_csv('submission_665.csv', index=False)