**STEP 1:** Preliminary language-specific commands

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics, datasets
from sklearn.ensemble import RandomForestRegressor

**STEP 2:** Load the data

In [2]:
input_data, output_data = datasets.fetch_california_housing(return_X_y=True)

# read the data
print("input data")
display(pd.DataFrame(input_data))
print("\noutput data")
display(pd.DataFrame(output_data))

input data


Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32



output data


Unnamed: 0,0
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422
...,...
20635,0.781
20636,0.771
20637,0.923
20638,0.847


**STEP 3:** Shuffle the samples and split into train and test

In [3]:
[train_in, test_in, train_out, test_out] = model_selection.train_test_split(input_data, output_data, test_size=.2)

# display the split data
print("\ntraining input data")
display(pd.DataFrame(train_in))

print("\ntraining output data")
display(pd.DataFrame(train_out))

print("\ntesting input data")
display(pd.DataFrame(test_in))

print("\ntesting output data")
display(pd.DataFrame(test_out))


training input data


Unnamed: 0,0,1,2,3,4,5,6,7
0,1.3942,38.0,3.566138,0.941799,701.0,3.708995,38.68,-121.76
1,5.7407,25.0,7.405882,1.082353,599.0,3.523529,33.98,-117.96
2,2.3833,52.0,4.638132,1.124514,772.0,3.003891,37.80,-122.23
3,2.6944,36.0,5.333333,1.052632,647.0,2.837719,40.54,-122.38
4,5.3991,29.0,6.380282,1.039003,2784.0,3.016251,37.59,-122.48
...,...,...,...,...,...,...,...,...
16507,2.6111,13.0,3.984010,1.044280,2751.0,3.383764,33.88,-117.59
16508,5.2815,30.0,6.392435,1.007092,1289.0,3.047281,33.69,-117.91
16509,4.5750,36.0,4.987915,1.039275,1062.0,3.208459,37.37,-121.98
16510,5.5368,31.0,6.405959,1.068901,1538.0,2.864060,38.33,-122.23



training output data


Unnamed: 0,0
0,0.69400
1,3.02200
2,1.46900
3,0.75300
4,2.73000
...,...
16507,1.07000
16508,2.29500
16509,2.15400
16510,3.25900



testing input data


Unnamed: 0,0,1,2,3,4,5,6,7
0,6.5069,4.0,6.062900,1.034115,2677.0,2.853945,33.68,-117.64
1,4.0602,44.0,4.811287,1.031746,1227.0,2.164021,34.10,-118.28
2,4.2989,34.0,4.787659,1.052632,1150.0,2.087114,34.16,-118.40
3,7.7450,4.0,7.508065,1.173387,685.0,2.762097,33.10,-117.23
4,3.4013,52.0,4.613158,1.000000,982.0,2.584211,38.61,-122.86
...,...,...,...,...,...,...,...,...
4123,4.9618,7.0,6.153906,1.016241,4460.0,3.449343,38.00,-121.72
4124,2.2917,37.0,4.921875,1.118750,933.0,2.915625,37.76,-122.18
4125,4.7500,20.0,6.100000,1.036957,1369.0,2.976087,38.47,-122.66
4126,5.7260,41.0,5.267241,1.020115,847.0,2.433908,33.97,-118.37



testing output data


Unnamed: 0,0
0,2.434
1,3.240
2,3.647
3,2.374
4,1.833
...,...
4123,1.565
4124,1.070
4125,1.905
4126,2.878


**STEP 4:** Determine the hyperparameters (if any)

In [4]:
model = RandomForestRegressor()

**STEP 5:** Train the model



In [5]:
model.fit(train_in, train_out)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

**STEP 6:** Predict training outputs

In [6]:
pred_train_out = model.predict(train_in)

# compare predicted values (left) against actual values (right)
train_compare = pd.DataFrame((np.transpose((np.vstack((pred_train_out,np.transpose(train_out)))))))
train_compare.columns = ["Predicted", "Actual"] 
display(train_compare)

Unnamed: 0,Predicted,Actual
0,0.781200,0.69400
1,2.910900,3.02200
2,1.371700,1.46900
3,0.855600,0.75300
4,2.896600,2.73000
...,...,...
16507,1.088600,1.07000
16508,2.220200,2.29500
16509,2.249200,2.15400
16510,3.148500,3.25900


**STEP 7:** Evaluate the training data

In [7]:
eval_method = "Mean Squared Error"
train_score = metrics.mean_squared_error(train_out, pred_train_out)

train_results = ["Training " + eval_method + " (%): ", 100 * train_score]
print(*train_results, sep="\n")

Training Mean Squared Error (%): 
5.065548963087064


**STEP 8:** Predict test outputs

In [8]:
pred_test_out = model.predict(test_in)

# compare predicted values (left) against actual values (right)
test_compare = pd.DataFrame((np.transpose((np.vstack((pred_test_out,np.transpose(test_out)))))))
test_compare.columns = ["Predicted", "Actual"] 
display(test_compare)

Unnamed: 0,Predicted,Actual
0,2.790600,2.434
1,3.545800,3.240
2,3.293301,3.647
3,3.658601,2.374
4,1.760900,1.833
...,...,...
4123,1.636800,1.565
4124,0.920400,1.070
4125,2.143800,1.905
4126,3.586701,2.878


**STEP 9:** Get the testing score

In [9]:
test_score = metrics.mean_squared_error(test_out, pred_test_out)

test_results = ["Testing " + eval_method + " (%): ", 100 * test_score]
print(*test_results, sep='\n')

Testing Mean Squared Error (%): 
30.21527530271869


**STEP 10:** Save evaluation results to a file

In [10]:
# training and testing results
results = np.array([train_results,test_results])
results_file = pd.DataFrame(results)

# filepath to "Saved Files" folder
savedir = "Saved Files" + os.sep
# export evaluation results
results_file.to_csv(savedir + eval_method + ".csv", index = False, header = False)
# export training outputs
train_compare.to_csv(savedir + "Training Outputs.csv", index = False, header = ["Predicted", "Actual"])
# export test outputs
test_compare.to_csv(savedir + "Test Outputs.csv", index = False, header = ["Predicted", "Actual"])

**STEP 11:** Display results to the console

In [11]:
for elt in results: print(*elt, sep="\n")

Training Mean Squared Error (%): 
5.065548963087064
Testing Mean Squared Error (%): 
30.21527530271869
