**STEP 1:** Preliminary language-specific commands

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics, datasets
from sklearn.ensemble import RandomForestRegressor

**STEP 2:** Load the data

In [3]:
input_data, output_data = datasets.fetch_california_housing(return_X_y=True)

# read the data
print("input data")
display(pd.DataFrame(input_data))
print("\noutput data")
display(pd.DataFrame(output_data))

input data


Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32



output data


Unnamed: 0,0
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422
...,...
20635,0.781
20636,0.771
20637,0.923
20638,0.847


**STEP 3:** Shuffle the samples and split into train and test

In [5]:
[train_in, test_in, train_out, test_out] = model_selection.train_test_split(input_data, output_data, test_size=.2)

# display the split data
print("\ntraining input data")
display(pd.DataFrame(train_in))

print("\ntraining output data")
display(pd.DataFrame(train_out))

print("\ntesting input data")
display(pd.DataFrame(test_in))

print("\ntesting output data")
display(pd.DataFrame(test_out))


training input data


Unnamed: 0,0,1,2,3,4,5,6,7
0,2.5544,8.0,2.996979,1.187311,694.0,2.096677,33.78,-118.19
1,1.7824,26.0,4.154374,1.084048,2601.0,4.461407,33.91,-118.21
2,3.8426,16.0,4.706941,1.002571,1347.0,3.462725,33.85,-118.03
3,2.9321,14.0,4.778058,1.021438,2551.0,3.216898,34.02,-118.08
4,7.5925,39.0,7.150463,0.997685,1175.0,2.719907,32.72,-117.24
...,...,...,...,...,...,...,...,...
16507,4.7143,14.0,5.757576,0.989899,839.0,2.824916,38.94,-121.07
16508,3.7500,19.0,4.473146,1.038363,969.0,2.478261,33.84,-117.95
16509,2.4234,28.0,3.023480,1.053582,2558.0,1.540036,37.83,-122.25
16510,6.3132,18.0,6.735363,0.990632,1395.0,3.266979,37.69,-121.91



training output data


Unnamed: 0,0
0,1.625
1,1.102
2,3.444
3,1.442
4,4.667
...,...
16507,1.507
16508,1.734
16509,2.185
16510,2.592



testing input data


Unnamed: 0,0,1,2,3,4,5,6,7
0,9.7037,8.0,8.946759,1.071759,1548.0,3.583333,37.76,-121.96
1,2.5847,17.0,5.764344,1.157787,1345.0,2.756148,33.78,-117.06
2,1.7120,36.0,6.307229,1.289157,651.0,3.921687,37.75,-122.18
3,3.2121,37.0,4.617211,1.062315,1304.0,1.934718,32.69,-117.18
4,4.3403,38.0,5.068000,1.360000,442.0,1.768000,32.77,-117.28
...,...,...,...,...,...,...,...,...
4123,4.3810,34.0,6.005764,0.976945,868.0,2.501441,36.15,-120.37
4124,2.7375,15.0,17.408560,3.861868,1316.0,2.560311,34.25,-116.83
4125,2.8295,52.0,4.940260,1.054545,1233.0,3.202597,34.03,-118.31
4126,3.0217,52.0,4.424242,1.040404,176.0,1.777778,38.57,-121.47



testing output data


Unnamed: 0,0
0,4.251
1,1.453
2,0.821
3,3.208
4,5.000
...,...
4123,0.863
4124,0.908
4125,1.322
4126,2.000


**STEP 4:** Determine the hyperparameters (if any)

In [6]:
model = RandomForestRegressor()

**STEP 5:** Train the model



In [7]:
model.fit(train_in, train_out)

RandomForestRegressor()

**STEP 6:** Predict training outputs

In [8]:
pred_train_out = model.predict(train_in)

# compare predicted values (left) against actual values (right)
train_compare = pd.DataFrame((np.transpose((np.vstack((pred_train_out,np.transpose(train_out)))))))
train_compare.columns = ["Predicted", "Actual"] 
display(train_compare)

Unnamed: 0,Predicted,Actual
0,1.73452,1.625
1,1.14854,1.102
2,2.86658,3.444
3,1.61753,1.442
4,4.16491,4.667
...,...,...
16507,1.61874,1.507
16508,1.79755,1.734
16509,2.22470,2.185
16510,2.71604,2.592


**STEP 7:** Evaluate the training data

In [9]:
eval_method = "mean squared error"
train_score = metrics.mean_squared_error(train_out, pred_train_out)

train_results = ["Training " + eval_method + " (%)", 100 * train_score]
print(*train_results, sep="\n")

Training mean squared error (%)
3.6403736985775237


**STEP 8:** Predict test outputs

In [10]:
pred_test_out = model.predict(test_in)

# compare predicted values (left) against actual values (right)
test_compare = pd.DataFrame((np.transpose((np.vstack((pred_test_out,np.transpose(test_out)))))))
test_compare.columns = ["Predicted", "Actual"] 
display(test_compare)

Unnamed: 0,Predicted,Actual
0,4.617255,4.251
1,1.168830,1.453
2,0.895150,0.821
3,3.745553,3.208
4,4.456495,5.000
...,...,...
4123,1.942470,0.863
4124,1.053630,0.908
4125,1.469970,1.322
4126,1.611640,2.000


**STEP 9:** Get the testing score

In [11]:
test_score = metrics.mean_squared_error(test_out, pred_test_out)

test_results = ["Testing " + eval_method + " (%)", 100 * test_score]
print(*test_results, sep='\n')

Testing mean squared error (%)
23.48267461559259


**STEP 10:** Save evaluation results to a file

In [12]:
# training and testing results
results = np.array([train_results,test_results])
results_file = pd.DataFrame(results)

pd.DataFrame(results).to_csv("Saved Files" + os.sep + eval_method + ".csv", index = False, header = False)

**STEP 11:** Display results to the console

In [13]:
for elt in results: print(*elt, sep="\n")

Training mean squared error (%)
3.6403736985775237
Testing mean squared error (%)
23.48267461559259
