In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, ConfusionMatrixDisplay

In [5]:
#Data Acquisition

df = pd.read_csv('temps.csv')
df.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,45,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,44,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,41,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,40,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,44,46,46,46,41


In [7]:
print(df.shape)
df.describe()

(348, 12)


Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103,57.238506,62.373563,59.772989,60.034483
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146,10.605746,10.549381,10.705256,15.626179
min,2016.0,1.0,1.0,35.0,35.0,45.1,35.0,41.0,46.0,44.0,28.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,54.0,48.0,53.0,50.0,47.75
50%,2016.0,6.0,15.0,62.5,62.5,58.2,62.5,56.0,61.0,58.0,60.0
75%,2016.0,10.0,23.0,71.0,71.0,69.025,71.0,66.0,72.0,69.0,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,92.0,77.0,82.0,79.0,95.0


In [10]:
#convert categorical values to integers and check for missing values
df = pd.get_dummies(df)
df.head()
print(df.isnull().sum())

year              0
month             0
day               0
temp_2            0
temp_1            0
average           0
actual            0
forecast_noaa     0
forecast_acc      0
forecast_under    0
friend            0
week_Fri          0
week_Mon          0
week_Sat          0
week_Sun          0
week_Thurs        0
week_Tues         0
week_Wed          0
dtype: int64


In [14]:
#split target variables and convert to numpy array

target = np.array(df.actual)

df = df.drop('actual', axis=1)

df = np.array(df)

print(df)
print(target)

[[2.016e+03 1.000e+00 1.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.016e+03 1.000e+00 2.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.016e+03 1.000e+00 3.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 ...
 [2.016e+03 1.200e+01 2.900e+01 ... 1.000e+00 0.000e+00 0.000e+00]
 [2.016e+03 1.200e+01 3.000e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.016e+03 1.200e+01 3.100e+01 ... 0.000e+00 0.000e+00 0.000e+00]]
[45 44 41 40 44 51 45 48 50 52 45 49 55 49 48 54 50 54 48 52 52 57 48 51
 54 56 57 56 52 48 47 46 51 49 49 53 49 51 57 62 56 55 58 55 56 57 53 51
 53 51 51 60 59 61 60 57 53 58 55 59 57 64 60 53 54 55 56 55 52 54 49 51
 53 58 63 61 55 56 57 53 54 57 59 51 56 64 68 73 71 63 69 60 57 68 77 76
 66 59 58 60 59 59 60 68 77 89 81 81 73 64 65 55 59 60 61 64 61 68 77 87
 74 60 68 77 82 63 67 75 81 77 82 65 57 60 71 64 63 66 59 66 65 66 66 65
 64 64 64 71 79 75 71 80 81 92 86 85 67 65 67 65 70 66 60 67 71 67 65 70
 76 73 75 68 69 71 78 85 79 74 73 76 76 71 68 69 76 68 74 71 74 74 77 75
 77 76 7

In [15]:
#split data into training and testing data

x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=.3)

In [16]:
#fit random forest model

rf = RandomForestRegressor(n_estimators = 1000, random_state = 40)
rf.fit(x_train, y_train)

In [32]:
#test the accuracy of the model

preds = rf.predict(x_test)

error = np.round(np.mean((abs(preds - y_test))), 2)

print("Mean error = "  + str(error) + " degrees")

error_perc = 100 * error/y_test

print("Accuracy = " + str(np.round(100 - np.mean(error_perc), 2)) + "%")


Mean error = 3.59 degrees
Accuracy = 94.16%


In [36]:
#Hyperparameter tuning 
import random

param_dist = {'n_estimators': random.sample(range(500, 1500), 5), 'max_depth': random.sample(range(1, 20), 5)}

rf_1 = RandomForestRegressor()

rand_search = RandomizedSearchCV(rf_1, param_distributions = param_dist, n_iter=5, cv=5)

rand_search.fit(x_train, y_train)

best_rf = rand_search.best_estimator_

print("Best hyperparameters: ", rand_search.best_params_)

Best hyperparameters:  {'n_estimators': 753, 'max_depth': 4}


In [37]:
preds = rand_search.predict(x_test)
error = np.round(np.mean((abs(preds - y_test))), 2)
print(error)

3.6


In [49]:
#Our tuned model is just as accurate with our test predictions
#Determine our most imporant features

best_features = list(rf.feature_importances_)

column_list = ["Year", "Month", "Day", "Temp_2", "Temp_1", "Average", "Actual", "Forecast_noaa", "Forecast_acc",
              "Forecast_under", "Friend", "Friday", "Monday", "Saturday", "Sunday", "Thursday", "Tuesday", "Wednesday"]

feature_importance = sorted(zip(column_list, best_features), key=lambda x:x[1])

print(feature_importance)



[('Year', 0.0), ('Sunday', 0.0010017356809977652), ('Tuesday', 0.0015843722270139065), ('Thursday', 0.0019438397499593602), ('Saturday', 0.002330618259726176), ('Friend', 0.0025183549613273064), ('Monday', 0.0030119059421887424), ('Friday', 0.0036170628097486855), ('Month', 0.014300614296381112), ('Actual', 0.015619078919677945), ('Temp_2', 0.017896577041419764), ('Forecast_under', 0.01828881853531538), ('Day', 0.019260786424563137), ('Forecast_acc', 0.027746452732487388), ('Forecast_noaa', 0.06539743189274028), ('Average', 0.3544048715402039), ('Temp_1', 0.4510774789862492)]
