In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from dask.distributed import Client
import joblib

In [2]:
# Create a dask.distributed client with default arguments. The training cells
# further down select joblib's 'dask' backend, which presumably dispatches its
# work through this client -- confirm against the joblib/dask documentation.
client = Client()

In [3]:
# Load the cleaned pollution CSV into a pandas DataFrame, using its 'X' column as the row index.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
pollution_csv_path = 'C:/Users/2130988/Documents/DDA/cleandata/pollution.csv'
pollution = pd.read_csv(pollution_csv_path, index_col='X')

In [4]:
# Display the freshly loaded frame to eyeball its columns and a sample of rows
pollution

Unnamed: 0_level_0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin
1,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin
2,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,NNW,5.6,Aotizhongxin
3,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,NW,3.1,Aotizhongxin
4,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420763,2017,2,28,19,11.0,32.0,3.0,24.0,400.0,72.0,12.5,1013.5,-16.2,0.0,NW,2.4,Wanshouxigong
420764,2017,2,28,20,13.0,32.0,3.0,41.0,500.0,50.0,11.6,1013.6,-15.1,0.0,WNW,0.9,Wanshouxigong
420765,2017,2,28,21,14.0,28.0,4.0,38.0,500.0,54.0,10.8,1014.2,-13.3,0.0,NW,1.1,Wanshouxigong
420766,2017,2,28,22,12.0,23.0,4.0,30.0,400.0,59.0,10.5,1014.4,-12.9,0.0,NNW,1.2,Wanshouxigong


In [5]:
# 'wd' (wind direction) and 'station' are factor columns; they are left out
# because they can cause very complex random forest trees
categorical_columns = ['wd', 'station']
pollution = pollution.drop(categorical_columns, axis=1)
# Show the remaining columns
pollution

Unnamed: 0_level_0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,4.4
1,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,4.7
2,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,5.6
3,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,3.1
4,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420763,2017,2,28,19,11.0,32.0,3.0,24.0,400.0,72.0,12.5,1013.5,-16.2,0.0,2.4
420764,2017,2,28,20,13.0,32.0,3.0,41.0,500.0,50.0,11.6,1013.6,-15.1,0.0,0.9
420765,2017,2,28,21,14.0,28.0,4.0,38.0,500.0,54.0,10.8,1014.2,-13.3,0.0,1.1
420766,2017,2,28,22,12.0,23.0,4.0,30.0,400.0,59.0,10.5,1014.4,-12.9,0.0,1.2


In [6]:
# PM10 is the value the model will predict
target_column = 'PM10'
# Pull the target out as a NumPy array before removing it from the predictors
labels = np.array(pollution[target_column])
pollution = pollution.drop(columns=[target_column])
# Remember the predictor names (in column order) for the feature-importance report
factors_list = pollution.columns.tolist()
# From here on `pollution` is the predictor matrix as a plain NumPy array
pollution = np.array(pollution)

In [7]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(
    pollution,
    labels,
    test_size=0.25,
    random_state=42,
)
train_pollution, test_pollution, train_labels, test_labels = split_parts

In [8]:
# Sanity check: report the shape of every piece produced by the split
split_summary = (
    ('Training Features', train_pollution),
    ('Training Labels', train_labels),
    ('Testing Features', test_pollution),
    ('Testing Labels', test_labels),
)
for description, array in split_summary:
    print(description + ' Shape:', array.shape)

Training Features Shape: (315576, 14)
Training Labels Shape: (315576,)
Testing Features Shape: (105192, 14)
Testing Labels Shape: (105192,)


In [9]:
# Baseline model: a forest of 100 decision trees, with n_jobs=-1 so the trees are built in parallel
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# Train on the training split while joblib runs its parallel work on the dask backend
with joblib.parallel_backend('dask'):
    rf.fit(train_pollution, train_labels)

In [10]:
# Predict PM10 for the held-out test rows with the 100-tree forest
predictions = rf.predict(test_pollution)
# Element-wise absolute difference between predicted and observed PM10
errors = np.abs(predictions - test_labels)
# Print the mean absolute error (MAE). The target is PM10, not a temperature, so
# the former 'degrees.' label was wrong (presumably carried over from the adapted
# tutorial). PM10 is taken to be a concentration in ug/m^3 -- confirm against the
# data source.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'ug/m^3.')

Mean Absolute Error: 13.99 degrees.


In [11]:
# Absolute error of each test row relative to its observed PM10 value.
# NOTE(review): this divides by test_labels, so it assumes no observed value is 0 -- confirm.
relative_errors = errors / test_labels
# Per-row absolute percentage error; its mean is the MAPE
mape = 100 * relative_errors
# "Accuracy" is reported as 100 minus the MAPE
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 68.59 %.


In [12]:
# Trying to improve the model: same settings, but with 200 decision trees
# (n_jobs=-1 so the trees are built in parallel)
rf1 = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
# Train on the training split while joblib runs its parallel work on the dask backend
with joblib.parallel_backend('dask'):
    rf1.fit(train_pollution, train_labels)

In [13]:
# Predict PM10 for the held-out test rows with the 200-tree forest
predictions = rf1.predict(test_pollution)
# Element-wise absolute difference between predicted and observed PM10
errors = np.abs(predictions - test_labels)
# Print the mean absolute error (MAE). The target is PM10, not a temperature, so
# the former 'degrees.' label was wrong (presumably carried over from the adapted
# tutorial). PM10 is taken to be a concentration in ug/m^3 -- confirm against the
# data source.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'ug/m^3.')

Mean Absolute Error: 13.94 degrees.


In [14]:
# Absolute error of each test row relative to its observed PM10 value.
# NOTE(review): this divides by test_labels, so it assumes no observed value is 0 -- confirm.
relative_errors = errors / test_labels
# Per-row absolute percentage error; its mean is the MAPE
mape = 100 * relative_errors
# "Accuracy" is reported as 100 minus the MAPE
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 68.66 %.


In [18]:
# Trying to improve the model again: same settings, but with 300 decision trees
# (n_jobs=-1 so the trees are built in parallel)
rf2 = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
# Train on the training split while joblib runs its parallel work on the dask backend
with joblib.parallel_backend('dask'):
    rf2.fit(train_pollution, train_labels)

In [19]:
# Predict PM10 for the held-out test rows with the 300-tree forest
predictions = rf2.predict(test_pollution)
# Element-wise absolute difference between predicted and observed PM10
errors = np.abs(predictions - test_labels)
# Print the mean absolute error (MAE). The target is PM10, not a temperature, so
# the former 'degrees.' label was wrong (presumably carried over from the adapted
# tutorial). PM10 is taken to be a concentration in ug/m^3 -- confirm against the
# data source.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'ug/m^3.')

Mean Absolute Error: 13.93 degrees.


In [20]:
# Absolute error of each test row relative to its observed PM10 value.
# NOTE(review): this divides by test_labels, so it assumes no observed value is 0 -- confirm.
relative_errors = errors / test_labels
# Per-row absolute percentage error; its mean is the MAPE
mape = 100 * relative_errors
# "Accuracy" is reported as 100 minus the MAPE
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 68.68 %.


In [21]:
# Trying to improve the model again: same settings, but with 500 decision trees
# (n_jobs=-1 so the trees are built in parallel)
rf3 = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
# Train on the training split while joblib runs its parallel work on the dask backend
with joblib.parallel_backend('dask'):
    rf3.fit(train_pollution, train_labels)

In [22]:
# Predict PM10 for the held-out test rows with the 500-tree forest
predictions = rf3.predict(test_pollution)
# Element-wise absolute difference between predicted and observed PM10
errors = np.abs(predictions - test_labels)
# Print the mean absolute error (MAE). The target is PM10, not a temperature, so
# the former 'degrees.' label was wrong (presumably carried over from the adapted
# tutorial). PM10 is taken to be a concentration in ug/m^3 -- confirm against the
# data source.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'ug/m^3.')

Mean Absolute Error: 13.92 degrees.


In [23]:
# Absolute error of each test row relative to its observed PM10 value.
# NOTE(review): this divides by test_labels, so it assumes no observed value is 0 -- confirm.
relative_errors = errors / test_labels
# Per-row absolute percentage error; its mean is the MAPE
mape = 100 * relative_errors
# "Accuracy" is reported as 100 minus the MAPE
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 68.7 %.


In [28]:
# Root mean squared error of the most recently assigned `predictions`
# (those of rf3 when the cells run in notebook order) against the test labels
rmse = np.sqrt(mean_squared_error(test_labels, predictions))
print('Root Mean Squared Error:', rmse)

Root Mean Squared Error: 21.260099905652076


In [29]:
# Trying to improve the model again: same settings, but with 750 decision trees
# (n_jobs=-1 so the trees are built in parallel)
rf4 = RandomForestRegressor(n_estimators=750, random_state=42, n_jobs=-1)
# Train on the training split while joblib runs its parallel work on the dask backend
with joblib.parallel_backend('dask'):
    rf4.fit(train_pollution, train_labels)

In [30]:
# Predict PM10 for the held-out test rows with the 750-tree forest
predictions = rf4.predict(test_pollution)
# Element-wise absolute difference between predicted and observed PM10
errors = np.abs(predictions - test_labels)
# Print the mean absolute error (MAE). The target is PM10, not a temperature, so
# the former 'degrees.' label was wrong (presumably carried over from the adapted
# tutorial). PM10 is taken to be a concentration in ug/m^3 -- confirm against the
# data source.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'ug/m^3.')

Mean Absolute Error: 13.92 degrees.


In [31]:
# Absolute error of each test row relative to its observed PM10 value.
# NOTE(review): this divides by test_labels, so it assumes no observed value is 0 -- confirm.
relative_errors = errors / test_labels
# Per-row absolute percentage error; its mean is the MAPE
mape = 100 * relative_errors
# "Accuracy" is reported as 100 minus the MAPE
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 68.72 %.


In [32]:
# Root mean squared error of the most recently assigned `predictions`
# (those of rf4 when the cells run in notebook order) against the test labels
rmse = np.sqrt(mean_squared_error(test_labels, predictions))
print('Root Mean Squared Error:', rmse)

Root Mean Squared Error: 21.24529217429724


In [34]:
# Numerical feature importances reported by the 750-tree forest, one per predictor
importances = list(rf4.feature_importances_)
# Pair each predictor name with its importance. factors_list was captured from the
# predictor columns before they were converted to an array, so the order matches.
ranked_importances = sorted(
    zip(factors_list, importances),
    # Rank on the full-precision value so that predictors which round to the same
    # two-decimal figure are still ordered by their true importance.
    key=lambda pair: pair[1],
    reverse=True,
)
# List of (variable, importance) tuples, most important first, rounded for display only
feature_importances = [(name, round(score, 2)) for name, score in ranked_importances]
# Print one line per predictor; a plain loop is used because print() is a side effect,
# not a value worth collecting into a list
for name, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: PM2.5                Importance: 0.65
Variable: NO2                  Importance: 0.04
Variable: TEMP                 Importance: 0.04
Variable: PRES                 Importance: 0.04
Variable: DEWP                 Importance: 0.04
Variable: day                  Importance: 0.03
Variable: SO2                  Importance: 0.03
Variable: O3                   Importance: 0.03
Variable: WSPM                 Importance: 0.03
Variable: month                Importance: 0.02
Variable: hour                 Importance: 0.02
Variable: CO                   Importance: 0.02
Variable: year                 Importance: 0.01
Variable: RAIN                 Importance: 0.0


In [None]:
# Reference: code adapted from
# Koehrsen, W. (2017). 'Random Forest in Python'. Available at: https://towardsdatascience.com/random-forest-in-python-24d0893d51c0 [Accessed 7 April 2022].