In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the CSV, then clean it in two passes: first discard columns in which
# every value is null, then discard any row that still contains a null.
df = (
    pd.read_csv("CHLearnTarget.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)
df.head()

Unnamed: 0,Day-Result,D-AQI,PM10 AQI,NO2 AQI,Ozone AQI,SO2 AQI,CO AQI,PM-WEIGHT,DMax NO2,NO2-WEIGHT,...,SO2-WEIGHT,DMax CO,CO-WEIGHT,Year,Month,Day,Site Name,Lat,Lng,Date
0,Good,44,6,12,44,0,1,1,13.6,24,...,24,0.1,24,2016,1,2,Cheyenne NCore,41.182227,-104.778334,1/2/2016
1,Good,38,8,34,38,0,2,1,36.0,22,...,21,0.2,24,2016,1,3,Cheyenne NCore,41.182227,-104.778334,1/3/2016
2,Good,37,13,37,32,1,2,1,39.5,24,...,24,0.2,24,2016,1,4,Cheyenne NCore,41.182227,-104.778334,1/4/2016
3,Good,37,7,10,37,0,2,1,11.3,23,...,22,0.2,24,2016,1,5,Cheyenne NCore,41.182227,-104.778334,1/5/2016
4,Good,36,7,24,36,0,2,1,25.4,24,...,24,0.2,24,2016,1,6,Cheyenne NCore,41.182227,-104.778334,1/6/2016


# Select your features (columns)

In [9]:
# The label is the "Day-Result" column. The feature matrix is a positional
# slice (columns 1 through 17) of what remains once the label is removed.
target = df["Day-Result"]
data = df.drop(columns="Day-Result").iloc[:, 1:18]
feature_names = data.columns
data.head()

Unnamed: 0,PM10 AQI,NO2 AQI,Ozone AQI,SO2 AQI,CO AQI,PM-WEIGHT,DMax NO2,NO2-WEIGHT,DMax Ozone,OZONE-WEIGHT,DMax SO2,SO2-WEIGHT,DMax CO,CO-WEIGHT,Year,Month,Day
0,6,12,44,0,1,1,13.6,24,0.047,17,-0.2,24,0.1,24,2016,1,2
1,8,34,38,0,2,1,36.0,22,0.041,17,0.5,21,0.2,24,2016,1,3
2,13,37,32,1,2,1,39.5,24,0.035,17,1.2,24,0.2,24,2016,1,4
3,7,10,37,0,2,1,11.3,23,0.04,17,0.1,22,0.2,24,2016,1,5
4,7,24,36,0,2,1,25.4,24,0.039,17,0.3,24,0.2,24,2016,1,6


# Create a Train Test Split

Use `Day-Result` for the y values

In [10]:
from sklearn.model_selection import train_test_split

# Partition features and labels into train/test subsets; the fixed
# random_state keeps the shuffle repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=42,
)

In [11]:
# Preview the first rows of the training features produced by the split.
X_train.head()

Unnamed: 0,PM10 AQI,NO2 AQI,Ozone AQI,SO2 AQI,CO AQI,PM-WEIGHT,DMax NO2,NO2-WEIGHT,DMax Ozone,OZONE-WEIGHT,DMax SO2,SO2-WEIGHT,DMax CO,CO-WEIGHT,Year,Month,Day
579,4,3,36,0,1,1,3.4,21,0.039,15,-0.1,21,0.1,18,2020,1,28
60,7,16,41,0,1,1,17.2,24,0.044,17,0.0,24,0.1,24,2016,3,4
199,19,22,40,0,2,1,23.3,23,0.043,17,0.4,23,0.2,24,2016,11,8
606,12,26,37,0,2,1,28.2,23,0.04,17,-0.1,23,0.2,24,2020,3,6
630,7,8,41,0,2,1,8.2,24,0.044,17,0.0,24,0.2,24,2020,4,6


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test split.
X_minmax = MinMaxScaler()
X_minmax.fit(X_train)

X_train_minmax, X_test_minmax = (
    X_minmax.transform(split) for split in (X_train, X_test)
)

# Train the Model



In [13]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the scores and feature importances computed from it,
# and the grid search that reuses this estimator, are repeatable across
# "Restart Kernel -> Run All". Without a seed every run builds different trees.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_minmax, y_train)

RandomForestClassifier()

In [14]:
# Score the fitted forest on each scaled split, then report both numbers.
train_score = rf.score(X_train_minmax, y_train)
test_score = rf.score(X_test_minmax, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 1.0
Testing Data Score: 0.9952830188679245


In [15]:
# Pair each importance with its column name and list them largest first.
importance_pairs = zip(rf.feature_importances_, feature_names)
sorted(importance_pairs, reverse=True)

[(0.43598324185031856, 'Ozone AQI'),
 (0.3981341305382815, 'DMax Ozone'),
 (0.06348256003198025, 'PM10 AQI'),
 (0.017997745342131766, 'Month'),
 (0.014943263164535986, 'DMax CO'),
 (0.013571497272016618, 'DMax SO2'),
 (0.011456789391611264, 'CO AQI'),
 (0.009313879789031466, 'DMax NO2'),
 (0.00889804961037453, 'Year'),
 (0.008072812208687307, 'SO2 AQI'),
 (0.0046720921998622275, 'Day'),
 (0.004111298093649471, 'NO2 AQI'),
 (0.0032531640897663083, 'SO2-WEIGHT'),
 (0.0027711630663790343, 'NO2-WEIGHT'),
 (0.0017103129987588255, 'OZONE-WEIGHT'),
 (0.0016280003526149096, 'CO-WEIGHT'),
 (0.0, 'PM-WEIGHT')]

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [16]:
from sklearn.model_selection import GridSearchCV

# Search space: four tree counts crossed with four depth limits (16 candidates).
param_grid = dict(
    n_estimators=[250, 300, 350, 400],
    max_depth=[125, 150, 175, 200],
)
# Wrap the forest in a grid search; verbose=3 prints a line per fold.
grid = GridSearchCV(estimator=rf, param_grid=param_grid, verbose=3)

In [17]:
# Train the model with GridSearch: every parameter combination in the grid is
# cross-validated on the scaled training split (the verbose log below lists
# each fold's score and fit time).
grid.fit(X_train_minmax, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...max_depth=125, n_estimators=250;, score=0.984 total time=   0.2s
[CV 2/5] END ...max_depth=125, n_estimators=250;, score=1.000 total time=   0.2s
[CV 3/5] END ...max_depth=125, n_estimators=250;, score=1.000 total time=   0.2s
[CV 4/5] END ...max_depth=125, n_estimators=250;, score=0.992 total time=   0.2s
[CV 5/5] END ...max_depth=125, n_estimators=250;, score=1.000 total time=   0.2s
[CV 1/5] END ...max_depth=125, n_estimators=300;, score=0.984 total time=   0.3s
[CV 2/5] END ...max_depth=125, n_estimators=300;, score=1.000 total time=   0.3s
[CV 3/5] END ...max_depth=125, n_estimators=300;, score=1.000 total time=   0.2s
[CV 4/5] END ...max_depth=125, n_estimators=300;, score=0.992 total time=   0.2s
[CV 5/5] END ...max_depth=125, n_estimators=300;, score=1.000 total time=   0.3s
[CV 1/5] END ...max_depth=125, n_estimators=350;, score=0.984 total time=   0.3s
[CV 2/5] END ...max_depth=125, n_estimators=350;

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [125, 150, 175, 200],
                         'n_estimators': [250, 300, 350, 400]},
             verbose=3)

In [18]:
# Show the winning parameter combination and its cross-validated score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

{'max_depth': 125, 'n_estimators': 250}
0.9953001968503937


In [19]:
# Training score: the fitted grid search scored on the scaled training split.
grid.score(X_train_minmax, y_train)

1.0

In [20]:
# Testing score: the fitted grid search scored on the held-out scaled test split.
grid.score(X_test_minmax, y_test)

0.9952830188679245

In [21]:
# Predict labels for the scaled test split and keep them in a variable so the
# classification report can compare them against y_test.
predictions = grid.predict(X_test_minmax)

In [22]:
from sklearn.metrics import classification_report

# Build the per-class precision/recall/F1 summary for the test predictions,
# then print it.
report = classification_report(y_test, predictions)
print(report)

                    precision    recall  f1-score   support

              Good       1.00      0.99      1.00       189
Moderate Pollution       0.96      1.00      0.98        23

          accuracy                           1.00       212
         macro avg       0.98      1.00      0.99       212
      weighted avg       1.00      1.00      1.00       212



# Save the Model

In [23]:
# Persist the tuned model. GridSearchCV fits clones of `rf`, so `rf` itself is
# still the untuned default forest; `grid.best_estimator_` is the refit model
# that produced the grid scores, predictions and classification report.
# NOTE: the model was trained on MinMax-scaled features, so new inputs must be
# transformed with the fitted `X_minmax` scaler before calling predict().
# (Assignment note: be sure to turn the saved file in to BCS.)
# If joblib fails to import, install it from the terminal / git-bash.
import joblib
filename = 'FinalRF_CHTarget.sav'
joblib.dump(grid.best_estimator_, filename)

['FinalRF_CHTarget.sav']