In [1]:
import pandas as pd

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("CHLearnTarget.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,Day-Result,D-AQI,PM10 AQI,NO2 AQI,Ozone AQI,SO2 AQI,CO AQI,PM-WEIGHT,DMax NO2,NO2-WEIGHT,...,SO2-WEIGHT,DMax CO,CO-WEIGHT,Year,Month,Day,Site Name,Lat,Lng,Date
0,Good,44,6,12,44,0,1,1,13.6,24,...,24,0.1,24,2016,1,2,Cheyenne NCore,41.182227,-104.778334,1/2/2016
1,Good,38,8,34,38,0,2,1,36.0,22,...,21,0.2,24,2016,1,3,Cheyenne NCore,41.182227,-104.778334,1/3/2016
2,Good,37,13,37,32,1,2,1,39.5,24,...,24,0.2,24,2016,1,4,Cheyenne NCore,41.182227,-104.778334,1/4/2016
3,Good,37,7,10,37,0,2,1,11.3,23,...,22,0.2,24,2016,1,5,Cheyenne NCore,41.182227,-104.778334,1/5/2016
4,Good,36,7,24,36,0,2,1,25.4,24,...,24,0.2,24,2016,1,6,Cheyenne NCore,41.182227,-104.778334,1/6/2016


In [3]:
print(df["Day-Result"].unique())

['Good' 'Moderate Pollution']


# Select your features (columns)

In [5]:
# Set X equal to the entire data set, except for the first column
X = df.iloc[:,1:19]
print(X)
# Set y equal to the first column
y = df.iloc[:,0]
print(y)

     D-AQI  PM10 AQI  NO2 AQI  Ozone AQI  SO2 AQI  CO AQI  PM-WEIGHT  \
0       44         6       12         44        0       1          1   
1       38         8       34         38        0       2          1   
2       37        13       37         32        1       2          1   
3       37         7       10         37        0       2          1   
4       36         7       24         36        0       2          1   
..     ...       ...      ...        ...      ...     ...        ...   
843     38         2       10         38        0       1          1   
844     39         3        4         39        0       1          1   
845     34         1        2         34        0       1          1   
846     29         4        9         29        0       1          1   
847     41         4       14         41        0       1          1   

     DMax NO2  NO2-WEIGHT  DMax Ozone  OZONE-WEIGHT  DMax SO2  SO2-WEIGHT  \
0        13.6          24       0.047            17      -

In [6]:
# Search for top 10 features according to feature importances
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=250,learning_rate=1.0,max_depth=1,random_state=0)
model.fit(X,y)
model.feature_importances_

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [8]:
# Store the top 10 features as a series, using the column headers as the index
feat_imp = pd.Series(model.feature_importances_, index=X.columns).nlargest(19)
feat_imp

D-AQI           1.0
PM10 AQI        0.0
Month           0.0
Year            0.0
CO-WEIGHT       0.0
DMax CO         0.0
SO2-WEIGHT      0.0
DMax SO2        0.0
OZONE-WEIGHT    0.0
DMax Ozone      0.0
NO2-WEIGHT      0.0
DMax NO2        0.0
PM-WEIGHT       0.0
CO AQI          0.0
SO2 AQI         0.0
Ozone AQI       0.0
NO2 AQI         0.0
Day             0.0
dtype: float64

In [10]:
# Set features based on feature importances
X = df[feat_imp.index]

# Use koi_disposition for y values
y = df['Day-Result']

# Create a Train Test Split

Use Day-Result for the y values

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [12]:
X_train.head()

Unnamed: 0,D-AQI,PM10 AQI,Month,Year,CO-WEIGHT,DMax CO,SO2-WEIGHT,DMax SO2,OZONE-WEIGHT,DMax Ozone,NO2-WEIGHT,DMax NO2,PM-WEIGHT,CO AQI,SO2 AQI,Ozone AQI,NO2 AQI,Day
344,48,11,4,2018,24,0.1,23,0.5,17,0.052,23,2.5,1,1,0,48,2,30
99,43,2,4,2016,19,0.1,22,0.0,17,0.046,23,10.0,1,1,0,43,9,20
32,42,5,2,2016,24,0.1,24,0.1,17,0.045,24,4.9,1,1,0,42,4,3
228,39,2,12,2016,24,0.1,22,0.7,17,0.042,22,6.0,1,1,0,39,6,11
182,42,13,10,2016,24,0.1,24,0.7,17,0.045,24,22.2,1,1,0,42,21,18


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Create scaler object
X_scaler = MinMaxScaler().fit(X_train)

# Scale training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [14]:
from sklearn.linear_model import LogisticRegression

# Initialize model
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled, y_train)

LogisticRegression()

In [15]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.9622641509433962
Testing Data Score: 0.9575471698113207


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [16]:
predictions = model.predict(X_test_scaled)
df_pred = pd.DataFrame({"Actual":y_test, "Predicted":predictions}) 
df_pred.head()

Unnamed: 0,Actual,Predicted
444,Good,Good
820,Good,Good
125,Good,Good
655,Good,Good
276,Good,Good


In [17]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
import numpy as np

# Create the GridSearchCV model
param_grid = {'C':np.logspace(-4, 4, 20),
             'penalty':['l1','l2']}

grid = GridSearchCV(model, param_grid, verbose=3)

In [18]:
# Train the model with GridSearch
# Train the model with GridSearch
best_model = grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END ..............C=0.0001, penalty=l2;, score=0.891 total time=   0.0s
[CV 2/5] END ..............C=0.0001, penalty=l2;, score=0.898 total time=   0.0s
[CV 3/5] END ..............C=0.0001, penalty=l2;, score=0.890 total time=   0.0s
[CV 4/5] END ..............C=0.0001, penalty=l2;, score=0.890 total time=   0.0s
[CV 5/5] END ..............C=0.0001, penalty=l2;, score=0.890 total time=   0.0s
[CV 1/5] END C=0.00026366508987303583, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END C=0.00026366508987303583, penalty=

[CV 4/5] END ...C=1.623776739188721, penalty=l2;, score=0.969 total time=   0.0s
[CV 5/5] END ...C=1.623776739188721, penalty=l2;, score=0.961 total time=   0.0s
[CV 1/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END ...C=4.281332398719396, penalty=l2;, score=0.977 total time=   0.0s
[CV 2/5] END ...C=4.281332398719396, penalty=l2;, score=0.969 total time=   0.0s
[CV 3/5] END ...C=4.281332398719396, penalty=l2;, score=0.984 total time=   0.0s
[CV 4/5] END ...C=4.281332398719396, penalty=l2;, score=0.984 total time=   0.0s
[CV 5/5] END ...C=4.281332398719396, penalty=l2;, score=0.961 total time=   0.0s
[CV 1/5] END ....C=11.288378

In [19]:
print(best_model.best_params_)
print('Best Score: ', best_model.best_score_)

{'C': 3792.690190732246, 'penalty': 'l2'}
Best Score:  0.9905757874015748


In [20]:
# Make predictions
grid_predictions = best_model.predict(X_test_scaled)
df_grid = pd.DataFrame({"Actual":y_test, "Predicted":grid_predictions}) 
df_grid.head()

Unnamed: 0,Actual,Predicted
444,Good,Good
820,Good,Good
125,Good,Good
655,Good,Good
276,Good,Good


In [21]:
# Score the model
best_model.score(X_test_scaled, y_test)

0.9669811320754716

# Save the Model

In [22]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'LR_GradientBoostCHTarget.sav'
joblib.dump(best_model, filename)

['LR_GradientBoostCHTarget.sav']

In [23]:
# Print Classification Report.
from sklearn.metrics import classification_report
print(classification_report(y_test, grid_predictions))

                    precision    recall  f1-score   support

              Good       0.98      0.98      0.98       189
Moderate Pollution       0.83      0.87      0.85        23

          accuracy                           0.97       212
         macro avg       0.91      0.92      0.92       212
      weighted avg       0.97      0.97      0.97       212

