In [2]:
# Import dependencies
import pandas as pd

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [3]:
# Load the CSV and clean it in a single chain
df = (
    pd.read_csv("CHLearnTarget.csv")
    .dropna(axis='columns', how='all')  # discard columns that contain no data at all
    .dropna()                           # then discard rows with any missing value
)
df.head()

Unnamed: 0,Day-Result,D-AQI,PM10 AQI,NO2 AQI,Ozone AQI,SO2 AQI,CO AQI,PM-WEIGHT,DMax NO2,NO2-WEIGHT,...,SO2-WEIGHT,DMax CO,CO-WEIGHT,Year,Month,Day,Site Name,Lat,Lng,Date
0,Good,44,6,12,44,0,1,1,13.6,24,...,24,0.1,24,2016,1,2,Cheyenne NCore,41.182227,-104.778334,1/2/2016
1,Good,38,8,34,38,0,2,1,36.0,22,...,21,0.2,24,2016,1,3,Cheyenne NCore,41.182227,-104.778334,1/3/2016
2,Good,37,13,37,32,1,2,1,39.5,24,...,24,0.2,24,2016,1,4,Cheyenne NCore,41.182227,-104.778334,1/4/2016
3,Good,37,7,10,37,0,2,1,11.3,23,...,22,0.2,24,2016,1,5,Cheyenne NCore,41.182227,-104.778334,1/5/2016
4,Good,36,7,24,36,0,2,1,25.4,24,...,24,0.2,24,2016,1,6,Cheyenne NCore,41.182227,-104.778334,1/6/2016


In [4]:
# Show the distinct class labels found in the target column
labels = df["Day-Result"].unique()
print(labels)

['Good' 'Moderate Pollution']


# Select your features (columns)

In [6]:
# Split the frame by column position: position 0 holds the label and
# positions 1 through 18 hold the candidate features. Columns from
# position 19 onward are not used as features.
label_position, feature_positions = 0, slice(1, 19)
X = df.iloc[:, feature_positions]
print(X)
# The label column becomes the target
y = df.iloc[:, label_position]

     D-AQI  PM10 AQI  NO2 AQI  Ozone AQI  SO2 AQI  CO AQI  PM-WEIGHT  \
0       44         6       12         44        0       1          1   
1       38         8       34         38        0       2          1   
2       37        13       37         32        1       2          1   
3       37         7       10         37        0       2          1   
4       36         7       24         36        0       2          1   
..     ...       ...      ...        ...      ...     ...        ...   
843     38         2       10         38        0       1          1   
844     39         3        4         39        0       1          1   
845     34         1        2         34        0       1          1   
846     29         4        9         29        0       1          1   
847     41         4       14         41        0       1          1   

     DMax NO2  NO2-WEIGHT  DMax Ozone  OZONE-WEIGHT  DMax SO2  SO2-WEIGHT  \
0        13.6          24       0.047            17      -

In [7]:
# Rank the candidate features by fitting a gradient boosting classifier
# (250 depth-1 trees, learning rate 1.0, fixed seed) and reading its importances.
# NOTE(review): the recorded output gives D-AQI an importance of 1.0 and every
# other feature 0.0 -- presumably Day-Result is derived directly from D-AQI;
# confirm whether D-AQI should be allowed as a feature.
from sklearn.ensemble import GradientBoostingClassifier

boosting_params = dict(n_estimators=250, learning_rate=1.0, max_depth=1, random_state=0)
model = GradientBoostingClassifier(**boosting_params).fit(X, y)
model.feature_importances_

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [8]:
# Label each importance with its column name, then keep the ten largest.
# NOTE(review): in the recorded output nine of the ten kept entries have an
# importance of 0.0, so beyond D-AQI the selection carries no ranking signal.
importances = pd.Series(model.feature_importances_, index=X.columns)
feat_imp = importances.nlargest(10)
feat_imp

D-AQI         1.0
PM10 AQI      0.0
NO2 AQI       0.0
Ozone AQI     0.0
SO2 AQI       0.0
CO AQI        0.0
PM-WEIGHT     0.0
DMax NO2      0.0
NO2-WEIGHT    0.0
DMax Ozone    0.0
dtype: float64

In [10]:
# Features: only the columns kept by the importance ranking
X = df.loc[:, feat_imp.index]

# Target: the Day-Result label column
y = df.loc[:, 'Day-Result']

# Create a Train Test Split

Use `Day-Result` for the y values

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, and the recorded test report shows
# imbalanced classes (247 vs 33) -- consider passing stratify=y.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Preview the first five training rows
X_train.head(n=5)

Unnamed: 0,D-AQI,PM10 AQI,NO2 AQI,Ozone AQI,SO2 AQI,CO AQI,PM-WEIGHT,DMax NO2,NO2-WEIGHT,DMax Ozone
286,41,4,2,41,0,1,1,2.8,23,0.044
311,38,4,2,38,0,2,1,2.8,23,0.041
544,39,7,9,39,0,2,1,10.7,21,0.042
24,40,5,8,40,0,1,1,9.0,24,0.043
746,57,57,8,48,3,1,1,9.9,24,0.052


# Pre-processing

Scale the features with `MinMaxScaler`, fitting the scaler on the training split only. (Feature selection was already done above.)

In [13]:
# Min-max scale the features
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training split only; the test split is then
# transformed with those same fitted parameters.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [14]:
from sklearn.svm import SVC

# Baseline support vector classifier with a polynomial kernel.
# This rebinds `model`, which previously held the gradient boosting classifier.
svc_settings = {'kernel': 'poly'}
model = SVC(**svc_settings)

# Fit on the scaled training split; the fitted estimator is the cell's output
model.fit(X_train_scaled, y_train)

SVC(kernel='poly')

In [15]:
# Score the baseline SVC on both splits
train_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.9911971830985915
Testing Data Score: 0.9821428571428571


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [16]:
# Set up the hyperparameter search
from sklearn.model_selection import GridSearchCV

# Two sub-grids: the polynomial kernel varies C only, while the RBF kernel
# varies both C and gamma (5 + 4 * 4 = 21 candidates in total).
poly_grid = {'C': [1, 5, 10, 50, 100], 'kernel': ['poly']}
rbf_grid = {'C': [1, 5, 10, 50], 'kernel': ['rbf'], 'gamma': [0.0001, 0.0005, 0.001, 0.005]}
param_grid = [poly_grid, rbf_grid]

# verbose=3 logs the score of every fold as it finishes
grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=3)

In [17]:
# Run the grid search on the scaled training split.
# `best_model` is bound to the fitted search object returned by fit.
best_model = grid.fit(X=X_train_scaled, y=y_train)


Fitting 5 folds for each of 21 candidates, totalling 105 fits
[CV 1/5] END ..................C=1, kernel=poly;, score=0.982 total time=   0.0s
[CV 2/5] END ..................C=1, kernel=poly;, score=0.991 total time=   0.0s
[CV 3/5] END ..................C=1, kernel=poly;, score=0.991 total time=   0.0s
[CV 4/5] END ..................C=1, kernel=poly;, score=0.991 total time=   0.0s
[CV 5/5] END ..................C=1, kernel=poly;, score=0.982 total time=   0.0s
[CV 1/5] END ..................C=5, kernel=poly;, score=0.982 total time=   0.0s
[CV 2/5] END ..................C=5, kernel=poly;, score=0.991 total time=   0.0s
[CV 3/5] END ..................C=5, kernel=poly;, score=0.991 total time=   0.0s
[CV 4/5] END ..................C=5, kernel=poly;, score=0.991 total time=   0.0s
[CV 5/5] END ..................C=5, kernel=poly;, score=0.982 total time=   0.0s
[CV 1/5] END .................C=10, kernel=poly;, score=0.982 total time=   0.0s
[CV 2/5] END .................C=10, kernel=poly

In [18]:
# Report the winning parameter combination and its score from the search
best_params, best_cv_score = best_model.best_params_, best_model.best_score_
print(best_params)
print("Best Score: ", best_cv_score)

{'C': 100, 'kernel': 'poly'}
Best Score:  0.9947213165657507


In [19]:
# Predict on the scaled test split and line the predictions up with the true labels
grid_predictions = best_model.predict(X_test_scaled)
comparison = {"Actual": y_test, "Predicted": grid_predictions}
df_grid = pd.DataFrame(comparison)
df_grid.head()

Unnamed: 0,Actual,Predicted
664,Good,Good
363,Good,Good
110,Good,Good
678,Good,Good
39,Good,Good


In [20]:
# Score the tuned model on the held-out test split
test_accuracy = best_model.score(X_test_scaled, y_test)
test_accuracy

0.9892857142857143

# Save the Model

In [21]:
# Persist the fitted grid search together with the fitted scaler.
# The search was trained on MinMax-scaled features, so the saved model is only
# usable on new data if the same fitted scaler is applied first. Previously the
# scaler was never written to disk.
import joblib

# Write the scaler first so that the model dump stays the cell's last expression
scaler_filename = 'SVCTwoKernelsNewFeaturesCHTarget_scaler.sav'
joblib.dump(X_scaler, scaler_filename)

# Same file name and contents as before: the fitted grid search object
filename = 'SVCTwoKernelsNewFeaturesCHTarget.sav'
joblib.dump(best_model, filename)

['SVCTwoKernelsNewFeaturesCHTarget.sav']

In [22]:
# Per-class precision, recall and F1 for the tuned model on the test split
from sklearn.metrics import classification_report

report = classification_report(y_test, grid_predictions)
print(report)

                    precision    recall  f1-score   support

              Good       0.99      1.00      0.99       247
Moderate Pollution       1.00      0.91      0.95        33

          accuracy                           0.99       280
         macro avg       0.99      0.95      0.97       280
      weighted avg       0.99      0.99      0.99       280

