In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [82]:
%run Preprocessing.py

In [83]:
data = pd.read_csv('austin_weather.csv')

In [84]:
X_train, X_test, y_train, y_test, X_val, y_val = preprocess_for_selection(data)

In [85]:
X = X_train
y = y_train

In [86]:
# we will make a multilabel feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# select the best 5 features
bestfeatures = SelectKBest(score_func=chi2, k=6)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(6,'Score'))  #print 5 best features

                     Specs       Score
14      VisibilityLowMiles  127.770400
18  PrecipitationSumInches  102.781157
8       HumidityLowPercent   28.799874
7       HumidityAvgPercent   16.494646
13      VisibilityAvgMiles   15.838999
15             WindHighMPH    6.061261


In [93]:
# sort the features based on the score
featureScores.nlargest(30,'Score')

Unnamed: 0,Specs,Score
14,VisibilityLowMiles,127.7704
18,PrecipitationSumInches,102.781157
8,HumidityLowPercent,28.799874
7,HumidityAvgPercent,16.494646
13,VisibilityAvgMiles,15.838999
15,WindHighMPH,6.061261
6,HumidityHighPercent,5.921861
17,WindGustMPH,3.904154
4,DewPointAvgF,3.727302
3,DewPointHighF,3.563962


In [74]:

from sklearn.linear_model   import   LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


In [75]:
model_gbc_rain = GradientBoostingClassifier(n_estimators=150, random_state=101)
model_gbc_fog = GradientBoostingClassifier(n_estimators=150, random_state=101)
logmodel_thunderstorm = LogisticRegression()

In [76]:
featureScores.nlargest(5,'Score')['Specs'].values

array(['VisibilityLowMiles', 'PrecipitationSumInches',
       'HumidityLowPercent', 'HumidityAvgPercent', 'VisibilityAvgMiles'],
      dtype=object)

In [77]:
# fit the model with the selected features
model_gbc_rain.fit(X_train[featureScores.nlargest(6,'Score')['Specs'].values], y_train['Rain'])
model_gbc_fog.fit(X_train[featureScores.nlargest(6,'Score')['Specs'].values], y_train['Fog'])
logmodel_thunderstorm.fit(X_train[featureScores.nlargest(6,'Score')['Specs'].values], y_train['Thunderstorm'])


In [78]:
# predict the validation set
y_pred_gbc_rain = model_gbc_rain.predict(X_val[featureScores.nlargest(6,'Score')['Specs'].values])
y_pred_gbc_fog = model_gbc_fog.predict(X_val[featureScores.nlargest(6,'Score')['Specs'].values])
y_pred_logmodel_thunderstorm = logmodel_thunderstorm.predict(X_val[featureScores.nlargest(6,'Score')['Specs'].values])

# evaluate the model
from sklearn.metrics import classification_report, confusion_matrix
print('Rain')
# print(confusion_matrix(y_val['Rain'],y_pred_gbc_rain))
print(classification_report(y_val['Rain'],y_pred_gbc_rain))


Rain
              precision    recall  f1-score   support

           0       0.93      0.98      0.95       140
           1       0.94      0.83      0.88        58

    accuracy                           0.93       198
   macro avg       0.94      0.90      0.92       198
weighted avg       0.93      0.93      0.93       198



In [79]:
print('Fog')
# print(confusion_matrix(y_val['Fog'],y_pred_gbc_fog))
print(classification_report(y_val['Fog'],y_pred_gbc_fog))

Fog
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       186
           1       1.00      1.00      1.00        12

    accuracy                           1.00       198
   macro avg       1.00      1.00      1.00       198
weighted avg       1.00      1.00      1.00       198



In [80]:
print('Thunderstorm')
# print(confusion_matrix(y_val['Thunderstorm'],y_pred_logmodel_thunderstorm))
print(classification_report(y_val['Thunderstorm'],y_pred_logmodel_thunderstorm))

Thunderstorm
              precision    recall  f1-score   support

           0       0.87      0.99      0.92       165
           1       0.80      0.24      0.37        33

    accuracy                           0.86       198
   macro avg       0.83      0.62      0.65       198
weighted avg       0.86      0.86      0.83       198

