In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [2]:
%run Preprocessing.py

In [3]:
df = pd.read_csv("austin_weather.csv")

In [4]:
X_train, X_test, y_train, y_test, X_val, y_val = preprocess(df)

In [5]:
# We will train for Rain first
y_train_rain = y_train["Rain"]
y_val_rain = y_val["Rain"]

In [6]:
# Lets try Logistic Regression, KNN, Random Forest, and SVM, and see which one performs the best
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

logmodel_rain = LogisticRegression()
knnmodel_rain = KNeighborsClassifier()
rfmodel_rain = RandomForestClassifier()
svmmodel_rain = SVC()

logmodel_rain.fit(X_train, y_train_rain)
knnmodel_rain.fit(X_train, y_train_rain)
rfmodel_rain.fit(X_train, y_train_rain)
svmmodel_rain.fit(X_train, y_train_rain)

In [7]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

def get_metrics(model, X, y):
    y_pred = model.predict(X)
    from sklearn.metrics import classification_report
    class_rep = classification_report(y, model.predict(X))
    class_rep = class_rep.split()
    weighted_avg = class_rep[-5:]

    from sklearn.metrics import accuracy_score
    acc = accuracy_score(y, model.predict(X_val))
    return acc,weighted_avg[1], weighted_avg[2], weighted_avg[3]

In [8]:
rain_logmodel_metrics = get_metrics(logmodel_rain, X_val, y_val_rain)
rain_knnmodel_metrics = get_metrics(knnmodel_rain, X_val, y_val_rain)
rain_rfmodel_metrics = get_metrics(rfmodel_rain, X_val, y_val_rain)
rain_svmmodel_metrics = get_metrics(svmmodel_rain, X_val, y_val_rain)

# Create a dataframe to store the metrics
rain_metrics = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1"])

# Add the metrics to the dataframe

rain_metrics = rain_metrics.append({"Model": "Logistic Regression", "Accuracy": rain_logmodel_metrics[0], "Precision": rain_logmodel_metrics[1], "Recall": rain_logmodel_metrics[2], "F1": rain_logmodel_metrics[3]}, ignore_index=True)
rain_metrics = rain_metrics.append({"Model": "KNN", "Accuracy": rain_knnmodel_metrics[0], "Precision": rain_knnmodel_metrics[1], "Recall": rain_knnmodel_metrics[2], "F1": rain_knnmodel_metrics[3]}, ignore_index=True)
rain_metrics = rain_metrics.append({"Model": "Random Forest", "Accuracy": rain_rfmodel_metrics[0], "Precision": rain_rfmodel_metrics[1], "Recall": rain_rfmodel_metrics[2], "F1": rain_rfmodel_metrics[3]}, ignore_index=True)
rain_metrics = rain_metrics.append({"Model": "SVM", "Accuracy": rain_svmmodel_metrics[0], "Precision": rain_svmmodel_metrics[1], "Recall": rain_svmmodel_metrics[2], "F1": rain_svmmodel_metrics[3]}, ignore_index=True)

rain_metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.883838,0.89,0.88,0.88
1,KNN,0.848485,0.85,0.85,0.84
2,Random Forest,0.944444,0.95,0.94,0.94
3,SVM,0.883838,0.89,0.88,0.88


In [9]:
y_train_fog = y_train["Fog"]
y_val_fog = y_val["Fog"]

logmodel_fog = LogisticRegression()
knnmodel_fog = KNeighborsClassifier()
rfmodel_fog = RandomForestClassifier()
svmmodel_fog = SVC()

logmodel_fog.fit(X_train, y_train_fog)
knnmodel_fog.fit(X_train, y_train_fog)
rfmodel_fog.fit(X_train, y_train_fog)
svmmodel_fog.fit(X_train, y_train_fog)

In [10]:
fog_logmodel_metrics = get_metrics(logmodel_fog, X_val, y_val_fog)
fog_knnmodel_metrics = get_metrics(knnmodel_fog, X_val, y_val_fog)
fog_rfmodel_metrics = get_metrics(rfmodel_fog, X_val, y_val_fog)
fog_svmmodel_metrics = get_metrics(svmmodel_fog, X_val, y_val_fog)

fog_metrics = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1"])

fog_metrics = fog_metrics.append({"Model": "Logistic Regression", "Accuracy": fog_logmodel_metrics[0], "Precision": fog_logmodel_metrics[1], "Recall": fog_logmodel_metrics[2], "F1": fog_logmodel_metrics[3]}, ignore_index=True)
fog_metrics = fog_metrics.append({"Model": "KNN", "Accuracy": fog_knnmodel_metrics[0], "Precision": fog_knnmodel_metrics[1], "Recall": fog_knnmodel_metrics[2], "F1": fog_knnmodel_metrics[3]}, ignore_index=True)
fog_metrics = fog_metrics.append({"Model": "Random Forest", "Accuracy": fog_rfmodel_metrics[0], "Precision": fog_rfmodel_metrics[1], "Recall": fog_rfmodel_metrics[2], "F1": fog_rfmodel_metrics[3]}, ignore_index=True)
fog_metrics = fog_metrics.append({"Model": "SVM", "Accuracy": fog_svmmodel_metrics[0], "Precision": fog_svmmodel_metrics[1], "Recall": fog_svmmodel_metrics[2], "F1": fog_svmmodel_metrics[3]}, ignore_index=True)

fog_metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.989899,0.99,0.99,0.99
1,KNN,0.949495,0.94,0.95,0.94
2,Random Forest,0.994949,0.99,0.99,0.99
3,SVM,0.939394,0.88,0.94,0.91


In [11]:
y_train_thunderstorm = y_train["Thunderstorm"]
y_val_thunderstorm = y_val["Thunderstorm"]

logmodel_rain_thunderstorm = LogisticRegression()
knnmodel_thunderstorm = KNeighborsClassifier()
rfmodel_thunderstorm = RandomForestClassifier()
svmmodel_thunderstorm = SVC()

logmodel_rain_thunderstorm.fit(X_train, y_train_thunderstorm)
knnmodel_thunderstorm.fit(X_train, y_train_thunderstorm)
rfmodel_thunderstorm.fit(X_train, y_train_thunderstorm)
svmmodel_thunderstorm.fit(X_train, y_train_thunderstorm)

In [12]:

thunderstorm_logmodel_metrics = get_metrics(logmodel_rain_thunderstorm, X_val, y_val_thunderstorm)
thunderstorm_knnmodel_metrics = get_metrics(knnmodel_thunderstorm, X_val, y_val_thunderstorm)
thunderstorm_rfmodel_metrics = get_metrics(rfmodel_thunderstorm, X_val, y_val_thunderstorm)
thunderstorm_svmmodel_metrics = get_metrics(svmmodel_thunderstorm, X_val, y_val_thunderstorm)


thunderstorm_metrics = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1"])

thunderstorm_metrics = thunderstorm_metrics.append({"Model": "Logistic Regression", "Accuracy": thunderstorm_logmodel_metrics[0], "Precision": thunderstorm_logmodel_metrics[1], "Recall": thunderstorm_logmodel_metrics[2], "F1": thunderstorm_logmodel_metrics[3]}, ignore_index=True)
thunderstorm_metrics = thunderstorm_metrics.append({"Model": "KNN", "Accuracy": thunderstorm_knnmodel_metrics[0], "Precision": thunderstorm_knnmodel_metrics[1], "Recall": thunderstorm_knnmodel_metrics[2], "F1": thunderstorm_knnmodel_metrics[3]}, ignore_index=True)
thunderstorm_metrics = thunderstorm_metrics.append({"Model": "Random Forest", "Accuracy": thunderstorm_rfmodel_metrics[0], "Precision": thunderstorm_rfmodel_metrics[1], "Recall": thunderstorm_rfmodel_metrics[2], "F1": thunderstorm_rfmodel_metrics[3]}, ignore_index=True)
thunderstorm_metrics = thunderstorm_metrics.append({"Model": "SVM", "Accuracy": thunderstorm_svmmodel_metrics[0], "Precision": thunderstorm_svmmodel_metrics[1], "Recall": thunderstorm_svmmodel_metrics[2], "F1": thunderstorm_svmmodel_metrics[3]}, ignore_index=True)

thunderstorm_metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.90404,0.9,0.9,0.89
1,KNN,0.888889,0.88,0.89,0.88
2,Random Forest,0.893939,0.89,0.89,0.88
3,SVM,0.888889,0.89,0.89,0.87


In [13]:
#import xgcb and boosting models
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


In [14]:
model_xgbc_rain = XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=101)
model_xgbc_rain.fit(X_train, y_train[["Rain"]])
rain_model_xgbc_metrics = get_metrics(model_xgbc_rain, X_val, y_val[["Rain"]])


model_ada_rain = AdaBoostClassifier(n_estimators=150, random_state=101)
model_ada_rain.fit(X_train, y_train[["Rain"]])
rain_model_ada_metrics = get_metrics(model_ada_rain, X_val, y_val[["Rain"]])


model_gbc_rain = GradientBoostingClassifier(n_estimators=150, random_state=101)
model_gbc_rain.fit(X_train, y_train[["Rain"]])
rain_model_gbc_metrics = get_metrics(model_gbc_rain, X_val, y_val[["Rain"]])

In [15]:
rain_metrics = rain_metrics.append({"Model": "GradientBoostingClassifier", "Accuracy": rain_model_gbc_metrics[0], "Precision": rain_model_gbc_metrics[1], "Recall": rain_model_gbc_metrics[2], "F1": rain_model_gbc_metrics[3]}, ignore_index=True)
rain_metrics = rain_metrics.append({"Model": "AdaBoostClassifier", "Accuracy": rain_model_ada_metrics[0], "Precision": rain_model_ada_metrics[1], "Recall": rain_model_ada_metrics[2], "F1": rain_model_ada_metrics[3]}, ignore_index=True)
rain_metrics = rain_metrics.append({"Model": "XGBClassifier", "Accuracy": rain_model_xgbc_metrics[0], "Precision": rain_model_xgbc_metrics[1], "Recall": rain_model_xgbc_metrics[2], "F1": rain_model_xgbc_metrics[3]}, ignore_index=True)
rain_metrics.sort_values(by="F1", ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
4,GradientBoostingClassifier,0.959596,0.96,0.96,0.96
6,XGBClassifier,0.954545,0.96,0.95,0.95
2,Random Forest,0.944444,0.95,0.94,0.94
5,AdaBoostClassifier,0.944444,0.95,0.94,0.94
0,Logistic Regression,0.883838,0.89,0.88,0.88
3,SVM,0.883838,0.89,0.88,0.88
1,KNN,0.848485,0.85,0.85,0.84


In [16]:
model_xgbc_fog = XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=101)
model_xgbc_fog.fit(X_train, y_train[["Fog"]])
fog_model_xgbc_metrics = get_metrics(model_xgbc_fog, X_val, y_val[["Fog"]])


model_ada_fog = AdaBoostClassifier(n_estimators=150, random_state=101)
model_ada_fog.fit(X_train, y_train[["Fog"]])
fog_model_ada_metrics = get_metrics(model_ada_fog, X_val, y_val[["Fog"]])


model_gbc_fog = GradientBoostingClassifier(n_estimators=150, random_state=101)
model_gbc_fog.fit(X_train, y_train[["Fog"]])
fog_model_gbc_metrics = get_metrics(model_gbc_fog, X_val, y_val[["Fog"]])

In [17]:
fog_metrics = fog_metrics.append({"Model": "GradientBoostingClassifier", "Accuracy": fog_model_gbc_metrics[0], "Precision": fog_model_gbc_metrics[1], "Recall": fog_model_gbc_metrics[2], "F1": fog_model_gbc_metrics[3]}, ignore_index=True)
fog_metrics = fog_metrics.append({"Model": "AdaBoostClassifier", "Accuracy": fog_model_ada_metrics[0], "Precision": fog_model_ada_metrics[1], "Recall": fog_model_ada_metrics[2], "F1": fog_model_ada_metrics[3]}, ignore_index=True)
fog_metrics = fog_metrics.append({"Model": "XGBClassifier", "Accuracy": fog_model_xgbc_metrics[0], "Precision": fog_model_xgbc_metrics[1], "Recall": fog_model_xgbc_metrics[2], "F1": fog_model_xgbc_metrics[3]}, ignore_index=True)
fog_metrics.sort_values(by="F1", ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
4,GradientBoostingClassifier,1.0,1.0,1.0,1.0
5,AdaBoostClassifier,1.0,1.0,1.0,1.0
6,XGBClassifier,1.0,1.0,1.0,1.0
0,Logistic Regression,0.989899,0.99,0.99,0.99
2,Random Forest,0.994949,0.99,0.99,0.99
1,KNN,0.949495,0.94,0.95,0.94
3,SVM,0.939394,0.88,0.94,0.91


In [18]:
model_xgbc_thunderstorm = XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=101)
model_xgbc_thunderstorm.fit(X_train, y_train[["Thunderstorm"]])
thunderstorm_model_xgbc_metrics = get_metrics(model_xgbc_thunderstorm, X_val, y_val[["Thunderstorm"]])

model_ada_thunderstorm = AdaBoostClassifier(n_estimators=150, random_state=101)
model_ada_thunderstorm.fit(X_train, y_train[["Thunderstorm"]])
thunderstorm_model_ada_metrics = get_metrics(model_ada_thunderstorm, X_val, y_val[["Thunderstorm"]])

model_gbc_thunderstorm = GradientBoostingClassifier(n_estimators=150, random_state=101)
model_gbc_thunderstorm.fit(X_train, y_train[["Thunderstorm"]])
thunderstorm_model_gbc_metrics = get_metrics(model_gbc_thunderstorm, X_val, y_val[["Thunderstorm"]])

In [19]:
thunderstorm_metrics = thunderstorm_metrics.append({"Model": "GradientBoostingClassifier", "Accuracy": thunderstorm_model_gbc_metrics[0], "Precision": thunderstorm_model_gbc_metrics[1], "Recall": thunderstorm_model_gbc_metrics[2], "F1": thunderstorm_model_gbc_metrics[3]}, ignore_index=True)
thunderstorm_metrics = thunderstorm_metrics.append({"Model": "AdaBoostClassifier", "Accuracy": thunderstorm_model_ada_metrics[0], "Precision": thunderstorm_model_ada_metrics[1], "Recall": thunderstorm_model_ada_metrics[2], "F1": thunderstorm_model_ada_metrics[3]}, ignore_index=True)
thunderstorm_metrics = thunderstorm_metrics.append({"Model": "XGBClassifier", "Accuracy": thunderstorm_model_xgbc_metrics[0], "Precision": thunderstorm_model_xgbc_metrics[1], "Recall": thunderstorm_model_xgbc_metrics[2], "F1": thunderstorm_model_xgbc_metrics[3]}, ignore_index=True)


In [20]:
thunderstorm_metrics.sort_values(by="F1", ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.90404,0.9,0.9,0.89
1,KNN,0.888889,0.88,0.89,0.88
2,Random Forest,0.893939,0.89,0.89,0.88
4,GradientBoostingClassifier,0.888889,0.88,0.89,0.88
3,SVM,0.888889,0.89,0.89,0.87
6,XGBClassifier,0.883838,0.87,0.88,0.87
5,AdaBoostClassifier,0.868687,0.86,0.87,0.86


In [21]:
#export the best models
import pickle
pickle.dump(model_gbc_rain, open("best_rain.pkl", "wb"))
pickle.dump(model_gbc_fog, open("best_fog.pkl", "wb"))
pickle.dump(logmodel_rain_thunderstorm, open("best_thunderstorm.pkl", "wb"))