## Intepretability Techniques

In [1]:
import pandas as pd
import numpy as np
import shap

from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from joblib import dump, load

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
df = pd.read_csv('../../Data/processed/weatherAUS_imputed.csv')
display(df.head())

Unnamed: 0,Date,Location,Year,Month,RainToday,RainTomorrow,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindGustDir_angle,Latitude,Longitude,Cloud,Pressure,Temperature,Humidity,temp_fluctuation
0,2009-01-01,Cobar,2009,1,0.0,0.0,-0.254034,1.452543,1.231645,0.544048,0.367568,-0.016581,0.134517,-0.346762,-1.566909,1.483789,-2.239319,1.266185
1,2009-01-02,Cobar,2009,1,0.0,0.0,-0.254034,2.09229,1.417426,-0.274399,0.142577,-0.016581,0.134517,-1.33762,-0.534263,0.523016,-2.103988,-0.123978
2,2009-01-03,Cobar,2009,1,0.0,0.0,-0.254034,1.589632,1.497046,-0.795229,-0.307405,-0.016581,0.134517,-1.33762,-0.664246,1.892307,-2.753579,1.531952
3,2009-01-04,Cobar,2009,1,0.0,0.0,-0.254034,1.178365,0.780464,0.395239,-1.432358,-0.016581,0.134517,-0.346762,-0.787008,1.756134,-1.400263,1.450178
4,2009-01-05,Cobar,2009,1,0.0,0.0,-0.254034,1.315454,1.205105,-0.720824,1.26753,-0.016581,0.134517,-0.544934,-0.765344,1.839351,-1.535595,1.102637


In [3]:
data = df.drop(columns=['RainTomorrow', 'Location', 'Date'])
target = df['RainTomorrow']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data,target,test_size=0.2,random_state=4)

In [5]:
def location_scores(sampler, classifier):
    locations = df.Location.unique()
    region_models = {}
    results = []

    for location in locations:
        region_data = df[df['Location'] == location]
        X_loc = region_data.drop(['RainTomorrow', 'Location', 'Date'], axis=1)
        y_loc = region_data['RainTomorrow']
        X_train_loc, X_test_loc, y_train_loc, y_test_loc = train_test_split(X_loc, y_loc, test_size=0.2, random_state=42)
        
        X_resampled,y_resampled = sampler.fit_resample(X_train_loc,y_train_loc)
    
        #Train Classifier for each region
        classifier.fit(X_resampled,  y_resampled)
        region_models[location] = classifier
    
        #Score for each region
        y_pred_loc = classifier.predict(X_test_loc)
        accuracy = accuracy_score(y_test_loc, y_pred_loc)
        precision = precision_score(y_test_loc, y_pred_loc)
        recall = recall_score(y_test_loc, y_pred_loc)
        f1 = f1_score(y_test_loc, y_pred_loc)
        
        results.append({
            'Location': location,
            'Accuracy': round(accuracy, 2),
            'Precision': round(precision, 2),
            'Recall': round(recall, 2),
            'F1': round(f1, 2)
        })

    # Create DataFrame
    results_df = pd.DataFrame(results)

    return results_df

In [6]:
def year_scores(sampler, classifier):
    years = df.Year.unique()
    years_models = {}

    for year in years:
        year_data = df[df['Year'] == year]
        X_year = year_data.drop(['RainTomorrow', 'Location', 'Date'], axis=1)
        y_year = year_data['RainTomorrow']
        X_train_year, X_test_year, y_train_year, y_test_year = train_test_split(X_year, y_year, test_size=0.2, random_state=42)
        X_year,y_year = sampler.fit_resample(X_year,y_year)
    
        #Train Classifier for each year
        classifier.fit(X_train_year, y_train_year)
        years_models[year] = classifier
    
        #Evaluate scores for each year
        y_pred_year = classifier.predict(X_test_year)
        accuracy = accuracy_score(y_test_year, y_pred_year)
        precision = precision_score(y_test_year, y_pred_year)
        recall = recall_score(y_test_year, y_pred_year)
        f1 = f1_score(y_test_year, y_pred_year)
        print(f"{year} - Accuracy: {np.round(accuracy,2)}, Precision: {np.round(precision,2)}, Recall: {np.round(recall,2)}, F1: {np.round(f1,2)}")

### KNN with Random Undersampling and Best Parameter

In [7]:
rus = RandomUnderSampler()

In [8]:
knn_rus = KNeighborsClassifier(n_neighbors=23, metric="manhattan")

In [9]:
location_scores(rus, knn_rus)

Unnamed: 0,Location,Accuracy,Precision,Recall,F1
0,Cobar,0.8,0.37,0.73,0.49
1,CoffsHarbour,0.78,0.6,0.77,0.67
2,Moree,0.85,0.43,0.86,0.58
3,NorfolkIsland,0.72,0.54,0.66,0.59
4,SydneyAirport,0.78,0.54,0.78,0.64
5,WaggaWagga,0.82,0.5,0.79,0.61
6,Williamtown,0.8,0.55,0.77,0.64
7,Canberra,0.78,0.43,0.7,0.53
8,Sale,0.73,0.42,0.72,0.53
9,MelbourneAirport,0.75,0.43,0.81,0.56


In [10]:
year_scores(rus, knn_rus)

2009 - Accuracy: 0.86, Precision: 0.75, Recall: 0.45, F1: 0.56
2010 - Accuracy: 0.83, Precision: 0.77, Recall: 0.49, F1: 0.6
2011 - Accuracy: 0.84, Precision: 0.74, Recall: 0.51, F1: 0.6
2012 - Accuracy: 0.87, Precision: 0.76, Recall: 0.44, F1: 0.55
2013 - Accuracy: 0.85, Precision: 0.75, Recall: 0.42, F1: 0.54
2014 - Accuracy: 0.84, Precision: 0.72, Recall: 0.33, F1: 0.45
2015 - Accuracy: 0.85, Precision: 0.76, Recall: 0.4, F1: 0.52
2016 - Accuracy: 0.83, Precision: 0.74, Recall: 0.43, F1: 0.54
2017 - Accuracy: 0.84, Precision: 0.72, Recall: 0.35, F1: 0.47


### Decision Classifier with SMOTE, best parameter and adaptive boosting

In [11]:
sm = SMOTE()

In [12]:
dt_ac_fit = load('dt_ac_fit.joblib')

In [13]:
location_scores(sm, dt_ac_fit)

Unnamed: 0,Location,Accuracy,Precision,Recall,F1
0,Cobar,0.91,0.64,0.68,0.66
1,CoffsHarbour,0.83,0.73,0.67,0.7
2,Moree,0.91,0.59,0.68,0.63
3,NorfolkIsland,0.77,0.64,0.6,0.62
4,SydneyAirport,0.81,0.62,0.62,0.62
5,WaggaWagga,0.86,0.62,0.6,0.61
6,Williamtown,0.83,0.65,0.59,0.62
7,Canberra,0.87,0.65,0.62,0.63
8,Sale,0.82,0.57,0.58,0.58
9,MelbourneAirport,0.81,0.51,0.58,0.55


In [14]:
year_scores(sm, dt_ac_fit)

2009 - Accuracy: 0.86, Precision: 0.75, Recall: 0.47, F1: 0.58
2010 - Accuracy: 0.85, Precision: 0.79, Recall: 0.57, F1: 0.66
2011 - Accuracy: 0.84, Precision: 0.71, Recall: 0.54, F1: 0.62
2012 - Accuracy: 0.87, Precision: 0.73, Recall: 0.49, F1: 0.59
2013 - Accuracy: 0.86, Precision: 0.76, Recall: 0.49, F1: 0.59
2014 - Accuracy: 0.86, Precision: 0.75, Recall: 0.45, F1: 0.56
2015 - Accuracy: 0.87, Precision: 0.78, Recall: 0.5, F1: 0.61
2016 - Accuracy: 0.85, Precision: 0.75, Recall: 0.53, F1: 0.62
2017 - Accuracy: 0.86, Precision: 0.77, Recall: 0.46, F1: 0.58


### Random Forest Classifier with Random Oversampling 

In [15]:
ros = RandomOverSampler()

In [16]:
rfc_ros = RandomForestClassifier(n_jobs=-1, random_state=321)

In [17]:
location_scores(ros, rfc_ros)

Unnamed: 0,Location,Accuracy,Precision,Recall,F1
0,Cobar,0.91,0.7,0.6,0.65
1,CoffsHarbour,0.82,0.74,0.62,0.68
2,Moree,0.92,0.69,0.62,0.65
3,NorfolkIsland,0.75,0.61,0.56,0.59
4,SydneyAirport,0.8,0.61,0.57,0.59
5,WaggaWagga,0.87,0.67,0.59,0.62
6,Williamtown,0.81,0.61,0.51,0.55
7,Canberra,0.87,0.67,0.49,0.57
8,Sale,0.82,0.59,0.54,0.57
9,MelbourneAirport,0.82,0.54,0.49,0.52


In [18]:
year_scores(ros, rfc_ros)

2009 - Accuracy: 0.86, Precision: 0.75, Recall: 0.47, F1: 0.58
2010 - Accuracy: 0.84, Precision: 0.77, Recall: 0.56, F1: 0.65
2011 - Accuracy: 0.84, Precision: 0.73, Recall: 0.53, F1: 0.62
2012 - Accuracy: 0.87, Precision: 0.73, Recall: 0.49, F1: 0.59
2013 - Accuracy: 0.86, Precision: 0.75, Recall: 0.49, F1: 0.59
2014 - Accuracy: 0.86, Precision: 0.75, Recall: 0.45, F1: 0.56
2015 - Accuracy: 0.86, Precision: 0.76, Recall: 0.49, F1: 0.59
2016 - Accuracy: 0.84, Precision: 0.73, Recall: 0.51, F1: 0.6
2017 - Accuracy: 0.86, Precision: 0.73, Recall: 0.46, F1: 0.57
