In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import normalize

In [24]:
labels_bin = pd.read_csv('labels_bin.csv', compression='zip')
features = pd.read_csv('features.csv', compression='zip')
engineered_features = pd.read_csv('engineered_features.csv', compression='zip')

labels_bin.drop(['Unnamed: 0'],axis=1, inplace=True)
features.drop(['Unnamed: 0'],axis=1, inplace=True)
engineered_features.drop(['Unnamed: 0'],axis=1, inplace=True)

In [25]:
engineered_features.head()

Unnamed: 0,humidity,temperature,pressure,wind direction,wind speed,Latitude,Longitude,month,day,hour,day_avg_hum,month_avg_hum,year_avg_hum,day_avg_temp,month_avg_temp,year_avg_temp,day_avg_press,month_avg_press,year_avg_press
0,87.0,284.590217,807.0,268.0,0.0,49.24966,-123.119339,10,2,9,67.822292,67.819934,67.756567,287.58953,287.593295,287.600263,1018.114301,1018.116066,1018.091305
1,88.0,284.588174,849.0,281.0,0.0,49.24966,-123.119339,10,2,10,67.822292,67.819934,67.756567,287.58953,287.593295,287.600263,1018.114301,1018.116066,1018.091305
2,89.0,284.58613,890.0,295.0,0.0,49.24966,-123.119339,10,2,11,67.822292,67.819934,67.756567,287.58953,287.593295,287.600263,1018.114301,1018.116066,1018.091305
3,89.0,284.584087,932.0,309.0,0.0,49.24966,-123.119339,10,2,12,67.822292,67.819934,67.756567,287.58953,287.593295,287.600263,1018.114301,1018.116066,1018.091305
4,90.0,284.582043,973.0,323.0,0.0,49.24966,-123.119339,10,2,13,67.822292,67.819934,67.756567,287.58953,287.593295,287.600263,1018.114301,1018.116066,1018.091305


In [26]:
norm_features = normalize(engineered_features)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(engineered_features, labels_bin, test_size=0.33, random_state=42)

In [28]:
# Create the model with 100 trees
model = RandomForestClassifier(n_estimators = 100, oob_score = True, n_jobs = -1,random_state =50,                                         max_features ="auto", verbose=True)
# Fit on training data
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.1min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   12.2s finished


In [29]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[265027  17594]
 [ 31473 124951]]
              precision    recall  f1-score   support

       False       0.89      0.94      0.92    282621
        True       0.88      0.80      0.84    156424

    accuracy                           0.89    439045
   macro avg       0.89      0.87      0.88    439045
weighted avg       0.89      0.89      0.89    439045

0.8882415242173354


In [30]:
train_rf_predictions = model.predict(X_train)
train_rf_probs = model.predict_proba(X_train)[:, 1]

# Actual class predictions
rf_predictions = model.predict(X_test)
# Probabilities for each class
rf_probs = model.predict_proba(X_test)[:, 1]

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   24.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   29.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   13.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   12.7s finished


In [31]:
# Calculate roc auc
roc_value = roc_auc_score(y_test, rf_probs)
roc_value_train = roc_auc_score(y_train, train_rf_probs)

In [32]:
print(roc_value)

0.9501782549452488


In [33]:
print(roc_value_train)

0.9999999999343067
