<a href="https://colab.research.google.com/github/granantuin/Model_vs_data_label/blob/master/randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.metrics import precision_recall_fscore_support,confusion_matrix,accuracy_score
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate,GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV

In [0]:
# Attach Google Drive to this Colab runtime; the data directories read below sit under this mount.
from google.colab import drive
drive.mount(mountpoint='/content/drive')


In [0]:
def evaluate(y_test,y_pred):
  """Print a confusion matrix and precision/recall summaries for wind-direction predictions.

  Parameters
  ----------
  y_test : array-like of str
      True compass-sector labels.
  y_pred : array-like of str
      Predicted compass-sector labels, same length as ``y_test``.

  Returns
  -------
  None
      Everything is printed. Each line also echoes a hard-coded "Model" figure
      next to the computed metric so the two can be compared by eye.

  Notes
  -----
  All metrics are computed over the fixed eight-sector label list. A sector
  that is absent from both ``y_test`` and ``y_pred`` therefore shows up as a
  row/column of zeros instead of making the table construction fail.
  """
  index=["E","N","NE","NW","S","SE","SW","W"]
  # Without labels=index scikit-learn infers the label set from the data, and
  # the result no longer lines up with the eight names used for rows/columns.
  print(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=index), index=index, columns=index))
  print("****************")
  print("Accuracy=","{:.2%}".format(accuracy_score(y_test, y_pred)),"// Model Accuracy=45%")
  # average=None returns one precision/recall/F1 value per entry of `labels`.
  results= precision_recall_fscore_support(y_test, y_pred, average=None, labels=index)
  df=pd.DataFrame({"Precision":results[0],"Recall":results[1],"F1":results[2],"W_DIR":index})
  df=df.set_index("W_DIR")
  # Unweighted (macro) means: every sector counts the same regardless of support.
  print("Average precision =","{:.2%}".format(df["Precision"].mean()),"// Model precision=28%")
  print("Average recall =","{:.2%}".format(df["Recall"].mean()),"// Model recall=28%")
  # Support-weighted averages: sectors with more true samples weigh more.
  results= precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=index)
  print("Precision weighted=","{:.2%}".format(results[0]),"//Model weighted=56%")
  print("Recall weighted =","{:.2%}".format(results[1]),"//Model weighted=53%")
  print("****************")
  print(df)

In [0]:
# Two candidate data directories on the mounted drive; dr[1] (drive_1km) is the one read here.
drive_4km="/content/drive/My Drive/Colab Notebooks/model_vs_data/Coron_4km_h24toh48_dir/"
drive_1km="/content/drive/My Drive/Colab Notebooks/model_vs_data/Coron_Mars_H24to48_dir/"
dr=[drive_4km,drive_1km]
data_dir=dr[1]

# Target table (the cells below read its "value" and "date" columns).
y_data=pd.read_excel(data_dir+"y_data.xlsx")
# Feature table: only file columns 1..9 are loaded.
x_data=pd.read_csv(data_dir+"x_data",usecols=range(1,10))
# Speed table; "spd" is divided by 3.6 to go from Km/h to m/s.
y_sp=pd.read_excel(data_dir+"y_sp.xlsx")
y_sp["spd"]=y_sp["spd"].div(3.6)

In [0]:
# Turn the direction in y_data["value"] (degrees) into one of eight 45-degree
# compass sectors stored in a new "label" column.
#
# Every sector is closed on its lower edge and open on its upper edge, so a
# reading that falls exactly on a boundary (22.5, 67.5, ..., 337.5) gets a
# label. Comparing with strict ">" and "<" on both sides would match no sector
# for those readings and silently drop the rows.
sector_edges=[-np.inf,22.5,67.5,112.5,157.5,202.5,247.5,292.5,337.5,np.inf]
# First and last bins are both "N": north wraps around 0/360 degrees.
sector_names=["N","NE","E","SE","S","SW","W","NW","N"]
sector_code=pd.cut(y_data["value"],bins=sector_edges,labels=False,right=False)
# assign() builds a new frame, avoiding SettingWithCopyWarning from writing into filtered slices.
y_data=y_data.assign(label=sector_code.map(dict(enumerate(sector_names))))
# Rows with a missing direction fall in no bin and get no label; drop them.
y_data=y_data.dropna(subset=["label"])
# Per-sector frames kept under their original names in case other cells use them.
NE,N,E,SE,S,SW,W,NW=winds=[y_data[y_data["label"]==name] for name in ["NE","N","E","SE","S","SW","W","NW"]]
y_data=y_data.sort_values(by="date")

**Dummy class**

In [6]:
# Baseline classifiers that ignore the features, used as a floor for the real model:
#   "stratified"    - generates predictions by respecting the training set's class distribution
#   "most_frequent" - always predicts the most frequent label in the training set
# The split is seeded, so it is identical for both strategies and is done once.
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data["label"], test_size=0.2, random_state=5)
for strategy in ["stratified","most_frequent"]:
  print("Strategy = "+strategy)
  # "stratified" draws random labels; seeding it makes the printed metrics repeatable.
  y_pred = DummyClassifier(strategy=strategy, random_state=5).fit(x_train, y_train).predict(x_test)
  evaluate(y_test,y_pred)

Strategy = stratified
    E  N  NE  NW  S  SE  SW  W
E   0  0   5   0  1   0   1  1
N   3  0   7   0  0   0   4  1
NE  5  3  39   0  7   2  13  7
NW  0  1   6   0  1   0   1  0
S   0  0   5   0  0   0   2  1
SE  2  1   3   0  1   0   2  1
SW  3  0   9   0  1   1   4  3
W   0  1  12   0  0   1   2  1
****************
Accuracy= 26.83% // Model Accuracy=45%
Average precision = 8.23% // Model precision=28%
Average recall = 9.53% // Model recall=28%
Precision weighted= 23.47% //Model weighted=56%
Recall weighted = 26.83% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.000000  0.000000  0.000000
N       0.000000  0.000000  0.000000
NE      0.453488  0.513158  0.481481
NW      0.000000  0.000000  0.000000
S       0.000000  0.000000  0.000000
SE      0.000000  0.000000  0.000000
SW      0.137931  0.190476  0.160000
W       0.066667  0.058824  0.062500
Strategy = most_frequent
    E  N  NE  NW  S  SE  SW  W
E   0  0   8  

  'precision', 'predicted', average, warn_for)


**Random Forest Class**

In [7]:
# Random forest on the raw features, using the same 80/20 seeded split as the dummy baselines.
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data["label"], test_size=0.2, random_state=5)
# random_state fixes the bootstrap samples and feature draws so the metrics
# printed by evaluate() are the same on every run.
y_pred = RandomForestClassifier(n_estimators=40, random_state=5).fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)

    E  N  NE  NW  S  SE  SW  W
E   3  0   3   0  1   0   0  1
N   1  5   5   0  0   1   0  3
NE  0  4  66   0  1   0   5  0
NW  1  0   1   0  0   0   5  2
S   0  0   2   0  1   0   3  2
SE  0  0   5   0  2   1   1  1
SW  1  0   4   0  0   1  10  5
W   0  0   6   1  0   0   3  7
****************
Accuracy= 56.71% // Model Accuracy=45%
Average precision = 37.62% // Model precision=28%
Average recall = 33.62% // Model recall=28%
Precision weighted= 51.97% //Model weighted=56%
Recall weighted = 56.71% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.500000  0.375000  0.428571
N       0.555556  0.333333  0.416667
NE      0.717391  0.868421  0.785714
NW      0.000000  0.000000  0.000000
S       0.200000  0.125000  0.153846
SE      0.333333  0.100000  0.153846
SW      0.370370  0.476190  0.416667
W       0.333333  0.411765  0.368421


**Synthetic samples**

In [8]:
# Oversample the minority sectors with SMOTE, then train the forest on the balanced set.
# Only the training part is resampled; the test part keeps the original class mix.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data["label"], test_size=0.4, random_state=27)
print("Original train samples=",x_train.shape[0])
sm = SMOTE(random_state=27,)
# fit_resample replaces the older fit_sample name, which current imbalanced-learn no longer provides.
x_train, y_train = sm.fit_resample(x_train, y_train)
print("Synthetic train samples=",x_train.shape[0])
print("****Randomforest****")
# Seeded so the printed metrics are repeatable.
y_pred = RandomForestClassifier(n_estimators=40, random_state=27).fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)

Original train samples= 489
Synthetic train samples= 1944
****Randomforest****
     E   N  NE  NW  S  SE  SW   W
E    4   2  10   1  0   3   4   0
N    3   8  11   0  0   1   0   0
NE  12  11  96   1  2   7   2   2
NW   1   1   3   4  0   1   4   0
S    1   1   2   2  8   3   7   0
SE   4   0   3   1  1   3   1   0
SW   1   3   4   4  7   5  23   4
W    1   5   9   5  2   4   6  13
****************
Accuracy= 48.62% // Model Accuracy=45%
Average precision = 37.61% // Model precision=28%
Average recall = 35.32% // Model recall=28%
Precision weighted= 52.57% //Model weighted=56%
Recall weighted = 48.62% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.148148  0.166667  0.156863
N       0.258065  0.347826  0.296296
NE      0.695652  0.721805  0.708487
NW      0.222222  0.285714  0.250000
S       0.400000  0.333333  0.363636
SE      0.111111  0.230769  0.150000
SW      0.489362  0.450980  0.469388
W       0.684211  0.2

**Filtering by wind speed threshold**

In [0]:

# Keep only the samples whose speed exceeds a threshold, then rebuild x_data / y_data from them.
# NOTE: this cell replaces x_data and y_data and moves "date" into the index, so it
# cannot be run twice without re-running the loading and labelling cells first.
SPEED_THRESHOLD=2  # compared against y_sp["spd"], which the loading cell divides by 3.6 (Km/h to m/s)

# Column assignment aligns on the row index, so each feature row receives the
# date of the y_data row that carries the same index label.
x_data["date"]=y_data["date"]
x_data=x_data.set_index("date")
y_data=y_data.set_index("date")
y_sp=y_sp.set_index("date")
# Remember the feature names so they are selected by name rather than by position.
feature_columns=list(x_data.columns)
# Join side by side on date and drop any date that is incomplete in one of the tables.
tot=pd.concat([x_data,y_data,y_sp],axis=1).dropna()
windy=tot[tot["spd"]>SPEED_THRESHOLD]
x_data=windy[feature_columns]
# Stays a one-column DataFrame because later cells index it with y_data["label"].
y_data=windy[["label"]]


In [12]:
# Random forest on the speed-filtered samples (60/40 seeded split).
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data["label"], test_size=0.4, random_state=5)
# Kept under the name `clas` because the cross-validation cell reuses this estimator.
# random_state makes both this fit and the later cross-validation repeatable.
clas=RandomForestClassifier(n_estimators=40, random_state=5)
y_pred = clas.fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)

    E  N   NE  NW  S  SE  SW   W
E   0  0    6   0  1   0   1   1
N   0  3    5   0  1   0   1   1
NE  0  3  114   0  1   0   0   3
NW  0  1    1   3  0   0   0   3
S   0  0    1   0  2   0   2   1
SE  0  0    1   0  0   0   0   0
SW  0  0    5   2  7   1  26   7
W   0  0    4   3  0   0  16  15
****************
Accuracy= 67.36% // Model Accuracy=45%
Average precision = 35.64% // Model precision=28%
Average recall = 35.75% // Model recall=28%
Precision weighted= 64.02% //Model weighted=56%
Recall weighted = 67.36% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.000000  0.000000  0.000000
N       0.428571  0.272727  0.333333
NE      0.832117  0.942149  0.883721
NW      0.375000  0.375000  0.375000
S       0.166667  0.333333  0.222222
SE      0.000000  0.000000  0.000000
SW      0.565217  0.541667  0.553191
W       0.483871  0.394737  0.434783


  'precision', 'predicted', average, warn_for)


**K Folds**

In [13]:
# 5-fold cross-validation of `clas` on the current x_data / y_data, reporting
# mean and two standard deviations for each requested metric.
scoring = ['precision_macro', 'recall_macro','f1_macro',"accuracy","precision_weighted","recall_weighted","f1_weighted"]
scores = cross_validate(clas, x_data, y_data["label"], scoring=scoring,
                        cv=5, return_train_score=False)

# (format string, key in `scores`) in the order the lines are printed.
report_lines = [
  ("Accuracy: {:.2%} (+/- {:.2%})", "test_accuracy"),
  ("Recall: {:.2%} (+/- {:.2%})", "test_recall_macro"),
  ("Precision: {:.2%} (+/- {:.2%})", "test_precision_macro"),
  ("f1 : {:.2%} (+/-{:.2%} )", "test_f1_macro"),
  ("Recall weighted: {:.2%} (+/- {:.2%})", "test_recall_weighted"),
  ("Precision weighted: {:.2%} (+/- {:.2%})", "test_precision_weighted"),
  ("f1 weighted: {:.2%} (+/-{:.2%} )", "test_f1_weighted"),
]
for template, key in report_lines:
  fold_scores = scores[key]
  print(template.format(fold_scores.mean(), fold_scores.std() * 2))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 64.89% (+/- 16.05%)
Recall: 29.68% (+/- 13.74%)
Precision: 33.84% (+/- 16.41%)
f1 : 30.16% (+/-14.20% )
Recall weighted: 64.89% (+/- 16.05%)
Precision weighted: 61.63% (+/- 17.14%)
f1 weighted: 62.34% (+/-16.55% )


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
