<a href="https://colab.research.google.com/github/granantuin/Model_vs_data_label/blob/master/randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.metrics import precision_recall_fscore_support,confusion_matrix,accuracy_score
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate,GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV

In [0]:
from google.colab import drive
drive.mount('/content/drive')


In [0]:
def evaluate(y_test,y_pred):
  index=["E","N","NE","NW","S","SE","SW","W"]
  print(pd.DataFrame(confusion_matrix(y_test, y_pred), index=index, columns=index))
  print("****************")
  print("Accuracy=","{:.2%}".format(accuracy_score(y_test, y_pred)),"// Model Accuracy=45%")
  results= precision_recall_fscore_support(y_test, y_pred, average=None, )
  df=pd.DataFrame({"Precision":results[0],"Recall":results[1],"F1":results[2],"W_DIR":index})
  df=df.set_index("W_DIR")
  print("Average precision =","{:.2%}".format(df["Precision"].mean()),"// Model precision=28%")
  print("Average recall =","{:.2%}".format(df["Recall"].mean()),"// Model recall=28%")
  results= precision_recall_fscore_support(y_test, y_pred, average='weighted', )
  print("Precision weighted=","{:.2%}".format(results[0]),"//Model weighted=56%")
  print("Recall weighted =","{:.2%}".format(results[1]),"//Model weighted=53%")
  print("****************")
  print(df)

In [0]:
drive_4km="/content/drive/My Drive/Colab Notebooks/model_vs_data/Coron_4km_h24toh48_dir/"
drive_1km="/content/drive/My Drive/Colab Notebooks/model_vs_data/Coron_Mars_H24to48_dir/"
dr=[drive_4km,drive_1km]
y_data=pd.read_excel(dr[1]+"y_data.xlsx")
x_data=pd.read_csv(dr[1]+"x_data",usecols=range(1,10))
y_sp=pd.read_excel(dr[1]+"y_sp.xlsx") # units=Km/h to m/s
y_sp["spd"]=y_sp["spd"]/3.6

In [0]:
NE=y_data[(y_data["value"]>22.5) & (y_data["value"]<67.5)]
NE["label"]="NE"
E=y_data[(y_data["value"]>67.5) & (y_data["value"]<112.5)]
E["label"]="E"
SE=y_data[(y_data["value"]>112.5) & (y_data["value"]<157.5)]
SE["label"]="SE"
S=y_data[(y_data["value"]>157.5) & (y_data["value"]<202.5)]
S["label"]="S"
SW=y_data[(y_data["value"]>202.5) & (y_data["value"]<247.5)]
SW["label"]="SW"
W=y_data[(y_data["value"]>247.5) & (y_data["value"]<292.5)]
W["label"]="W"
NW=y_data[(y_data["value"]>292.5) & (y_data["value"]<337.5)]
NW["label"]="NW"
N=y_data[(y_data["value"]>337.5) | (y_data["value"]<22.5)]
N["label"]="N"
winds=[NE,N,E,SE,S,SW,W,NW]
y_data=pd.concat(winds)
y_data=y_data.sort_values(by="date")

**Dummy class**

In [6]:
#strategy=["stratified","most_frequent”]
#“stratified”: generates predictions by respecting the training set’s class distribution
#“most_frequent”: always predicts the most frequent label in the training set.
print("Strategy = stratified")
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data["label"], test_size=0.2, random_state=5)
y_pred = DummyClassifier(strategy="stratified").fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)
print("Strategy = most_frequent")
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data["label"], test_size=0.2, random_state=5)
y_pred = DummyClassifier(strategy="most_frequent").fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)

Strategy = stratified
    E  N  NE  NW  S  SE  SW  W
E   1  0   2   1  1   1   1  1
N   0  0   8   0  0   0   2  5
NE  2  2  34   6  3   2  19  8
NW  1  0   4   0  0   0   3  1
S   0  0   3   0  2   0   1  2
SE  0  0   1   1  3   0   2  3
SW  0  0  13   0  1   1   3  3
W   2  0   8   0  1   2   3  1
****************
Accuracy= 25.00% // Model Accuracy=45%
Average precision = 11.80% // Model precision=28%
Average recall = 12.80% // Model recall=28%
Precision weighted= 24.85% //Model weighted=56%
Recall weighted = 25.00% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.166667  0.125000  0.142857
N       0.000000  0.000000  0.000000
NE      0.465753  0.447368  0.456376
NW      0.000000  0.000000  0.000000
S       0.181818  0.250000  0.210526
SE      0.000000  0.000000  0.000000
SW      0.088235  0.142857  0.109091
W       0.041667  0.058824  0.048780
Strategy = most_frequent
    E  N  NE  NW  S  SE  SW  W
E   0  0   8

  'precision', 'predicted', average, warn_for)


**Random Forest Class**

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data["label"], test_size=0.2, random_state=5)
y_pred = RandomForestClassifier(n_estimators=40).fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)

    E  N  NE  NW  S  SE  SW  W
E   1  0   4   0  0   0   2  1
N   0  5   6   0  0   0   0  4
NE  1  2  66   0  1   0   4  2
NW  1  0   1   0  0   0   4  3
S   1  0   1   0  0   0   4  2
SE  0  0   4   0  2   1   2  1
SW  1  1   4   0  3   0   8  4
W   0  0   6   1  0   0   3  7
****************
Accuracy= 53.66% // Model Accuracy=45%
Average precision = 39.13% // Model precision=28%
Average recall = 27.74% // Model recall=28%
Precision weighted= 52.85% //Model weighted=56%
Recall weighted = 53.66% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.200000  0.125000  0.153846
N       0.625000  0.333333  0.434783
NE      0.717391  0.868421  0.785714
NW      0.000000  0.000000  0.000000
S       0.000000  0.000000  0.000000
SE      1.000000  0.100000  0.181818
SW      0.296296  0.380952  0.333333
W       0.291667  0.411765  0.341463


**Synthetic samples**

In [0]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data["label"], test_size=0.4, random_state=27)
print("Original train samples=",x_train.shape[0])
sm = SMOTE(random_state=27,)
x_train, y_train = sm.fit_sample(x_train, y_train)
print("Synthetic train samples=",x_train.shape[0])
print("****Randomforest****")
y_pred = RandomForestClassifier(n_estimators=40).fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)

Original train samples= 489
Synthetic train samples= 1944
****Randomforest****
     E   N  NE  NW  S  SE  SW   W
E    4   2  10   1  0   3   4   0
N    3   8  11   0  0   1   0   0
NE  12  11  96   1  2   7   2   2
NW   1   1   3   4  0   1   4   0
S    1   1   2   2  8   3   7   0
SE   4   0   3   1  1   3   1   0
SW   1   3   4   4  7   5  23   4
W    1   5   9   5  2   4   6  13
****************
Accuracy= 48.62% // Model Accuracy=45%
Average precision = 37.61% // Model precision=28%
Average recall = 35.32% // Model recall=28%
Precision weighted= 52.57% //Model weighted=56%
Recall weighted = 48.62% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.148148  0.166667  0.156863
N       0.258065  0.347826  0.296296
NE      0.695652  0.721805  0.708487
NW      0.222222  0.285714  0.250000
S       0.400000  0.333333  0.363636
SE      0.111111  0.230769  0.150000
SW      0.489362  0.450980  0.469388
W       0.684211  0.2

**Filtering wind speed thershold**

In [0]:

x_data["date"]=y_data["date"]
x_data=x_data.set_index("date")
y_data=y_data.set_index("date")
y_sp=y_sp.set_index("date")
tot=pd.concat([x_data,y_data,y_sp],axis=1).dropna()
x_data=tot[tot["spd"]>2].iloc[:, 0:9]
y_data=tot[tot["spd"]>2].iloc[:, 10:11]


In [30]:
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data["label"], test_size=0.5, random_state=5)
clas=RandomForestClassifier(n_estimators= 200, min_samples_split= 10, min_samples_leaf= 2, max_features= 'sqrt', max_depth= 50, bootstrap= True)
y_pred = clas.fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)

    E  N   NE  NW  S  SE  SW   W
E   0  1    7   0  1   0   1   1
N   0  4    7   0  0   0   1   2
NE  0  3  139   0  1   0   1   6
NW  0  2    2   0  0   0   0   6
S   0  0    2   0  1   0   6   2
SE  0  0    0   0  0   0   1   0
SW  0  0    8   0  4   0  34  12
W   0  2    4   0  0   0  13  28
****************
Accuracy= 68.21% // Model Accuracy=45%
Average precision = 29.83% // Model precision=28%
Average recall = 31.07% // Model recall=28%
Precision weighted= 62.02% //Model weighted=56%
Recall weighted = 68.21% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.000000  0.000000  0.000000
N       0.333333  0.285714  0.307692
NE      0.822485  0.926667  0.871473
NW      0.000000  0.000000  0.000000
S       0.142857  0.090909  0.111111
SE      0.000000  0.000000  0.000000
SW      0.596491  0.586207  0.591304
W       0.491228  0.595745  0.538462


  'precision', 'predicted', average, warn_for)


**K Folds**

In [31]:
scoring = ['precision_macro', 'recall_macro','f1_macro',"accuracy","precision_weighted","recall_weighted","f1_weighted"]
scores = cross_validate(clas, x_data, y_data["label"], scoring=scoring,
                        cv=5, return_train_score=False)
print("Accuracy: {:.2%} (+/- {:.2%})" .format (scores["test_accuracy"].mean(), scores["test_accuracy"].std() * 2))
print("Recall: {:.2%} (+/- {:.2%})" .format (scores["test_recall_macro"].mean(), scores["test_recall_macro"].std() * 2))
print("Precision: {:.2%} (+/- {:.2%})" .format (scores["test_precision_macro"].mean(), scores["test_precision_macro"].std() * 2))
print("f1 : {:.2%} (+/-{:.2%} )".format (scores["test_f1_macro"].mean(), scores["test_f1_macro"].std() * 2))
print("Recall weighted: {:.2%} (+/- {:.2%})" .format (scores["test_recall_weighted"].mean(), scores["test_recall_weighted"].std() * 2))
print("Precision weighted: {:.2%} (+/- {:.2%})" .format (scores["test_precision_weighted"].mean(), scores["test_precision_weighted"].std() * 2))
print("f1 weighted: {:.2%} (+/-{:.2%} )".format (scores["test_f1_weighted"].mean(), scores["test_f1_weighted"].std() * 2))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 66.16% (+/- 10.76%)
Recall: 28.92% (+/- 9.91%)
Precision: 31.23% (+/- 21.43%)
f1 : 28.14% (+/-12.06% )
Recall weighted: 66.16% (+/- 10.76%)
Precision weighted: 61.14% (+/- 17.90%)
f1 weighted: 62.36% (+/-12.93% )


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


**Tuning RandomForest**

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25,
                                                    random_state=27)
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                               n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(x_train, y_train)
print("best parameters",rf_random.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


best parameters {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}


**K folds**

In [17]:
clas=RandomForestClassifier(n_estimators= 200, min_samples_split= 10, min_samples_leaf= 2, max_features= 'sqrt', max_depth= 50, bootstrap= True)
scoring = ['precision_macro', 'recall_macro','f1_macro',"accuracy","precision_weighted","recall_weighted","f1_weighted"]
scores = cross_validate(clas, x_data, y_data["label"], scoring=scoring,
                        cv=5, return_train_score=False)
print("Accuracy: {:.2%} (+/- {:.2%})" .format (scores["test_accuracy"].mean(), scores["test_accuracy"].std() * 2))
print("Recall: {:.2%} (+/- {:.2%})" .format (scores["test_recall_macro"].mean(), scores["test_recall_macro"].std() * 2))
print("Precision: {:.2%} (+/- {:.2%})" .format (scores["test_precision_macro"].mean(), scores["test_precision_macro"].std() * 2))
print("f1 : {:.2%} (+/-{:.2%} )".format (scores["test_f1_macro"].mean(), scores["test_f1_macro"].std() * 2))
print("Recall weighted: {:.2%} (+/- {:.2%})" .format (scores["test_recall_weighted"].mean(), scores["test_recall_weighted"].std() * 2))
print("Precision weighted: {:.2%} (+/- {:.2%})" .format (scores["test_precision_weighted"].mean(), scores["test_precision_weighted"].std() * 2))
print("f1 weighted: {:.2%} (+/-{:.2%} )".format (scores["test_f1_weighted"].mean(), scores["test_f1_weighted"].std() * 2))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 66.00% (+/- 11.94%)
Recall: 28.28% (+/- 9.98%)
Precision: 30.63% (+/- 20.64%)
f1 : 27.84% (+/-12.23% )
Recall weighted: 66.00% (+/- 11.94%)
Precision weighted: 60.83% (+/- 18.36%)
f1 weighted: 62.28% (+/-14.08% )


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


**Dummy**

In [32]:
#strategy=["stratified","most_frequent”]
#“stratified”: generates predictions by respecting the training set’s class distribution
#“most_frequent”: always predicts the most frequent label in the training set.
print("Strategy = stratified")
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data["label"], test_size=0.2, random_state=5)
y_pred = DummyClassifier(strategy="stratified").fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)
print("Strategy = most_frequent")
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data["label"], test_size=0.2, random_state=5)
y_pred = DummyClassifier(strategy="most_frequent").fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)

Strategy = stratified
    E  N  NE  NW  S  SE  SW   W
E   0  1   2   0  1   0   0   0
N   0  0   7   0  0   0   0   0
NE  2  4  26   1  4   2  10  13
NW  1  0   2   0  0   0   1   1
S   0  1   0   0  0   0   2   0
SE  0  0   0   0  0   0   1   0
SW  0  1  10   2  1   0   8   1
W   0  0  10   0  2   0   3   1
****************
Accuracy= 28.93% // Model Accuracy=45%
Average precision = 10.48% // Model precision=28%
Average recall = 10.37% // Model recall=28%
Precision weighted= 30.28% //Model weighted=56%
Recall weighted = 28.93% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E        0.00000  0.000000  0.000000
N        0.00000  0.000000  0.000000
NE       0.45614  0.419355  0.436975
NW       0.00000  0.000000  0.000000
S        0.00000  0.000000  0.000000
SE       0.00000  0.000000  0.000000
SW       0.32000  0.347826  0.333333
W        0.06250  0.062500  0.062500
Strategy = most_frequent
    E  N  NE  NW  S  SE  SW  W
E  

  'precision', 'predicted', average, warn_for)
