<a href="https://colab.research.google.com/github/granantuin/Model_vs_data_label/blob/master/four_class_thershold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.metrics import precision_recall_fscore_support,confusion_matrix,accuracy_score
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate,GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# paths used further down are all rooted under this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [0]:
def evaluate(y_test,y_pred):
  """Print a confusion matrix and precision/recall/F1 summaries.

  Parameters
  ----------
  y_test : array-like of str
      True compass-sector labels.
  y_pred : array-like of str
      Predicted compass-sector labels, same length as ``y_test``.

  Returns
  -------
  None
      Everything is written to stdout.

  Notes
  -----
  The "// Model ..." figures appended to each line are hard-coded reference
  values printed for side-by-side comparison (presumably the baseline the
  classifier is being compared against -- TODO confirm their origin).
  """
  index=["E","N","NE","NW","S","SE","SW","W"]
  # Pin the label set and order explicitly. Without `labels=`, scikit-learn
  # derives the classes from the data, so a sector missing from this split
  # would yield a 7x7 matrix / 7-element score arrays that no longer line up
  # with the 8-entry index and the DataFrame construction would fail.
  print(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=index), index=index, columns=index))
  print("****************")
  print("Accuracy=","{:.2%}".format(accuracy_score(y_test, y_pred)),"// Model Accuracy=45%")
  # Per-class scores (average=None), one entry per sector in `index` order.
  results= precision_recall_fscore_support(y_test, y_pred, labels=index, average=None, )
  df=pd.DataFrame({"Precision":results[0],"Recall":results[1],"F1":results[2],"W_DIR":index})
  df=df.set_index("W_DIR")
  # Unweighted (macro) means over the eight sectors.
  print("Average precision =","{:.2%}".format(df["Precision"].mean()),"// Model precision=28%")
  print("Average recall =","{:.2%}".format(df["Recall"].mean()),"// Model recall=28%")
  # Support-weighted averages over the classes present in the data.
  results= precision_recall_fscore_support(y_test, y_pred, average='weighted', )
  print("Precision weighted=","{:.2%}".format(results[0]),"//Model weighted=56%")
  print("Recall weighted =","{:.2%}".format(results[1]),"//Model weighted=53%")
  print("****************")
  print(df)

In [0]:
# Data folders on the mounted Drive (names suggest two model resolutions --
# TODO confirm). `dr[0]` selects the folder used throughout the notebook.
drive_4km="/content/drive/My Drive/Colab Notebooks/model_vs_data/Coron_4km_h24toh48_dir/"
drive_1km="/content/drive/My Drive/Colab Notebooks/model_vs_data/Coron_Mars_H24to48_dir/"
dr=[drive_4km,drive_1km]
data_dir = dr[0]

# Target (direction) comes from the Excel sheet, predictors from the CSV;
# only the first nine predictor columns are kept.
y_data = pd.read_excel(data_dir + "y_coron_dir.xlsx", index_col=0)
x_data = pd.read_csv(data_dir + "x_coron.csv", index_col=0).iloc[:, 0:9]

# Non-positive readings are treated as station errors: mask them to NaN so the
# dropna() after the join removes those rows.
y_data = y_data.where(y_data > 0)

# Align predictors and target on the index and keep only complete rows.
result = x_data.join(y_data, how='outer').dropna()
x_data = result.iloc[:, 0:9]
y_data = result.iloc[:, 9:10]

In [0]:
def label_sector(frame, label, lower, upper):
  """Return the rows of `frame` whose "value" lies in [lower, upper), tagged with `label`.

  The interval is half-open so that a reading exactly on a sector boundary
  (22.5, 67.5, ...) lands in exactly one sector instead of being dropped.
  When `lower` > `upper` the sector is taken to wrap through north, i.e.
  value >= lower OR value < upper.

  A new DataFrame is returned (via `.assign`), so no column is written onto a
  filtered slice and pandas does not emit SettingWithCopyWarning.
  """
  deg = frame["value"]
  if lower < upper:
    mask = (deg >= lower) & (deg < upper)
  else:
    mask = (deg >= lower) | (deg < upper)
  return frame.loc[mask].assign(label=label)

# Eight 45-degree compass sectors centred on N, NE, E, ... (degrees).
NE=label_sector(y_data,"NE",22.5,67.5)
E=label_sector(y_data,"E",67.5,112.5)
SE=label_sector(y_data,"SE",112.5,157.5)
S=label_sector(y_data,"S",157.5,202.5)
SW=label_sector(y_data,"SW",202.5,247.5)
W=label_sector(y_data,"W",247.5,292.5)
NW=label_sector(y_data,"NW",292.5,337.5)
N=label_sector(y_data,"N",337.5,22.5)
winds=[NE,N,E,SE,S,SW,W,NW]
# Every row now belongs to exactly one sector, so the concatenation has the
# same number of rows as the input and stays aligned with x_data.
y_data=pd.concat(winds)
y_data=y_data.sort_index()

**Classifiers**

In [8]:
# Candidate estimators; only the random forest at position 1 is fitted below.
clas = [
    DummyClassifier(),
    RandomForestClassifier(n_estimators=200, max_depth=100, bootstrap=True),
    ExtraTreesClassifier(),
    AdaBoostClassifier(),
]

# Hold out 30% of the rows for testing (fixed seed for the split).
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data["label"], test_size=0.3, random_state=5
)

# Fit on the training part, predict the held-out part, print the report.
clas[1].fit(x_train, y_train)
y_pred = clas[1].predict(x_test)
evaluate(y_test, y_pred)

    E   N   NE  NW    S  SE   SW   W
E   3   8   83   1   20   2   10   3
N   3  45  196   8   12   0   18  19
NE  2  39  731   0   16   4   14  19
NW  1  16   33   4    6   0    8  14
S   2   6   53   1  281   8  132  13
SE  5   1   35   0   29   4    8   7
SW  1   9   51   4  128   2  257  35
W   1  14   25   7   17   1   64  57
****************
Accuracy= 53.24% // Model Accuracy=45%
Average precision = 35.56% // Model precision=28%
Average recall = 31.91% // Model recall=28%
Precision weighted= 47.45% //Model weighted=56%
Recall weighted = 53.24% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.166667  0.023077  0.040541
N       0.326087  0.149502  0.205011
NE      0.605634  0.886061  0.719488
NW      0.160000  0.048780  0.074766
S       0.552063  0.566532  0.559204
SE      0.190476  0.044944  0.072727
SW      0.502935  0.527721  0.515030
W       0.341317  0.306452  0.322946


**Synthetic samples**

In [9]:
# Hold out 40% of the rows for testing; only the training part is oversampled,
# so the test set keeps the original class distribution.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data["label"], test_size=0.4, random_state=27)
print("Original train samples=",x_train.shape[0])
sm = SMOTE(random_state=27,)
# `fit_resample` is the supported imbalanced-learn API; `fit_sample` was a
# deprecated alias and is no longer available in current releases.
x_train, y_train = sm.fit_resample(x_train, y_train)
print("Synthetic train samples=",x_train.shape[0])

# Refit the random forest (clas[1]) on the oversampled data and evaluate it on
# the untouched test split.
y_pred = clas[1].fit(x_train, y_train).predict(x_test)
evaluate(y_test,y_pred)

Original train samples= 5191
Synthetic train samples= 13552
     E    N   NE  NW    S  SE   SW   W
E   33    8   47  14   12  22    8   8
N   21  115  148  22    9   9    8  32
NE  94  204  749  41   16  44   21  22
NW   8   27    9  20    6   7    6  20
S   30   13   37  23  333  50  150  25
SE  18    7   16   5   21  19   11   6
SW  12   23   23  23  127  24  315  76
W    8   20   23  38   21  10   60  85
****************
Accuracy= 48.21% // Model Accuracy=45%
Average precision = 35.13% // Model precision=28%
Average recall = 35.88% // Model recall=28%
Precision weighted= 52.50% //Model weighted=56%
Recall weighted = 48.21% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.147321  0.217105  0.175532
N       0.275779  0.315934  0.294494
NE      0.711977  0.628883  0.667856
NW      0.107527  0.194175  0.138408
S       0.611009  0.503782  0.552239
SE      0.102703  0.184466  0.131944
SW      0.544041  0.505618  0.52

**K-fold cross-validation**

In [10]:
# Metrics computed on each of the 5 cross-validation folds for the random forest.
scoring = ['precision_macro', 'recall_macro','f1_macro',"accuracy","precision_weighted","recall_weighted","f1_weighted"]
scores = cross_validate(
    clas[1],
    x_data,
    y_data["label"],
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# (format template, key in `scores`) pairs, in the order they are printed.
# Each line shows the mean over folds and twice the standard deviation.
report_lines = [
    ("Accuracy: {:.2%} (+/- {:.2%})", "test_accuracy"),
    ("Recall: {:.2%} (+/- {:.2%})", "test_recall_macro"),
    ("Precision: {:.2%} (+/- {:.2%})", "test_precision_macro"),
    ("f1 : {:.2%} (+/-{:.2%} )", "test_f1_macro"),
    ("Recall weighted: {:.2%} (+/- {:.2%})", "test_recall_weighted"),
    ("Precision weighted: {:.2%} (+/- {:.2%})", "test_precision_weighted"),
    ("f1 weighted: {:.2%} (+/-{:.2%} )", "test_f1_weighted"),
]
for template, key in report_lines:
    fold_scores = scores[key]
    print(template.format(fold_scores.mean(), fold_scores.std() * 2))

Accuracy: 52.66% (+/- 3.93%)
Recall: 31.44% (+/- 5.58%)
Precision: 34.81% (+/- 8.22%)
f1 : 30.98% (+/-6.38% )
Recall weighted: 52.66% (+/- 3.93%)
Precision weighted: 47.79% (+/- 5.04%)
f1 weighted: 48.60% (+/-5.13% )


**Filtering by wind speed threshold**

In [0]:
# Observed wind speed for the same folder as the direction data.
y_spd=pd.read_excel(dr[0]+"y_coron_spd.xlsx",index_col=0)
y_spd=y_spd[y_spd>0]/3.6# units=Km/h to m/s
y_spd=y_spd[y_spd>2]#threshold 2m/s
# NOTE(review): index=str casts the index labels to strings; presumably this is
# needed to match x_data's index type in the join below -- confirm.
y_spd=y_spd.rename(index=str, columns={"value": "spd"})
# Keep only rows that have predictors, a speed above the threshold and a label.
res1= x_data.join(y_spd, how='outer').dropna()
res2=res1.join(y_data, how='outer').dropna()
x_data=res2.iloc[:,0:9]
# Select the label by name rather than by position: the positional slice
# 11:12 only hits "label" when y_data still carries its "value" column, so
# re-running this cell (y_data then holds "label" only) silently produced an
# empty frame.
y_data=res2[["label"]]
y_data.describe()

In [21]:
# Candidate estimators for the speed-filtered data; only the random forest at
# position 1 is fitted below.
clas = [
    DummyClassifier(),
    RandomForestClassifier(n_estimators=300, max_depth=40, bootstrap=True),
    ExtraTreesClassifier(),
    AdaBoostClassifier(),
]

# Hold out 40% of the rows for testing (fixed seed for the split).
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data["label"], test_size=0.4, random_state=5
)

# Fit on the training part, predict the held-out part, print the report.
clas[1].fit(x_train, y_train)
y_pred = clas[1].predict(x_test)
evaluate(y_test, y_pred)

    E   N   NE  NW    S  SE   SW   W
E   0   4   13   0    4   1    2   1
N   0  62  211   3    1   0    7   9
NE  0  80  823   3    8   0   13  11
NW  0  19   15   5    2   0    2  15
S   1   5   13   0  326   3  168  20
SE  0   1    3   0   10   1    4   1
SW  0  13   27   2  114   0  351  39
W   0   8   16   7    9   0   54  78
****************
Accuracy= 63.60% // Model Accuracy=45%
Average precision = 40.34% // Model precision=28%
Average recall = 36.62% // Model recall=28%
Precision weighted= 60.52% //Model weighted=56%
Recall weighted = 63.60% //Model weighted=53%
****************
       Precision    Recall        F1
W_DIR                               
E       0.000000  0.000000  0.000000
N       0.322917  0.211604  0.255670
NE      0.734166  0.877399  0.799417
NW      0.250000  0.086207  0.128205
S       0.687764  0.608209  0.645545
SE      0.200000  0.050000  0.080000
SW      0.584027  0.642857  0.612031
W       0.448276  0.453488  0.450867


**K Folds**

In [22]:
# Metrics computed on each of the 5 cross-validation folds for the random forest.
scoring = ['precision_macro', 'recall_macro','f1_macro',"accuracy","precision_weighted","recall_weighted","f1_weighted"]
scores = cross_validate(
    clas[1],
    x_data,
    y_data["label"],
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# (format template, key in `scores`) pairs, in the order they are printed.
# Each line shows the mean over folds and twice the standard deviation.
report_lines = [
    ("Accuracy: {:.2%} (+/- {:.2%})", "test_accuracy"),
    ("Recall: {:.2%} (+/- {:.2%})", "test_recall_macro"),
    ("Precision: {:.2%} (+/- {:.2%})", "test_precision_macro"),
    ("f1 : {:.2%} (+/-{:.2%} )", "test_f1_macro"),
    ("Recall weighted: {:.2%} (+/- {:.2%})", "test_recall_weighted"),
    ("Precision weighted: {:.2%} (+/- {:.2%})", "test_precision_weighted"),
    ("f1 weighted: {:.2%} (+/-{:.2%} )", "test_f1_weighted"),
]
for template, key in report_lines:
    fold_scores = scores[key]
    print(template.format(fold_scores.mean(), fold_scores.std() * 2))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 62.31% (+/- 1.40%)
Recall: 35.67% (+/- 5.35%)
Precision: 37.47% (+/- 4.77%)
f1 : 35.42% (+/-5.02% )
Recall weighted: 62.31% (+/- 1.40%)
Precision weighted: 59.30% (+/- 3.24%)
f1 weighted: 59.64% (+/-2.93% )


**Tuning RandomForest**

In [0]:
# Pass the label column as a 1-D Series: handing the one-column DataFrame to
# the estimator makes scikit-learn warn that a column-vector y was passed
# where a 1-D array was expected.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data["label"], test_size=0.25,
                                                    random_state=27)
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
# NOTE(review): 'auto' is reported to be an alias of 'sqrt' for classifiers and
# to have been removed in recent scikit-learn releases -- confirm against the
# installed version before re-running.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 20 sampled combinations (n_iter), and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                               n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(x_train, y_train)
print("best parameters",rf_random.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


best parameters {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}


**K folds**

In [0]:
# Random forest configured with the hyperparameters reported by the random search.
clas = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features='sqrt',
    max_depth=50,
    bootstrap=True,
)
# Metrics computed on each of the 5 cross-validation folds.
scoring = ['precision_macro', 'recall_macro','f1_macro',"accuracy","precision_weighted","recall_weighted","f1_weighted"]
scores = cross_validate(
    clas,
    x_data,
    y_data["label"],
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# (format template, key in `scores`) pairs, in the order they are printed.
# Each line shows the mean over folds and twice the standard deviation.
report_lines = [
    ("Accuracy: {:.2%} (+/- {:.2%})", "test_accuracy"),
    ("Recall: {:.2%} (+/- {:.2%})", "test_recall_macro"),
    ("Precision: {:.2%} (+/- {:.2%})", "test_precision_macro"),
    ("f1 : {:.2%} (+/-{:.2%} )", "test_f1_macro"),
    ("Recall weighted: {:.2%} (+/- {:.2%})", "test_recall_weighted"),
    ("Precision weighted: {:.2%} (+/- {:.2%})", "test_precision_weighted"),
    ("f1 weighted: {:.2%} (+/-{:.2%} )", "test_f1_weighted"),
]
for template, key in report_lines:
    fold_scores = scores[key]
    print(template.format(fold_scores.mean(), fold_scores.std() * 2))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 66.00% (+/- 11.94%)
Recall: 28.28% (+/- 9.98%)
Precision: 30.63% (+/- 20.64%)
f1 : 27.84% (+/-12.23% )
Recall weighted: 66.00% (+/- 11.94%)
Precision weighted: 60.83% (+/- 18.36%)
f1 weighted: 62.28% (+/-14.08% )


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
