In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
from multiprocessing import Pool

### Function used

In [2]:
def model_info(x, y, test_date, data_tested):
    """Summarise binary-classification performance as a one-row DataFrame.

    Parameters
    ----------
    x : array-like of 0/1
        True labels (passed as ``y_true`` to the scikit-learn metrics).
    y : array-like of 0/1
        Predicted labels (passed as ``y_pred`` to the scikit-learn metrics).
    test_date : object
        Identifier of the held-out experiment; stored in ``test_date``.
    data_tested : str
        Tag for the evaluated split (callers pass "train" or "test");
        stored in ``data_tested``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``Accuracy``, ``Specificity`` (recall of
        class 0), ``Sensitivity`` (recall of class 1), ``test_date``,
        ``data_tested``, ``greter_than_30`` (number of true 1s) and
        ``less_than_30`` (number of true 0s).
    """
    acc = accuracy_score(x, y)
    rec = recall_score(x, y)
    rec_0 = recall_score(x, y, pos_label=0)

    summary = pd.DataFrame([{"Accuracy": acc, "Specificity": rec_0, "Sensitivity": rec}])
    summary['test_date'] = test_date
    summary['data_tested'] = data_tested

    # Count each class directly instead of indexing value_counts(): a class
    # that is absent then yields an integer 0 rather than a KeyError (or the
    # string '0'), so both count columns stay numeric for every fold.
    truth = np.asarray(x)
    # Column name keeps its historical spelling so existing consumers still work.
    summary['greter_than_30'] = int((truth == 1).sum())
    summary['less_than_30'] = int((truth == 0).sum())
    return summary

### Load Data

In [3]:
# Read the dataset and discard the 'Unnamed: 0' column in one step.
df = pd.read_csv("15min_dateset.csv").drop(columns=['Unnamed: 0'])
# Number of distinct experiments (EXP values); each one becomes a held-out fold later.
print(len(pd.unique(df['EXP'])))
df.head(2)

32


Unnamed: 0,Date,Type,from,EXP,jul_date,hour_angle,max_wind_speed,avg_wind_speed,R1,R2,...,HYBL_one,HYBL_two,HYBL_three,HYBL_four,HYBL_five,diff_temp,geo_cbl,veg_sfc,best_4_layer,ws_700
0,2017-01-07 06:00:00,ANDE,MESONET,1/7/17,7,-1.570795,2.332608,1.295893,2.630564,2.322274,...,0.62923,0.903721,3.610055,5.01167,6.054984,0.739,-5000.0,40.293,23.757,16.787335
1,2017-01-07 06:15:00,ANDE,MESONET,1/7/17,7,-1.505345,1.555072,0.647947,2.055383,1.813005,...,0.748575,1.493288,3.643217,5.232982,6.394255,1.05,-5000.0,40.293,23.959,16.195689


### Create a Column for Classification

In [4]:
# Binary target: 1 where max_wind_speed exceeds 30, otherwise 0.
df["max_th"] = np.where(df["max_wind_speed"].gt(30), 1, 0)
# Class balance of the target, as raw counts and as fractions of all rows.
table = df["max_th"].value_counts()
total = table.loc[0] + table.loc[1]
print(table)
print('% greater than 30:', round(table.loc[1] / total, 3))
print('% less than 30:', round(table.loc[0] / total, 3))

0    69019
1     3630
Name: max_th, dtype: int64
% greater than 30: 0.05
% less than 30: 0.95


In [5]:
# Replace the text values of the 'from' column with integer codes.
# rf_model / rf_model2 later treat code 1 as 'Mesonet' and anything else as 'ASOS'.
le = LabelEncoder()
le.fit(df['from'])
df['from'] = le.transform(df['from'])
df.head(2)

Unnamed: 0,Date,Type,from,EXP,jul_date,hour_angle,max_wind_speed,avg_wind_speed,R1,R2,...,HYBL_two,HYBL_three,HYBL_four,HYBL_five,diff_temp,geo_cbl,veg_sfc,best_4_layer,ws_700,max_th
0,2017-01-07 06:00:00,ANDE,1,1/7/17,7,-1.570795,2.332608,1.295893,2.630564,2.322274,...,0.903721,3.610055,5.01167,6.054984,0.739,-5000.0,40.293,23.757,16.787335,0
1,2017-01-07 06:15:00,ANDE,1,1/7/17,7,-1.505345,1.555072,0.647947,2.055383,1.813005,...,1.493288,3.643217,5.232982,6.394255,1.05,-5000.0,40.293,23.959,16.195689,0


### Grid search best parameters 

In [25]:
#X = df.drop(['max_wind_speed', 'Type', 'EXP','max_th','jul_date','hour_angle','avg_wind_speed'], axis=1)
#Y = df.max_th.values

In [31]:
#pram = {'max_depth' : [3,5,7,9],'n_estimators':[250 ,400,500], 'bootstrap' :[True, False],'min_samples_split' : [2, 5, 7]}

In [35]:
#clf = GridSearchCV(RandomForestClassifier(random_state=19, max_features = 'sqrt'), pram, cv=3,scoring='accuracy', n_jobs=-1, verbose=3)
#clf.fit(X,Y)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 26.9min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='sqrt',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=19,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'bo

In [44]:
#clf.best_score_, clf.best_params_,clf.scorer_

(0.904293245605583,
 {'bootstrap': False,
  'max_depth': 3,
  'min_samples_split': 2,
  'n_estimators': 250},
 make_scorer(accuracy_score))

### Random Forest Model without resampling

In [8]:
def rf_model(x):
    """Fit a random forest with experiment `x` held out and evaluate on it.

    Reads the notebook-level DataFrame ``df``. Rows whose ``EXP`` equals `x`
    form the test fold; all other rows form the training fold (no resampling).

    Parameters
    ----------
    x : object
        Value of the ``EXP`` column identifying the held-out experiment.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``table``: two rows of metrics from ``model_info`` (train, then test).
        ``df6``: one row per test sample with the test columns (minus
        ``max_th`` and ``from``), the true label (``actual``), the class
        probabilities (``less_30``, ``greater_30``), the predicted label
        (``pred``), the data source (``From``) and ``test_date``.
    """
    # Dropped from both folds before anything else.
    fold_drop = ['EXP', 'jul_date', 'hour_angle', 'avg_wind_speed']
    # Kept in the per-row output but never given to the model.
    non_features = ['Date', 'Type', 'max_wind_speed']

    test_1 = df[df.EXP == x].drop(columns=fold_drop)
    train_1 = df[df.EXP != x].drop(columns=fold_drop)

    x_train = train_1.drop(columns=['max_th'] + non_features)
    y_train = train_1.max_th
    x_test_copy = test_1.drop(columns=['max_th'])
    x_test = x_test_copy.drop(columns=non_features)
    y_test = test_1.max_th

    # NOTE(review): the grid search earlier in this notebook reported
    # min_samples_split=2 as best; 3 is used here -- confirm this is intentional.
    model = RandomForestClassifier(n_estimators=250, bootstrap=False, max_depth=3,
                                   max_features='sqrt', min_samples_split=3,
                                   random_state=19)
    model.fit(x_train, y_train)

    pred_train = model.predict(x_train)
    train_info = model_info(y_train, pred_train, x, "train")

    pred_test = model.predict(x_test)
    test_info = model_info(y_test, pred_test, x, "test")

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    table = pd.concat([train_info, test_info])

    print(x)
    print(table)
    print("--" * 40)

    prod = model.predict_proba(x_test)
    prod_data = pd.DataFrame(prod, columns=["less_30", "greater_30"])
    pred_df = pd.DataFrame(pred_test, columns=["pred"])
    # Reset the row index so every piece lines up positionally in the concat.
    actual = y_test.rename('actual').reset_index(drop=True)
    df6 = pd.concat([x_test_copy.reset_index(drop=True), actual, prod_data, pred_df], axis=1)

    # Code 1 in 'from' is labelled 'Mesonet'; every other code is labelled 'ASOS'.
    df6['From'] = np.where(df6['from'] == 1, 'Mesonet', 'ASOS')
    df6 = df6.drop(columns=['from'])
    df6['test_date'] = x
    return (table, df6)


In [9]:
%%time
# Leave-one-experiment-out evaluation without resampling.
# Per-fold results are collected in lists and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies all accumulated rows on every iteration.
fold_tables = []
fold_data = []
for i in df['EXP'].unique():
    t, d = rf_model(i)
    fold_tables.append(t)
    fold_data.append(d)

# pd.concat raises on an empty list, so fall back to an empty frame as before.
table = pd.concat(fold_tables) if fold_tables else pd.DataFrame()
data = pd.concat(fold_data) if fold_data else pd.DataFrame()

1/7/17
   Accuracy  Specificity  Sensitivity test_date data_tested greter_than_30  \
0  0.948649          1.0          0.0    1/7/17       train           3630   
0  1.000000          1.0          0.0    1/7/17        test              0   

   less_than_30  
0         67060  
0          1959  
--------------------------------------------------------------------------------
2/9/17
   Accuracy  Specificity  Sensitivity test_date data_tested  greter_than_30  \
0  0.949326          1.0          0.0    2/9/17       train            3577   
0  0.974284          1.0          0.0    2/9/17        test              53   

   less_than_30  
0         67011  
0          2008  
--------------------------------------------------------------------------------
2/13/17
   Accuracy  Specificity  Sensitivity test_date data_tested  greter_than_30  \
0  0.956303          1.0          0.0   2/13/17       train            3089   
0  0.723556          1.0          0.0   2/13/17        test             541  

9/4/18
   Accuracy  Specificity  Sensitivity test_date data_tested greter_than_30  \
0  0.948522          1.0          0.0    9/4/18       train           3630   
0  1.000000          1.0          0.0    9/4/18        test              0   

   less_than_30  
0         66886  
0          2133  
--------------------------------------------------------------------------------
9/6/18
   Accuracy  Specificity  Sensitivity test_date data_tested  greter_than_30  \
0  0.948533          1.0          0.0    9/6/18       train            3624   
0  0.997315          1.0          0.0    9/6/18        test               6   

   less_than_30  
0         66790  
0          2229  
--------------------------------------------------------------------------------
9/10/18
   Accuracy  Specificity  Sensitivity test_date data_tested  greter_than_30  \
0  0.948499          1.0          0.0   9/10/18       train            3625   
0  0.997790          1.0          0.0   9/10/18        test               5  

In [10]:
# First rows of the per-fold metrics table (one train row and one test row per held-out experiment).
table.head()

Unnamed: 0,Accuracy,Specificity,Sensitivity,test_date,data_tested,greter_than_30,less_than_30
0,0.948649,1.0,0.0,1/7/17,train,3630,67060
0,1.0,1.0,0.0,1/7/17,test,0,1959
0,0.949326,1.0,0.0,2/9/17,train,3577,67011
0,0.974284,1.0,0.0,2/9/17,test,53,2008
0,0.956303,1.0,0.0,2/13/17,train,3089,67603


In [11]:
# Last rows of the per-sample predictions collected across all held-out experiments.
data.tail()

Unnamed: 0,Date,Type,max_wind_speed,R1,R2,Pressure_reduced_to_MSL_.Pa..0.MSL,Derived_radar_reflectivity_.dB..1.HYBL,u.component_of_wind_.m.s..85000.ISBL,v.component_of_wind_.m.s..85000.ISBL,wind_speed_85000,...,geo_cbl,veg_sfc,best_4_layer,ws_700,actual,less_30,greater_30,pred,From,test_date
2331,2019-02-25 05:00:00,WANT,37.90488,0.09892,0.157647,100027.32,0.0,32.948,-5.489,33.402093,...,-5000.0,40.175,21.381,17.64008,1,0.841885,0.158115,0,Mesonet,2/24/19
2332,2019-02-25 05:15:00,WANT,46.457776,0.098133,0.156265,100043.773,0.0,33.102,-5.86,33.616692,...,-5000.0,40.175,21.517,19.861816,1,0.841885,0.158115,0,Mesonet,2/24/19
2333,2019-02-25 05:30:00,WANT,37.710496,0.098374,0.154478,100029.609,0.0,33.439,-5.925,33.959864,...,-5000.0,40.175,21.8,22.21766,1,0.841295,0.158705,0,Mesonet,2/24/19
2334,2019-02-25 05:45:00,WANT,39.071184,0.097222,0.15372,100029.547,0.0,33.765,-5.858,34.269394,...,-5000.0,40.175,21.817,22.936901,1,0.838933,0.161067,0,Mesonet,2/24/19
2335,2019-02-25 06:00:00,WANT,36.349808,0.094813,0.153293,100049.734,0.0,33.951,-5.796,34.442184,...,-5000.0,40.175,21.817,23.463382,1,0.838933,0.161067,0,Mesonet,2/24/19


In [12]:
#table.to_csv('RF_CLASS_greater_30_no_sample_stats.csv')

In [13]:
#data.to_csv('RF_CLASS_greater_30_no_sample_data.csv')

### Random Forest Model with undersampling

In [14]:
def rf_model2(x, n_majority=9000):
    """Fit a random forest on an undersampled training fold, holding out `x`.

    Reads the notebook-level DataFrame ``df``. Rows whose ``EXP`` equals `x`
    form the test fold. From the remaining rows, all class-1 rows are kept
    and class-0 rows are randomly undersampled before training. The test
    fold is not resampled.

    Parameters
    ----------
    x : object
        Value of the ``EXP`` column identifying the held-out experiment.
    n_majority : int, default 9000
        Number of class-0 training rows to keep. Capped at the number of
        class-0 rows available, so a small training fold no longer raises.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``table``: two rows of metrics from ``model_info`` (train, then test).
        ``df6``: one row per test sample with the test columns (minus
        ``max_th`` and ``from``), the true label (``actual``), the class
        probabilities (``less_30``, ``greater_30``), the predicted label
        (``pred``), the data source (``From``) and ``test_date``.
    """
    # Dropped from both folds before anything else.
    fold_drop = ['EXP', 'jul_date', 'hour_angle', 'avg_wind_speed']
    # Kept in the per-row output but never given to the model.
    non_features = ['Date', 'Type', 'max_wind_speed']

    test_1 = df[df.EXP == x].drop(columns=fold_drop)
    train_1 = df[df.EXP != x].drop(columns=fold_drop)

    # Undersample class 0; the fixed seed keeps the drawn subset reproducible.
    df_0 = train_1[train_1['max_th'] == 0]
    df_1 = train_1[train_1['max_th'] == 1]
    df_under = df_0.sample(min(n_majority, len(df_0)), random_state=19)
    df_new = pd.concat([df_under, df_1], axis=0)

    x_train = df_new.drop(columns=['max_th'] + non_features)
    y_train = df_new.max_th
    x_test_copy = test_1.drop(columns=['max_th'])
    x_test = x_test_copy.drop(columns=non_features)
    y_test = test_1.max_th

    # NOTE(review): the grid search earlier in this notebook reported
    # min_samples_split=2 as best; 3 is used here -- confirm this is intentional.
    model = RandomForestClassifier(n_estimators=250, bootstrap=False, max_depth=3,
                                   max_features='sqrt', min_samples_split=3,
                                   random_state=19)
    model.fit(x_train, y_train)

    pred_train = model.predict(x_train)
    train_info = model_info(y_train, pred_train, x, "train")

    pred_test = model.predict(x_test)
    test_info = model_info(y_test, pred_test, x, "test")

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    table = pd.concat([train_info, test_info])

    print(x)
    print(table)
    print("--" * 40)

    prod = model.predict_proba(x_test)
    prod_data = pd.DataFrame(prod, columns=["less_30", "greater_30"])
    pred_df = pd.DataFrame(pred_test, columns=["pred"])
    # Reset the row index so every piece lines up positionally in the concat.
    actual = y_test.rename('actual').reset_index(drop=True)
    df6 = pd.concat([x_test_copy.reset_index(drop=True), actual, prod_data, pred_df], axis=1)

    # Code 1 in 'from' is labelled 'Mesonet'; every other code is labelled 'ASOS'.
    df6['From'] = np.where(df6['from'] == 1, 'Mesonet', 'ASOS')
    df6 = df6.drop(columns=['from'])
    df6['test_date'] = x
    return (table, df6)

In [15]:
%%time
# Leave-one-experiment-out evaluation with the undersampled training folds.
# Per-fold results are collected in lists and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies all accumulated rows on every iteration.
fold_tables2 = []
fold_data2 = []
for i in df['EXP'].unique():
    t, d = rf_model2(i)
    fold_tables2.append(t)
    fold_data2.append(d)

# pd.concat raises on an empty list, so fall back to an empty frame as before.
table2 = pd.concat(fold_tables2) if fold_tables2 else pd.DataFrame()
data2 = pd.concat(fold_data2) if fold_data2 else pd.DataFrame()

1/7/17
   Accuracy  Specificity  Sensitivity test_date data_tested greter_than_30  \
0  0.868250     0.898667     0.792837    1/7/17       train           3630   
0  0.997958     0.997958     0.000000    1/7/17        test              0   

   less_than_30  
0          9000  
0          1959  
--------------------------------------------------------------------------------
2/9/17
   Accuracy  Specificity  Sensitivity test_date data_tested  greter_than_30  \
0  0.872307     0.903556     0.793682    2/9/17       train            3577   
0  0.928190     0.936753     0.603774    2/9/17        test              53   

   less_than_30  
0          9000  
0          2008  
--------------------------------------------------------------------------------
2/13/17
   Accuracy  Specificity  Sensitivity test_date data_tested  greter_than_30  \
0  0.878650     0.919333     0.760117   2/13/17       train            3089   
0  0.654573     0.584040     0.839187   2/13/17        test             541  

9/4/18
   Accuracy  Specificity  Sensitivity test_date data_tested greter_than_30  \
0  0.874188     0.905333      0.79697    9/4/18       train           3630   
0  1.000000     1.000000      0.00000    9/4/18        test              0   

   less_than_30  
0          9000  
0          2133  
--------------------------------------------------------------------------------
9/6/18
   Accuracy  Specificity  Sensitivity test_date data_tested  greter_than_30  \
0  0.872861        0.901      0.80298    9/6/18       train            3624   
0  0.997315        1.000      0.00000    9/6/18        test               6   

   less_than_30  
0          9000  
0          2229  
--------------------------------------------------------------------------------
9/10/18
   Accuracy  Specificity  Sensitivity test_date data_tested  greter_than_30  \
0  0.869545     0.898444     0.797793   9/10/18       train            3625   
0  0.997790     1.000000     0.000000   9/10/18        test               5  

In [16]:
# Per-fold metrics for the undersampled model (one train row and one test row per held-out experiment).
table2

Unnamed: 0,Accuracy,Specificity,Sensitivity,test_date,data_tested,greter_than_30,less_than_30
0,0.868250,0.898667,0.792837,1/7/17,train,3630,9000
0,0.997958,0.997958,0.000000,1/7/17,test,0,1959
0,0.872307,0.903556,0.793682,2/9/17,train,3577,9000
0,0.928190,0.936753,0.603774,2/9/17,test,53,2008
0,0.878650,0.919333,0.760117,2/13/17,train,3089,9000
...,...,...,...,...,...,...,...
0,0.762590,0.796957,0.396648,12/17/18,test,179,1906
0,0.875632,0.903444,0.803184,12/22/18,train,3455,9000
0,0.775356,0.796430,0.525714,12/22/18,test,175,2073
0,0.873826,0.914778,0.764005,2/24/19,train,3356,9000


In [188]:
#table2.to_csv('RF_CLASS_greater_30_sample10000_stats.csv')

In [189]:
#data2.to_csv('RF_CLASS_greater_30_sample10000_data.csv')