In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
from multiprocessing import Pool

### Functions Used

In [2]:
def model_info(x, y, test_date, data_tested, num):
    cm = confusion_matrix(x,y)
    acc = accuracy_score(x,y)
    cm = confusion_matrix(x,y)
    if len(cm) > 1:
        hit = cm[1][1]
        false_alarms = cm[0][1]
        misses = cm[1][0]
        no_events = cm[0][0]
        csi = hit / (false_alarms + misses + hit)
        hit_rate = hit / (hit + misses + false_alarms)
        false_alarm_rate = false_alarms / (hit + false_alarms)
    else:
        hit = 0
        false_alarms = 0
        misses = 0
        no_events = cm[0][0]
        csi = 0
        hit_rate =0
        false_alarm_rate = 0
    data = [{"Accuracy":acc,"Hits":hit, "False Alarms":false_alarms, 'Misses':misses,'No Events': no_events, 'CSI':csi, 'Hit Rate':hit_rate, 'False Alarm Rate':false_alarm_rate}]
    df = pd.DataFrame(data)
    df['test_date'] = test_date
    df['data_tested'] = data_tested
    if len(x.value_counts()) == 1:
        l30 = x.value_counts()[0]
        df['greter_than_30_knots'] = '0'
        df['less_than_30_knots'] = l30
    else:
        g30 = x.value_counts()[1]
        l30 = x.value_counts()[0]
        df['greter_than_30_knots'] = g30
        df['less_than_30_knots'] = l30
    df['sample_number'] = num
    return (df)

### Load data

In [3]:
df = pd.read_csv("15min_dateset.csv")
df.drop('Unnamed: 0', inplace = True, axis = 1)
print(len(df['EXP'].unique()))
df.head(2)

32


Unnamed: 0,Date,Type,from,EXP,jul_date,hour_angle,max_wind_speed,avg_wind_speed,R1,R2,...,HYBL_one,HYBL_two,HYBL_three,HYBL_four,HYBL_five,diff_temp,geo_cbl,veg_sfc,best_4_layer,ws_700
0,2017-01-07 06:00:00,ANDE,MESONET,1/7/17,7,-1.570795,2.332608,1.295893,2.630564,2.322274,...,0.62923,0.903721,3.610055,5.01167,6.054984,0.739,-5000.0,40.293,23.757,16.787335
1,2017-01-07 06:15:00,ANDE,MESONET,1/7/17,7,-1.505345,1.555072,0.647947,2.055383,1.813005,...,0.748575,1.493288,3.643217,5.232982,6.394255,1.05,-5000.0,40.293,23.959,16.195689


In [4]:
df["max_th"] = np.where(df["max_wind_speed"] > 30, 1, 0 )
table = df["max_th"].value_counts()
total = table[1] + table[0]
print(table)
print('% greater than 30:', round(table[1]/total,3))
print('% less than 30:', round(table[0]/total,3))

0    69019
1     3630
Name: max_th, dtype: int64
% greater than 30: 0.05
% less than 30: 0.95


In [5]:
df2 = df[['Type', 'from', 'EXP',
       'max_wind_speed', 'R1', 'R2',
       'Pressure_reduced_to_MSL_.Pa..0.MSL',
       'Derived_radar_reflectivity_.dB..1.HYBL',
       'u.component_of_wind_.m.s..85000.ISBL',
       'v.component_of_wind_.m.s..85000.ISBL', 'wind_speed_85000',
       'wind_shear_85000', 'Wind_speed_.gust._.m.s..0.SFC', 'HYBL_one',
       'HYBL_two', 'HYBL_three', 'HYBL_four', 'HYBL_five', 'diff_temp',
       'geo_cbl', 'veg_sfc', 'best_4_layer', 'ws_700', 'max_th']]
le = LabelEncoder()
df2['from'] = le.fit_transform(df2['from'])
df2.head(2)

Unnamed: 0,Type,from,EXP,max_wind_speed,R1,R2,Pressure_reduced_to_MSL_.Pa..0.MSL,Derived_radar_reflectivity_.dB..1.HYBL,u.component_of_wind_.m.s..85000.ISBL,v.component_of_wind_.m.s..85000.ISBL,...,HYBL_two,HYBL_three,HYBL_four,HYBL_five,diff_temp,geo_cbl,veg_sfc,best_4_layer,ws_700,max_th
0,ANDE,1,1/7/17,2.332608,2.630564,2.322274,102879.7,0.0,5.447,-1.457,...,0.903721,3.610055,5.01167,6.054984,0.739,-5000.0,40.293,23.757,16.787335,0
1,ANDE,1,1/7/17,1.555072,2.055383,1.813005,102951.0,0.0,5.629,-1.464,...,1.493288,3.643217,5.232982,6.394255,1.05,-5000.0,40.293,23.959,16.195689,0


### Testing different samples of majority

In [6]:
def rf_model2(x,num):
    test_1 = df2[df2.EXP == x]
    train_1 = df2[df2.EXP != x]
    train_1 = train_1[['Type', 'from', 'R1', 'R2',
       'Pressure_reduced_to_MSL_.Pa..0.MSL',
       'Derived_radar_reflectivity_.dB..1.HYBL',
       'u.component_of_wind_.m.s..85000.ISBL',
       'v.component_of_wind_.m.s..85000.ISBL', 'wind_speed_85000',
       'wind_shear_85000', 'Wind_speed_.gust._.m.s..0.SFC', 'HYBL_one',
       'HYBL_two', 'HYBL_three', 'HYBL_four', 'HYBL_five', 'diff_temp',
       'geo_cbl', 'veg_sfc', 'best_4_layer', 'ws_700','max_th','max_wind_speed']]


    test_1 = test_1[['Type', 'from', 'R1', 'R2',
       'Pressure_reduced_to_MSL_.Pa..0.MSL',
       'Derived_radar_reflectivity_.dB..1.HYBL',
       'u.component_of_wind_.m.s..85000.ISBL',
       'v.component_of_wind_.m.s..85000.ISBL', 'wind_speed_85000',
       'wind_shear_85000', 'Wind_speed_.gust._.m.s..0.SFC', 'HYBL_one',
       'HYBL_two', 'HYBL_three', 'HYBL_four', 'HYBL_five', 'diff_temp',
       'geo_cbl', 'veg_sfc', 'best_4_layer', 'ws_700','max_th','max_wind_speed']]
    
    df_0 = train_1[train_1['max_th'] == 0]
    df_1 = train_1[train_1['max_th'] == 1]
    df_under = df_0.sample(num ,random_state=19)
    df_new = pd.concat([df_under, df_1], axis = 0)
    
    
    x_train = df_new.drop('max_th', 1)
    x_train.drop(['Type','max_wind_speed'], axis=1, inplace=True)
    y_train = df_new.max_th
    x_test = test_1.drop('max_th', 1)
    x_test_copy = x_test.copy()
    x_test.drop(['Type','max_wind_speed'], axis=1, inplace=True)
    y_test = test_1.max_th
    
    model = RandomForestClassifier(n_estimators=250, bootstrap= False, max_depth = 3, max_features= 'sqrt', min_samples_split = 3,
                               random_state=19)
    model.fit(x_train, y_train)
    
    pred_train = model.predict(x_train)
    train_info = model_info(y_train, pred_train, x , "train", num)

    pred_test = model.predict(x_test)
    test_info = model_info(y_test, pred_test, x , "test",num)

    table = train_info.append(test_info)
    return(table)

In [7]:
num_list = [4000,6000,8000,10000,12000,14000,16000,18000,20000,22000,24000,26000,28000,30000]

In [8]:
all_table = pd.DataFrame()
for num in num_list:
    table = pd.DataFrame()
    for i in df['EXP'].unique():
        t = rf_model2(i,num)
        table = table.append(t)
    print(num)
    print('=='* 40)
    all_table = all_table.append(table)

4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000


In [9]:
all_table

Unnamed: 0,Accuracy,Hits,False Alarms,Misses,No Events,CSI,Hit Rate,False Alarm Rate,test_date,data_tested,greter_than_30_knots,less_than_30_knots,sample_number
0,0.861861,3426,850,204,3150,0.764732,0.764732,0.198784,1/7/17,train,3630,4000,4000
0,0.979581,0,40,0,1919,0.000000,0.000000,1.000000,1/7/17,test,0,1959,4000
0,0.859047,3364,855,213,3145,0.759025,0.759025,0.202655,2/9/17,train,3577,4000,4000
0,0.775352,53,463,0,1545,0.102713,0.102713,0.897287,2/9/17,test,53,2008,4000
0,0.865990,2863,724,226,3276,0.750852,0.750852,0.201840,2/13/17,train,3089,4000,4000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.913669,1,2,178,1904,0.005525,0.005525,0.666667,12/17/18,test,179,1906,30000
0,0.909819,477,39,2978,29961,0.136520,0.136520,0.075581,12/22/18,train,3455,30000,30000
0,0.922598,1,0,174,2073,0.005714,0.005714,0.000000,12/22/18,test,175,2073,30000
0,0.912340,501,69,2855,29931,0.146277,0.146277,0.121053,2/24/19,train,3356,30000,30000


In [10]:
#all_table.to_csv('sample_testing_2.csv')

In [18]:
stats_table = all_table.groupby(['data_tested', 'sample_number']).agg({'Accuracy':['mean', 'std'], 'CSI':['mean', 'std'],
                                                        'Hit Rate':['mean', 'std'], 
                                                         'False Alarm Rate':['mean', 'std']})

In [19]:
#stats_table.to_csv('by_sample.csv')

In [33]:
all_table.groupby(['data_tested','test_date']).agg({'Accuracy':['mean', 'std'],'Specificity':['mean', 'std'],
                                                 'Sensitivity':['mean', 'std']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Accuracy,Specificity,Specificity,Sensitivity,Sensitivity
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
data_tested,test_date,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
test,1/4/18,0.909683,0.011263,0.991591,0.019478,0.156673,0.273147
test,1/7/17,0.998031,0.005497,0.998031,0.005497,0.000000,0.000000
test,10/12/18,0.950382,0.038555,0.966701,0.043221,0.146032,0.194260
test,10/2/18,0.998451,0.000386,0.999897,0.000387,0.000000,0.000000
test,10/27/18,0.900779,0.099283,0.938526,0.119228,0.201058,0.281186
...,...,...,...,...,...,...,...
train,8/17/18,0.882604,0.013681,0.947763,0.068582,0.477623,0.296435
train,9/10/18,0.882420,0.015079,0.946722,0.070938,0.480552,0.294264
train,9/26/18,0.882919,0.013631,0.947618,0.068755,0.479850,0.297374
train,9/4/18,0.882653,0.012751,0.948904,0.065841,0.475364,0.296414
