In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

### Load Data

In [2]:
# Read the 15-minute dataset and discard the leftover index column written by a
# previous to_csv call, then preview the first two rows.
df = pd.read_csv("15min_dateset.csv").drop(columns='Unnamed: 0')
df.head(2)

Unnamed: 0,Date,Type,from,EXP,jul_date,hour_angle,max_wind_speed,avg_wind_speed,R1,R2,...,HYBL_one,HYBL_two,HYBL_three,HYBL_four,HYBL_five,diff_temp,geo_cbl,veg_sfc,best_4_layer,ws_700
0,2017-01-07 06:00:00,ANDE,MESONET,1/7/17,7,-1.570795,2.332608,1.295893,2.630564,2.322274,...,0.62923,0.903721,3.610055,5.01167,6.054984,0.739,-5000.0,40.293,23.757,16.787335
1,2017-01-07 06:15:00,ANDE,MESONET,1/7/17,7,-1.505345,1.555072,0.647947,2.055383,1.813005,...,0.748575,1.493288,3.643217,5.232982,6.394255,1.05,-5000.0,40.293,23.959,16.195689


In [3]:
#### Classification results
def model_info1(x, y, test_date, data_tested):
    """Summarise binary event/no-event classification skill in a one-row DataFrame.

    Parameters
    ----------
    x : array-like of {0, 1}
        Observed labels (1 = event, 0 = no event).
    y : array-like of {0, 1}
        Predicted labels, same length as ``x``.
    test_date : scalar
        Stored verbatim in the ``test_date`` column.
    data_tested : scalar
        Stored verbatim in the ``data_tested`` column.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``Accuracy``, ``Hits``, ``False Alarms``,
        ``Misses``, ``No Events``, ``CSI``, ``Hit Rate``, ``False Alarm Rate``,
        ``test_date``, ``data_tested``, ``greter_than_30_knots`` (number of
        observed 1s) and ``less_than_30_knots`` (number of observed 0s).
        Column names, including the historical spelling, are kept as-is
        because downstream tables are keyed on them.

    Notes
    -----
    Any score whose denominator is zero is reported as 0 rather than NaN.
    """
    acc = accuracy_score(x, y)

    # Pin the label order so the matrix is always 2x2, even when only one class
    # is present. Without `labels`, a single-class input yields a 1x1 matrix
    # whose only cell may be either "no events" or "hits".
    cm = confusion_matrix(x, y, labels=[0, 1])
    no_events, false_alarms = cm[0]
    misses, hit = cm[1]

    def _ratio(numerator, denominator):
        # An empty denominator means the score is undefined; report 0 so the
        # result stays numeric instead of silently turning into NaN.
        return numerator / denominator if denominator else 0

    csi = _ratio(hit, false_alarms + misses + hit)
    hit_rate = _ratio(hit, hit + misses)
    # Computed as false alarms over all predicted events (hits + false alarms).
    false_alarm_rate = _ratio(false_alarms, hit + false_alarms)

    summary = pd.DataFrame([{
        "Accuracy": acc,
        "Hits": hit,
        "False Alarms": false_alarms,
        'Misses': misses,
        'No Events': no_events,
        'CSI': csi,
        'Hit Rate': hit_rate,
        'False Alarm Rate': false_alarm_rate,
    }])
    summary['test_date'] = test_date
    summary['data_tested'] = data_tested

    # Count the observed classes directly so both counts are always integers
    # and stay correct whichever class happens to be absent.
    observed = np.asarray(x)
    summary['greter_than_30_knots'] = int(np.count_nonzero(observed == 1))
    summary['less_than_30_knots'] = int(np.count_nonzero(observed == 0))
    return summary



### Regression results
def model_info2(x, y,test_date,data_tested):
    """Build a one-row DataFrame of regression error metrics.

    ``x`` holds the observed values and ``y`` the predictions. The row contains
    the mean absolute error, the mean squared error and the mean of the
    residuals ``x - y`` rounded to four decimals, followed by the
    ``test_date`` and ``data_tested`` tags passed in by the caller.
    """
    metrics = {
        "MAE": mean_absolute_error(x, y),
        "MSE": mean_squared_error(x, y),
        # Signed mean error: positive when the observations exceed the predictions.
        'Residual Mean': round(np.array(x - y).mean(), 4),
    }
    report = pd.DataFrame([metrics])
    return report.assign(test_date=test_date, data_tested=data_tested)

In [4]:
# Replace the text values of the `from` column with integer codes. The fitted
# encoder is kept in `le` so the original labels remain available via
# `le.classes_`.
le = LabelEncoder().fit(df['from'])
df['from'] = le.transform(df['from'])
df.tail(2)

Unnamed: 0,Date,Type,from,EXP,jul_date,hour_angle,max_wind_speed,avg_wind_speed,R1,R2,...,HYBL_one,HYBL_two,HYBL_three,HYBL_four,HYBL_five,diff_temp,geo_cbl,veg_sfc,best_4_layer,ws_700
72647,2018-07-22 05:45:00,WBOU,1,7/21/18,203,-1.636245,17.105792,8.552896,0.279731,0.693772,...,9.63431,16.364893,18.456569,21.401767,24.263238,3.607,554.661,81.379,0.706,8.964475
72648,2018-07-22 06:00:00,WBOU,1,7/21/18,203,-1.570795,17.105792,10.043173,0.231301,0.605321,...,8.939975,16.514877,18.701148,22.068564,24.935163,3.349,555.042,81.379,0.171,7.752404


In [5]:
# Bucket the peak wind speed into three classes:
#   2 -> above 30, 0 -> below 15, 1 -> everything else (15 to 30 inclusive).
# The conditions are evaluated in order, so the first match wins.
df["max_th"] = np.select(
    [df["max_wind_speed"] > 30, df["max_wind_speed"] < 15],
    [2, 0],
    default=1,
)
df['max_th'].value_counts()

0    44574
1    24445
2     3630
Name: max_th, dtype: int64

In [6]:
def lin_model(x):
    """Fit a linear regression with one experiment held out and evaluate it.

    Reads the module-level ``df``. Rows whose ``EXP`` equals ``x`` form the
    test set; all other rows form the training pool, from which up to
    9000 / 9000 / 3000 rows are drawn for ``max_th`` classes 0 / 1 / 2
    (fixed ``random_state`` so the draw is repeatable). Features are min-max
    scaled using the training sample only, and ``max_wind_speed`` is the
    regression target.

    Parameters
    ----------
    x : scalar
        Value of the ``EXP`` column identifying the held-out experiment.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``table``  - regression metrics (train row, then test row);
        ``table2`` - classification metrics for the ``> 30`` exceedance
        (train row, then test row);
        ``df6``    - the test rows with ``actual``, ``Class_results``,
        ``pred``, ``From`` and ``test_date`` columns appended.

    Side effects
    ------------
    Prints ``x`` and both metric tables.
    """
    unused_cols = ['EXP', 'avg_wind_speed', 'jul_date', 'hour_angle']
    # Identifier / label columns that must not be fed to the model.
    non_feature_cols = ['Date', 'Type', 'max_th']

    test_1 = df[df.EXP == x].drop(columns=unused_cols)
    train_1 = df[df.EXP != x].drop(columns=unused_cols)

    # Down-sample each wind class. The request is capped at the class size so
    # that holding out an experiment rich in one class cannot make `sample`
    # raise for asking for more rows than exist.
    sampled = []
    for label, n_rows in ((0, 9000), (1, 9000), (2, 3000)):
        members = train_1[train_1['max_th'] == label]
        sampled.append(members.sample(min(n_rows, len(members)), random_state=19))
    df_new = pd.concat(sampled, axis=0)

    x_train = df_new.drop(columns=['max_wind_speed'] + non_feature_cols)
    y_train = df_new['max_wind_speed']

    # Keep an unscaled copy of the test rows (with Date/Type) for the output frame.
    x_test_copy = test_1.drop(columns='max_wind_speed')
    x_test = x_test_copy.drop(columns=non_feature_cols)
    y_test = test_1['max_wind_speed']

    # Fit the scaler on the training sample only, then apply it to the test rows.
    scale = MinMaxScaler()
    x_train = scale.fit_transform(x_train)
    x_test = scale.transform(x_test)

    model = LinearRegression()
    model.fit(x_train, y_train)

    pred_train = model.predict(x_train)
    train_info = model_info2(y_train, pred_train, x, "train")

    # Binarise observations and predictions at the 30 threshold for the
    # event / no-event verification.
    y_train30 = np.where(y_train > 30, 1, 0)
    pred_train30 = np.where(pred_train > 30, 1, 0)
    train_info2 = model_info1(y_train30, pred_train30, x, 'train')

    pred_test = model.predict(x_test)
    test_info = model_info2(y_test, pred_test, x, "test")

    y_test30 = np.where(y_test > 30, 1, 0)
    pred_test30 = np.where(pred_test > 30, 1, 0)
    test_info2 = model_info1(y_test30, pred_test30, x, 'test')

    # DataFrame.append no longer exists in pandas 2.x; concat keeps the same
    # row order and index labels.
    table = pd.concat([train_info, test_info])
    table2 = pd.concat([train_info2, test_info2])
    print(x)
    print(table)
    print("==" *10)
    print(table2)
    print('--' * 40)

    # Assemble the per-row output: test features, observed value, predicted
    # class label and predicted value, all aligned on a fresh 0..n-1 index.
    df6 = x_test_copy.reset_index(drop=True)
    df6['actual'] = y_test.to_numpy()
    df6['Class_results'] = np.where(pred_test30 == 1, 'Greater than 30 knots', 'Less than 30 knots')
    df6['pred'] = pred_test
    # Relies on the integer encoding applied to `from` before this function is
    # called: code 1 is reported as Mesonet, anything else as ASOS.
    df6['From'] = np.where(df6['from'] == 1, 'Mesonet', 'ASOS')
    df6 = df6.drop(columns=['from', 'max_th'])
    df6['test_date'] = x
    return (table, table2, df6)

In [7]:
# Hold out every experiment in turn and gather the three outputs of each fold.
# The frames are collected in lists and concatenated once: DataFrame.append was
# removed in pandas 2.x, and growing a frame inside a loop copies it every time.
regression_parts = []
classification_parts = []
prediction_parts = []
for exp_id in df['EXP'].unique():
    fold_regression, fold_classification, fold_predictions = lin_model(exp_id)
    regression_parts.append(fold_regression)
    classification_parts.append(fold_classification)
    prediction_parts.append(fold_predictions)

# pd.concat rejects an empty list, so fall back to an empty frame when there
# are no experiments to evaluate.
table = pd.concat(regression_parts) if regression_parts else pd.DataFrame()
table_class = pd.concat(classification_parts) if classification_parts else pd.DataFrame()
data = pd.concat(prediction_parts) if prediction_parts else pd.DataFrame()

1/7/17
        MAE        MSE  Residual Mean test_date data_tested
0  4.664122  36.310676        -0.0000    1/7/17       train
0  3.430424  18.494801        -2.6956    1/7/17        test
   Accuracy  Hits  False Alarms  Misses  No Events       CSI  Hit Rate  \
0   0.88619  1010           400    1990      17600  0.297059  0.336667   
0   1.00000     0             0       0       1959  0.000000  0.000000   

   False Alarm Rate test_date data_tested greter_than_30_knots  \
0          0.283688    1/7/17       train                 3000   
0          0.000000    1/7/17        test                    0   

   less_than_30_knots  
0               18000  
0                1959  
--------------------------------------------------------------------------------
2/9/17
        MAE        MSE  Residual Mean test_date data_tested
0  4.660134  36.200782         0.0000    2/9/17       train
0  4.249987  26.654195        -2.7796    2/9/17        test
   Accuracy  Hits  False Alarms  Misses  No Events 

12/9/17
        MAE        MSE  Residual Mean test_date data_tested
0  4.645368  36.019371        -0.0000   12/9/17       train
0  3.320654  15.494701        -2.8769   12/9/17        test
   Accuracy  Hits  False Alarms  Misses  No Events       CSI  Hit Rate  \
0  0.885429   998           404    2002      17596  0.293184  0.332667   
0  1.000000     0             0       0       2531  0.000000  0.000000   

   False Alarm Rate test_date data_tested greter_than_30_knots  \
0           0.28816   12/9/17       train                 3000   
0           0.00000   12/9/17        test                    0   

   less_than_30_knots  
0               18000  
0                2531  
--------------------------------------------------------------------------------
1/4/18
        MAE        MSE  Residual Mean test_date data_tested
0  4.593283  35.454738         0.0000    1/4/18       train
0  4.534424  35.177002         1.6379    1/4/18        test
   Accuracy  Hits  False Alarms  Misses  No Events

9/4/18
        MAE        MSE  Residual Mean test_date data_tested
0  4.660936  36.172434         0.0000    9/4/18       train
0  2.832485  13.636868        -1.3481    9/4/18        test
   Accuracy  Hits  False Alarms  Misses  No Events       CSI  Hit Rate  \
0  0.885714  1004           404    1996      17596  0.294947  0.334667   
0  1.000000     0             0       0       2133  0.000000  0.000000   

   False Alarm Rate test_date data_tested greter_than_30_knots  \
0          0.286932    9/4/18       train                 3000   
0          0.000000    9/4/18        test                    0   

   less_than_30_knots  
0               18000  
0                2133  
--------------------------------------------------------------------------------
9/6/18
        MAE        MSE  Residual Mean test_date data_tested
0  4.640794  35.752321         0.0000    9/6/18       train
0  3.244553  19.351461         0.1042    9/6/18        test
   Accuracy  Hits  False Alarms  Misses  No Events 

In [8]:
# Optional export of the per-experiment regression metrics; uncomment to write the CSV.
#table.to_csv('Lin_sample_regression_results.csv')
table.head()

Unnamed: 0,MAE,MSE,Residual Mean,test_date,data_tested
0,4.664122,36.310676,-0.0,1/7/17,train
0,3.430424,18.494801,-2.6956,1/7/17,test
0,4.660134,36.200782,0.0,2/9/17,train
0,4.249987,26.654195,-2.7796,2/9/17,test
0,4.656341,36.087157,-0.0,2/13/17,train


In [9]:
# Optional export of the per-experiment classification metrics; uncomment to write the CSV.
#table_class.to_csv('lin_sample_classfication_results.csv')
table_class.head()

Unnamed: 0,Accuracy,Hits,False Alarms,Misses,No Events,CSI,Hit Rate,False Alarm Rate,test_date,data_tested,greter_than_30_knots,less_than_30_knots
0,0.88619,1010,400,1990,17600,0.297059,0.336667,0.283688,1/7/17,train,3000,18000
0,1.0,0,0,0,1959,0.0,0.0,0.0,1/7/17,test,0,1959
0,0.884714,989,410,2011,17590,0.290029,0.329667,0.293066,2/9/17,train,3000,18000
0,0.969918,7,16,46,1992,0.101449,0.132075,0.695652,2/9/17,test,53,2008
0,0.883905,921,359,2079,17641,0.274189,0.307,0.280469,2/13/17,train,3000,18000


In [10]:
# Optional export of the row-level test predictions; uncomment to write the CSV.
#data.to_csv('lin_smaple_data.csv')
data.head()

Unnamed: 0,Date,Type,R1,R2,Pressure_reduced_to_MSL_.Pa..0.MSL,Derived_radar_reflectivity_.dB..1.HYBL,u.component_of_wind_.m.s..85000.ISBL,v.component_of_wind_.m.s..85000.ISBL,wind_speed_85000,wind_shear_85000,...,diff_temp,geo_cbl,veg_sfc,best_4_layer,ws_700,actual,Class_results,pred,From,test_date
0,2017-01-07 06:00:00,ANDE,2.630564,2.322274,102879.7,0.0,5.447,-1.457,5.638498,5.269992,...,0.739,-5000.0,40.293,23.757,16.787335,2.332608,Less than 30 knots,6.200024,Mesonet,1/7/17
1,2017-01-07 06:15:00,ANDE,2.055383,1.813005,102951.0,0.0,5.629,-1.464,5.816265,5.773324,...,1.05,-5000.0,40.293,23.959,16.195689,1.555072,Less than 30 knots,6.948922,Mesonet,1/7/17
2,2017-01-07 06:30:00,ANDE,1.792587,1.671478,102890.6,0.0,5.653,-1.56,5.8643,6.23214,...,1.211,-5000.0,40.293,24.118,15.636587,2.332608,Less than 30 knots,7.381036,Mesonet,1/7/17
3,2017-01-07 06:45:00,ANDE,1.442889,1.352497,102931.0,0.0,5.692,-2.445,6.194908,6.806583,...,0.943,-5000.0,40.293,24.22,16.767452,1.555072,Less than 30 knots,8.615823,Mesonet,1/7/17
4,2017-01-07 07:00:00,ANDE,2.283417,1.677404,102882.5,0.0,5.226,-3.06,6.055962,5.554556,...,0.751,-5000.0,40.293,24.149,17.249375,2.526992,Less than 30 knots,8.870546,Mesonet,1/7/17
