# Prediction 

In [3]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import geopandas as gpd
import h3
from shapely import wkt
from sklearn.linear_model import Ridge, LinearRegression, RidgeCV, LassoCV, Lasso, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
import pydeck as pdk
import seaborn as sns
from sklearn.metrics import mean_squared_error
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the two model tables from the working directory.
# (Presumably H3 resolution 9 and 10 feature tables — TODO confirm.)
df9, df10 = (pd.read_csv(csv_path) for csv_path in ('model9.csv', 'model10.csv'))

In [None]:
%%timeit
parent_str = 'h6'
child_str = 'h10'

test = models(df10, df10.h6, parent_str, child_str , numrows_cond = 1500)
test.fit_forest()

In [89]:
class models:
    """Fit and compare per-hexagon regression models of the ``alerts`` column.

    Rows of ``df`` are grouped by the values of ``parent`` (one group per
    parent hexagon). Every group with at least ``numrows_cond`` rows gets its
    own 70/30 train/test split (``random_state=0``) and its own model(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``alerts`` column, used as the target. Features are
        taken positionally: every column from index 5 onward.
    parent : pandas.Series
        Parent-hexagon id of each row; ``parent == h`` is used as a boolean
        mask on ``df``.
    parent_str, child_str : str
        Names of the parent / child hexagon columns. Stored on the instance;
        not otherwise used by the methods below.
    numrows_cond : int, default 0
        Minimum number of rows a hexagon needs in order to be modelled.
    """

    # Positional index of the first feature column.
    # NOTE(review): the stand-alone cells in this notebook slice features from
    # column 6, not 5 — confirm which offset excludes every id/target column.
    _FEATURE_START = 5

    def __init__(self, df, parent, parent_str, child_str ,numrows_cond=0):
        self.parent = parent
        self.data = df
        self.parent_str = parent_str
        self.child_str = child_str
        self.numrows_cond = numrows_cond
        self.hexagons = parent.unique()  # unique parent hexagon ids, in order of first appearance

    def _split(self, h):
        """Return ``(X, X_train, X_test, y_train, y_test)`` for parent hexagon ``h``."""
        d = self.data.loc[self.parent == h, :]
        X = d.iloc[:, self._FEATURE_START:]
        y = d[['alerts']]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=0)
        return X, X_train, X_test, y_train, y_test

    def hexagons_used(self):
        """Return the parent hexagons that have at least ``numrows_cond`` rows."""
        df = self.data
        # Was the bare name `numrows_cond`, which raised NameError.
        return [
            h for h in self.hexagons
            if df.loc[self.parent == h, :].shape[0] >= self.numrows_cond
        ]

    def fit_forest(self):
        """Fit one random forest per used hexagon.

        Returns
        -------
        dict
            ``{hexagon: {'R2': test-set R^2, 'forest': fitted estimator}}``.
        """
        model_info = {}
        for h in self.hexagons_used():
            _, X_train, X_test, y_train, y_test = self._split(h)
            forest = RandomForestRegressor(min_samples_split=8).fit(X_train, y_train)
            model_info[h] = {
                'R2': forest.score(X_test, y_test),
                'forest': forest,
            }
        return model_info

    def display_forest(self):
        """Fit the forests and print the test-set R^2 of each hexagon."""
        model_info = self.fit_forest()
        for h in model_info:
            print('')
            print('For Hexagon: {}'.format(h))
            print('-------------------------------------------')
            print("R^2: {}".format(model_info[h]['R2']))
            print('-------------------------------------------')
            print('')
            print('')

    def fit_lasso(self):
        """Fit one AIC-selected LassoLars model per used hexagon.

        Returns
        -------
        dict
            ``{hexagon: info}`` where ``info`` holds the split (``X_train``,
            ``X_test``, ``y_train``, ``y_test``), the test predictions
            (``y_pred``), the fitted estimator (``lasso``) and the names of
            the columns with a non-zero coefficient (``nonzero``).
        """
        model_info = {}
        for h in self.hexagons_used():
            X, X_train, X_test, y_train, y_test = self._split(h)
            # NOTE(review): `normalize` is deprecated/removed in recent
            # scikit-learn releases — confirm against the installed version.
            lasso = LassoLarsIC(criterion='aic', normalize=True).fit(X_train, y_train)
            model_info[h] = {
                'X_test': X_test,
                'X_train': X_train,
                'y_train': y_train,
                'y_pred': lasso.predict(X_test),
                'y_test': y_test,
                'lasso': lasso,
                'nonzero': X.columns[lasso.coef_ != 0],
            }
        return model_info

    def display_lasso(self):
        """Fit the lasso models and print R^2 and the selected columns per hexagon."""
        model_info = self.fit_lasso()
        for h in model_info:
            info = model_info[h]
            nonzero = list(info['nonzero'])
            print('')
            print('For Hexagon: {}'.format(h))
            print('-------------------------------------------')
            print("R^2: {}".format(info['lasso'].score(info['X_test'], info['y_test'])))
            print('')
            print(len(nonzero), 'non-zero column(s):')
            print(nonzero)
            print('-------------------------------------------')
            print('')

    def selection(self):
        """Pick, per used hexagon, whichever of lasso / random forest scores higher.

        Both models are fitted on the same training split and compared on
        test-set R^2.

        Returns
        -------
        dict
            ``{hexagon: {'R2': best R^2, 'model': 'Random Forest' | 'Lasso'}}``;
            when the lasso wins, ``'nonzero'`` additionally holds the names of
            the columns with a non-zero coefficient.
        """
        model_info = {}
        for h in self.hexagons_used():
            X, X_train, X_test, y_train, y_test = self._split(h)

            # NOTE(review): `normalize` is deprecated/removed in recent
            # scikit-learn releases — confirm against the installed version.
            lasso = LassoLarsIC(criterion='aic', normalize=True).fit(X_train, y_train)
            # score() takes (X, y); the original passed (y_test, y_pred) and
            # read an undefined name.
            R2L = lasso.score(X_test, y_test)

            # `criterion='mse'` dropped: squared error is the default, and
            # newer scikit-learn releases no longer accept the 'mse' spelling.
            forest = RandomForestRegressor(random_state=0, max_features=1/3).fit(X_train, y_train)
            R2F = forest.score(X_test, y_test)

            # Local list is not called `models`, to avoid shadowing the class.
            scores, names = [R2F, R2L], ['Random Forest', 'Lasso']
            best = int(np.argmax(scores))
            hdict = {'R2': scores[best], 'model': names[best]}
            if names[best] == 'Lasso':
                hdict['nonzero'] = X.columns[lasso.coef_ != 0]
            model_info[h] = hdict
        return model_info
                
                
        

## Using h6 as our parent, h10 as child

In [88]:
%%timeit
h6 = df10['h6'].unique()
for i in h6:
    d=df10.loc[df10.h6==i, :]
    if d.shape[0]>=2000:
        X = d.iloc[:,6:]
        y = d[['alerts']]
        print('')
        print('For Hexagon: ' + i)
        print('-------------------------------------------')
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)
        forest = RandomForestRegressor(min_samples_split=8).fit(X_train, y_train)
        R2F = forest.score(X_test, y_test)
        print('')
        print('OOS R^2: '+ str(R2F))
        print('')
        print('')


For Hexagon: 861e0b237ffffff
-------------------------------------------


TypeError: __init__() got an unexpected keyword argument 'min_sample_split'

In [16]:
# One AIC-selected LassoLars model per h6 hexagon with at least 2000 rows.
# For each, print test-set R^2, test-set MSE and the columns that kept a
# non-zero coefficient.
h6 = df10['h6'].unique()
for i in h6:
    d = df10.loc[df10.h6 == i, :]
    if d.shape[0] < 2000:
        continue
    print('')
    X = d.iloc[:, 6:]
    y = d[['alerts']]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )
    lasso = LassoLarsIC(criterion='aic', normalize=True).fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    print('For Hexagon: ' + i)
    print('-------------------------------------------')
    print(f"R^2: {lasso.score(X_test, y_test)}")
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print('')
    selected_cols = list(X.columns[lasso.coef_ != 0])
    print(len(selected_cols), 'non-zero column(s):')
    print(selected_cols)
    print('-------------------------------------------')
    print('')
    print('')


For Hexagon: 861e0b237ffffff
-------------------------------------------
R^2: 0.020856625897698766
Mean Squared Error: 1.1068890017552597

14 non-zero column(s):
['precip', 'dayofweek', 'IsHoliday', 'stringency stringency^3', 'stringency^2 month', 'dayofweek month', 'dayofweek IsHoliday', 'stringency stringency^3 dayofweek', 'stringency precip^3 dayofweek', 'precip dayofweek HomeGame', 'stringency^2 dayofweek HomeGame', 'stringency^3 precip^2 HomeGame', 'precip^2 precip^3 IsHoliday', 'dayofweek month IsHoliday']
-------------------------------------------



For Hexagon: 861e0b2a7ffffff
-------------------------------------------
R^2: 0.01644638866275927
Mean Squared Error: 1.141155126925949

6 non-zero column(s):
['stringency^2', 'dayofweek', 'month', 'precip dayofweek', 'stringency stringency^2 stringency^3', 'stringency HomeGame IsHoliday']
-------------------------------------------



For Hexagon: 861e0b387ffffff
-------------------------------------------
R^2: 0.1673279678538513

## Testing

In [9]:
# Fit one AIC-selected LassoLars model per h6 hexagon with at least 1000 rows
# (80/20 split). Keep the estimator, its held-out predictions and the selected
# columns in `model_info`; collect each held-out feature frame in `test_sets`.
model_info = {}
test_sets = []

h6 = df10.h6.unique()
for h in h6:
    hdict = {}
    d = df10.loc[df10.h6 == h, :]
    if d.shape[0] < 1000:
        continue
    X = d.iloc[:, 6:]
    y = d[['alerts']]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    test_sets.append(X_test)
    lasso = LassoLarsIC(criterion='aic', normalize=True).fit(X_train, y_train)
    hdict.update({
        'lasso': lasso,
        'OOS predictions': lasso.predict(X_test),
        'nonzero': X.columns[lasso.coef_ != 0],
    })
    model_info[h] = hdict


In [10]:

# `hex_list` was never defined at notebook scope, so this cell failed on a
# fresh kernel. It is taken here from the keys of `model_info`, i.e. the h6
# hexagons fitted in the Testing cell above.
# NOTE(review): assumes those are the intended "hexagons used" — confirm.
hex_list = list(model_info)

d = df10[df10['h6'].isin(hex_list)] # dataframe with only hexagons used in the model
# A bare `d.to_csv()` used to follow here. With no path it only built a CSV
# string and discarded it, so it was dropped.

d = d[['h6', 'h10', 'alerts']]
d = d.reset_index()  # old row labels are kept as an `index` column
# Min-max scale `alerts` to [0, 1], then stretch to [0, 4] for the colour expression used by the map.
d["count_scaled"] = 4 * MinMaxScaler().fit_transform(d["alerts"].values.reshape(-1, 1))
d

Unnamed: 0,index,h6,h10,alerts,count_scaled
0,0,861e0b217ffffff,8a1e0b210d37fff,1,0.000000
1,1,861e0b217ffffff,8a1e0b21282ffff,1,0.000000
2,2,861e0b217ffffff,8a1e0b214537fff,2,0.111111
3,3,861e0b217ffffff,8a1e0b216b4ffff,1,0.000000
4,4,861e0b217ffffff,8a1e0b38926ffff,1,0.000000
...,...,...,...,...,...
198438,199277,861e0b3afffffff,8a1e0b3ac1a7fff,1,0.000000
198439,199278,861e0b3afffffff,8a1e0b3ac54ffff,2,0.111111
198440,199279,861e0b3afffffff,8a1e0b3ae66ffff,1,0.000000
198441,199281,861e0b3afffffff,8a1e0b3ac187fff,1,0.000000


In [12]:
# H3 hexagon layer drawn from `d`: column `h6` supplies the hexagon id and
# `count_scaled` drives the fill colour expression.
layer = pdk.Layer(
    "H3HexagonLayer",
    d,
    get_hexagon="h6",
    get_fill_color="[225, (1 - count_scaled) * 255, (1 - count_scaled)/5 * 255]",
    get_line_color="[255, 255, 255]",
    line_width_min_pixels=2.5,
    pickable=True,
    stroked=True,
    filled=True,
    extruded=False,
)

# Set `ViewState` to center Cluj-Napoca
view_state = pdk.ViewState(
    latitude=46.770920,
    longitude=23.589920,
    zoom=11,
    bearing=0,
    pitch=45,
)

# Render with Deck
r = pdk.Deck(
    layers=[layer],
    initial_view_state=view_state,
    tooltip={"text":  "Count: {alerts}"},
)

In [None]:
# Last expression of the cell: displays the Deck object built in the previous cell.
r