# Model

In [6]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import geopandas as gpd
import h3
from shapely import wkt
from sklearn.linear_model import LinearRegression
import seaborn as sns
import statsmodels.formula.api as smf

## Creating our panel 

In [118]:
# --- Waze alerts ---------------------------------------------------------
waze = pd.read_csv('fullclean.csv')
# DataFrame.drop returns a new frame; the result has to be assigned back,
# otherwise the unused columns silently stay in `waze`.
waze = waze.drop(columns=['confidence', 'nThumbsUp', 'country'])
waze['date'] = pd.to_datetime(waze['date'])

# --- School locations ----------------------------------------------------
schools = pd.read_csv('schools.csv')

# --- Daily precipitation -------------------------------------------------
weather = pd.read_csv('RainLevels.csv')[['Date', 'Precipitation']]
weather.columns = ['date', 'precip']  # match the join key used by the other tables
weather['date'] = pd.to_datetime(weather['date'])

# --- COVID stringency index ----------------------------------------------
covid = pd.read_csv('stringency.csv')
covid = covid[['date', 'stringency_index']]
# stringency index is at daily level, only look when we have waze data.
# .copy() makes the filtered frame independent, so the column assignment
# below does not raise pandas' chained-assignment warning.
covid = covid[covid.date <= '2022-01-01'].copy()
covid['date'] = pd.to_datetime(covid['date'])

### Merging the Data
Caveat: we need to think about how the different hexagon resolutions are impacting this merge. Are there any potential issues being introduced?

In [119]:
# Attach the date-level covariates (stringency index, precipitation) to each
# Waze row, then attach schools by matching on all five hexagon-id columns.
# Every join is a left join, so no Waze row is dropped at this stage.
# NOTE(review): if `schools` holds several rows for one hexagon key, this join
# repeats the matching Waze rows and would inflate the `uuid` counts below —
# TODO confirm against schools.csv.
df = (
    waze
    .merge(covid, how='left', on='date')
    .merge(weather, how='left', on='date')
    .merge(schools, how='left', on=['h6', 'h7', 'h8', 'h9', 'h10'])
)

print('The final dataframes will look like...')
# Preview of the resolution-6 panel: non-null counts of `uuid` and
# `Denumire_P` per group of the keys below.
# NOTE(review): groupby drops rows whose key is NaN by default, so rows with a
# missing stringency_index or precip would not appear here — TODO confirm.
preview_keys = ['date', 'h6', 'stringency_index', 'precip']
x = df.groupby(preview_keys, as_index=False)[['uuid', 'Denumire_P']].count().head()
x

The final dataframes will look like...


Unnamed: 0,date,h6,stringency_index,precip,uuid,Denumire_P
0,2020-02-26,861e0b217ffffff,16.67,5,6,0
1,2020-02-26,861e0b237ffffff,16.67,5,21,0
2,2020-02-26,861e0b2a7ffffff,16.67,5,5,0
3,2020-02-26,861e0b387ffffff,16.67,5,282,24
4,2020-02-26,861e0b38fffffff,16.67,5,187,14


In [120]:
resolutions = [6, 7, 8, 9, 10]
# Maps list position (0..4) -> hexagon resolution; the positions index `dfs`.
res = dict(zip(np.arange(len(resolutions)), resolutions))

# One aggregated frame per resolution: non-null counts of `uuid` and
# `Denumire_P` for each (date, hexagon, stringency_index, precip) group.
dfs = [
    df.groupby(
        ['date', f'h{level}', 'stringency_index', 'precip'], as_index=False
    )[['uuid', 'Denumire_P']].count()
    for level in res.values()
]

# Save one CSV per resolution to disk.
# NOTE(review): this is an absolute, machine-specific directory; the cell only
# runs where that path exists.
outputpath = '/Users/catherinehayden/WB/cluj'
for position, level in res.items():
    dfs[position].to_csv(f'{outputpath}/modellingR{level}.csv', index=False)


next steps: 
 - calculate distance from each hexagon to the points of interest?
 - control for weather
 - control for weekend/weekday

## How do alerts change with the number of schools near and stringency index at the hexagon-day level? 
For varying hexagon resolutions: 6-10
(As hexagon resolution increases, the area it covers decreases)  
Here we are regressing:  
Alerts(h,t) = Schools(h) + Stringency(t) + Precipitation(t); where h: hexagon, t: time  
The number of schools has a positive coefficient and stringency has a negative coefficient at every H3 resolution, as we expected.  
The magnitudes of both coefficients decrease steadily as resolution increases.

$Alerts_{h,t} = \alpha + \beta Schools_h + \delta Stringency_t + \gamma Precip_t + \varepsilon_{h,t}$

In [121]:
# Fit the same OLS specification on each per-resolution frame, keep the
# coefficient vector of every fit, and print the full regression summary.
betas = []
for level, panel in zip(resolutions, dfs):
    model = smf.ols(
        formula='uuid ~ Denumire_P + stringency_index + precip',
        data=panel,
    ).fit()
    betas.append(model.params)
    print('Hex resolution: ' + str(level))
    print(model.summary())

Hex resolution: 6
                            OLS Regression Results                            
Dep. Variable:                   uuid   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.860
Method:                 Least Squares   F-statistic:                 1.222e+04
Date:                Sat, 21 May 2022   Prob (F-statistic):               0.00
Time:                        14:38:38   Log-Likelihood:                -30672.
No. Observations:                5984   AIC:                         6.135e+04
Df Residuals:                    5980   BIC:                         6.138e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           44

Hex resolution: 10
                            OLS Regression Results                            
Dep. Variable:                   uuid   R-squared:                       0.072
Model:                            OLS   Adj. R-squared:                  0.071
Method:                 Least Squares   F-statistic:                     5062.
Date:                Sat, 21 May 2022   Prob (F-statistic):               0.00
Time:                        14:38:38   Log-Likelihood:            -3.4207e+05
No. Observations:              197162   AIC:                         6.842e+05
Df Residuals:                  197158   BIC:                         6.842e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            

In [122]:
# Show the collected coefficient Series (model.params), one per fitted model,
# in the same order as the frames in `dfs`.
betas

[Intercept           44.451394
 Denumire_P           8.419535
 stringency_index    -0.322715
 precip              -0.000336
 dtype: float64,
 Intercept           15.658071
 Denumire_P           6.459335
 stringency_index    -0.093867
 precip              -0.000043
 dtype: float64,
 Intercept           6.686531
 Denumire_P          4.244750
 stringency_index   -0.029974
 precip              0.000028
 dtype: float64,
 Intercept           3.411566
 Denumire_P          1.467573
 stringency_index   -0.010394
 precip              0.000028
 dtype: float64,
 Intercept           1.891958
 Denumire_P          0.585174
 stringency_index   -0.003065
 precip              0.000012
 dtype: float64]