In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

### TODO
- Normalisation
- Select the models
- Define hyperparameters
- Run & Evaluate the models
- Partial dependence, VIF, DD-plot, interaction

- Explaining the best model

# Netherlands

In [2]:
# Load the weekly dataset and discard identifier / non-feature columns.
# NOTE(review): the OVERALL section of this notebook additionally drops 'espg';
# here it stays in as a regressor — confirm that is intended.
df = pd.read_csv('weekly_new.csv').drop(
    columns=['city', 'name', 'year', 'week', 'latitude', 'longitude']
)

# Keep only the Dutch rows; the country label is redundant afterwards.
country = df.loc[df['country'] == 'Netherlands'].drop(columns='country')

# Split into predictors and response (weekly counts).
data = country.drop(columns='counts_week')

# Standardise the response (zero mean, unit variance). StandardScaler expects
# a 2-D array, hence the reshape to a single column.
scaler = StandardScaler()
target = country['counts_week'].to_numpy().reshape(-1, 1)
y = scaler.fit_transform(target)


In [3]:
# Design matrix: all predictors plus an intercept column named 'const'.
X = sm.add_constant(data)

# Ordinary least squares of the standardised counts on every predictor.
model = sm.OLS(endog=y, exog=X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.418
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                     7.402
Date:                Tue, 31 Jan 2023   Prob (F-statistic):           1.20e-21
Time:                        09:55:37   Log-Likelihood:                -377.82
No. Observations:                 329   AIC:                             815.6
Df Residuals:                     299   BIC:                             929.5
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1.287e+04 

In [7]:
# Backward-elimination step: keep the intercept plus every predictor whose
# coefficient is significant at the 5% level in the previously fitted model.
X_new = X.copy()
p_values = model.pvalues

# The intercept is not a candidate for removal, so leave it out of the screen
# (the other country sections of this notebook intend the same).
predictor_p_values = p_values.drop('const', errors='ignore')

# Select by column label rather than with a positional boolean mask, so the
# selection cannot be misaligned with the columns of X.
insignificant = predictor_p_values[predictor_p_values > 0.05].index
X = X_new.drop(columns=insignificant)

In [8]:
# Make sure the reduced design matrix has an intercept column, then refit.
X = sm.add_constant(X)

# OLS on the predictors retained by the p-value screen.
model = sm.OLS(endog=y, exog=X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.357
Model:                            OLS   Adj. R-squared:                  0.343
Method:                 Least Squares   F-statistic:                     25.43
Date:                Tue, 31 Jan 2023   Prob (F-statistic):           1.38e-27
Time:                        10:00:25   Log-Likelihood:                -394.25
No. Observations:                 329   AIC:                             804.5
Df Residuals:                     321   BIC:                             834.9
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          67.5765     10.329      6.542      

In [9]:
# Check multicollinearity: one variance inflation factor per column of the
# design matrix (the 'const' column is included in the table as well).
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(len(X.columns))
    ],
    "features": X.columns,
})

print(vif)

            VIF     features
0  53242.067126        const
1      1.816429            n
2      1.222810  restaurants
3      2.021618    ndvi_mean
4      1.120552     dem_mean
5      1.498496     lst_mean
6      1.253108      lst_std
7      1.881523      pop_sum


# USA

In [13]:
# Load the weekly dataset and discard identifier / non-feature columns.
# NOTE(review): the OVERALL section of this notebook additionally drops 'espg';
# here it stays in as a regressor — confirm that is intended.
df = pd.read_csv('weekly_new.csv').drop(
    columns=['city', 'name', 'year', 'week', 'latitude', 'longitude']
)

# Keep only the USA rows; the country label is redundant afterwards.
country = df.loc[df['country'] == 'USA'].drop(columns='country')

# Split into predictors and response (weekly counts).
data = country.drop(columns='counts_week')

# Standardise the response (zero mean, unit variance). StandardScaler expects
# a 2-D array, hence the reshape to a single column.
scaler = StandardScaler()
target = country['counts_week'].to_numpy().reshape(-1, 1)
y = scaler.fit_transform(target)


In [14]:
# Design matrix: all predictors plus an intercept column named 'const'.
X = sm.add_constant(data)

# Ordinary least squares of the standardised counts on every predictor.
model = sm.OLS(endog=y, exog=X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.454
Model:                            OLS   Adj. R-squared:                  0.389
Method:                 Least Squares   F-statistic:                     6.895
Date:                Tue, 31 Jan 2023   Prob (F-statistic):           7.98e-19
Time:                        10:02:01   Log-Likelihood:                -301.30
No. Observations:                 270   AIC:                             662.6
Df Residuals:                     240   BIC:                             770.6
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1094.1769 

In [15]:
# Backward-elimination step: keep the intercept plus every predictor whose
# coefficient is significant at the 5% level in the previously fitted model.
X_new = X.copy()
p_values = model.pvalues

# Exclude the intercept from the screen. `Series.drop` returns a new Series,
# so its result has to be captured — a bare `p_values.drop('const')` does
# nothing.
predictor_p_values = p_values.drop('const', errors='ignore')

# Select by column label rather than with a positional boolean mask, so the
# selection cannot be misaligned with the columns of X.
insignificant = predictor_p_values[predictor_p_values > 0.05].index
X = X_new.drop(columns=insignificant)

In [16]:
# Make sure the reduced design matrix has an intercept column, then refit.
X = sm.add_constant(X)

# OLS on the predictors retained by the p-value screen.
model = sm.OLS(endog=y, exog=X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.390
Model:                            OLS   Adj. R-squared:                  0.374
Method:                 Least Squares   F-statistic:                     23.95
Date:                Tue, 31 Jan 2023   Prob (F-statistic):           4.33e-25
Time:                        10:02:31   Log-Likelihood:                -316.34
No. Observations:                 270   AIC:                             648.7
Df Residuals:                     262   BIC:                             677.5
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  22.4162    

In [17]:
# Check multicollinearity: one variance inflation factor per column of the
# design matrix (the 'const' column is included in the table as well).
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(len(X.columns))
    ],
    "features": X.columns,
})

print(vif)

           VIF             features
0  8552.781778                const
1     4.822125    edge_length_total
2     5.328666  street_length_total
3     1.128260    street_length_avg
4     1.554083            shop_list
5     1.058610             lst_mean
6     1.092535           lc_entropy
7     1.617918              pop_sum


# UK

In [18]:
# Load the weekly dataset and discard identifier / non-feature columns.
# NOTE(review): the OVERALL section of this notebook additionally drops 'espg';
# here it stays in as a regressor — confirm that is intended.
df = pd.read_csv('weekly_new.csv').drop(
    columns=['city', 'name', 'year', 'week', 'latitude', 'longitude']
)

# Keep rows labelled either 'UK' or 'UK (Scotland)'; the country label is
# redundant afterwards.
is_uk = df['country'].isin(['UK', 'UK (Scotland)'])
country = df.loc[is_uk].drop(columns='country')

# Split into predictors and response (weekly counts).
data = country.drop(columns='counts_week')

# Standardise the response (zero mean, unit variance). StandardScaler expects
# a 2-D array, hence the reshape to a single column.
scaler = StandardScaler()
target = country['counts_week'].to_numpy().reshape(-1, 1)
y = scaler.fit_transform(target)

In [19]:
# Design matrix: all predictors plus an intercept column named 'const'.
X = sm.add_constant(data)

# Ordinary least squares of the standardised counts on every predictor.
model = sm.OLS(endog=y, exog=X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.269
Model:                            OLS   Adj. R-squared:                  0.195
Method:                 Least Squares   F-statistic:                     3.633
Date:                Tue, 31 Jan 2023   Prob (F-statistic):           9.30e-09
Time:                        10:03:20   Log-Likelihood:                -400.24
No. Observations:                 317   AIC:                             860.5
Df Residuals:                     287   BIC:                             973.2
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1.419e+04 

In [20]:
# Backward-elimination step: keep the intercept plus every predictor whose
# coefficient is significant at the 5% level in the previously fitted model.
X_new = X.copy()
p_values = model.pvalues

# Exclude the intercept from the screen. `Series.drop` returns a new Series,
# so its result has to be captured — a bare `p_values.drop('const')` does
# nothing.
predictor_p_values = p_values.drop('const', errors='ignore')

# Select by column label rather than with a positional boolean mask, so the
# selection cannot be misaligned with the columns of X.
insignificant = predictor_p_values[predictor_p_values > 0.05].index
X = X_new.drop(columns=insignificant)

In [21]:
# Make sure the reduced design matrix has an intercept column, then refit.
X = sm.add_constant(X)

# OLS on the predictors retained by the p-value screen.
model = sm.OLS(endog=y, exog=X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.201
Model:                            OLS   Adj. R-squared:                  0.191
Method:                 Least Squares   F-statistic:                     19.60
Date:                Tue, 31 Jan 2023   Prob (F-statistic):           2.10e-14
Time:                        10:03:40   Log-Likelihood:                -414.27
No. Observations:                 317   AIC:                             838.5
Df Residuals:                     312   BIC:                             857.3
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -9.3956      3.951     -2.378   

In [22]:
# Check multicollinearity: one variance inflation factor per column of the
# design matrix (the 'const' column is included in the table as well).
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(len(X.columns))
    ],
    "features": X.columns,
})

print(vif)

           VIF      features
0  6094.867350         const
1     1.221802   bike_points
2     1.049142  cycle_length
3     1.286355      dem_mean
4     1.361484      lst_mean


# OVERALL

In [36]:
# Load the weekly dataset and discard identifier / non-feature columns
# (this pooled section also removes 'espg').
df = pd.read_csv('weekly_new.csv').drop(
    columns=['city', 'name', 'year', 'week', 'latitude', 'longitude', 'espg']
)

# Pooled model: use every row, only the country label itself is removed.
country = df.drop(columns='country')

# Split into predictors and response (weekly counts).
data = country.drop(columns='counts_week')

# Standardise the response (zero mean, unit variance). StandardScaler expects
# a 2-D array, hence the reshape to a single column.
scaler = StandardScaler()
target = country['counts_week'].to_numpy().reshape(-1, 1)
y = scaler.fit_transform(target)

In [37]:
# Design matrix: all predictors plus an intercept column named 'const'.
X = sm.add_constant(data)

# Ordinary least squares of the standardised counts on every predictor.
model = sm.OLS(endog=y, exog=X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.242
Model:                            OLS   Adj. R-squared:                  0.218
Method:                 Least Squares   F-statistic:                     10.13
Date:                Tue, 31 Jan 2023   Prob (F-statistic):           2.07e-37
Time:                        10:09:24   Log-Likelihood:                -1172.7
No. Observations:                 916   AIC:                             2403.
Df Residuals:                     887   BIC:                             2543.
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.1193 

In [38]:
# Backward-elimination step: keep the intercept plus every predictor whose
# coefficient is significant at the 5% level in the previously fitted model.
X_new = X.copy()
p_values = model.pvalues

# Exclude the intercept from the screen. `Series.drop` returns a new Series,
# so its result has to be captured — a bare `p_values.drop('const')` does
# nothing.
predictor_p_values = p_values.drop('const', errors='ignore')

# Select by column label rather than with a positional boolean mask, so the
# selection cannot be misaligned with the columns of X.
insignificant = predictor_p_values[predictor_p_values > 0.05].index
X = X_new.drop(columns=insignificant)

# 'intersection_count' is removed by hand regardless of its p-value.
# NOTE(review): presumably because it overlaps with the 3-way / 4-way
# intersection counts — confirm and record the reason.
X = X.drop(columns='intersection_count')

In [39]:
# Make sure the reduced design matrix has an intercept column, then refit.
X = sm.add_constant(X)

# OLS on the predictors retained by the p-value screen.
model = sm.OLS(endog=y, exog=X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.198
Model:                            OLS   Adj. R-squared:                  0.190
Method:                 Least Squares   F-statistic:                     24.85
Date:                Tue, 31 Jan 2023   Prob (F-statistic):           2.40e-38
Time:                        10:09:25   Log-Likelihood:                -1198.7
No. Observations:                 916   AIC:                             2417.
Df Residuals:                     906   BIC:                             2466.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -0.5498      0

In [40]:
# Check multicollinearity: one variance inflation factor per column of the
# design matrix (the 'const' column is included in the table as well).
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(len(X.columns))
    ],
    "features": X.columns,
})

print(vif)

         VIF            features
0  12.703010               const
1   1.259967     3_way_int_count
2   1.205738     4_way_int_count
3   1.066956  dist_to_greenspace
4   1.506665         bike_points
5   1.450182           bus_stops
6   1.732319         restaurants
7   1.101380        cycle_length
8   1.134400             dem_std
9   1.113424             lst_std
