In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

### TODO
- Normalisation
- Select the models
- Define hyperparameters
- Run & Evaluate the models
- Partial dependence, VIF, DD-plot, interaction

- Explaining the best model

# Netherlands

In [5]:
#Import csv and remove non-numerical variables
df = pd.read_csv('weekly_counts_stats.csv')
df = df.drop(['city', 'name', 'year', 'week', 'latitude', 'longitude'] , axis=1)

#Select country and drop column
country = df[df['country'] == 'Netherlands']
country = country.drop('country', axis=1)

#Count variable as stratify
target = country.loc[:,'counts_week']
data = country.drop('counts_week', axis=1)

#Normalize counter data
scaler = StandardScaler()
target = target.to_numpy().reshape(-1, 1)
y = scaler.fit_transform(target)


In [6]:
X = sm.add_constant(data)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.438
Model:                            OLS   Adj. R-squared:                  0.379
Method:                 Least Squares   F-statistic:                     7.390
Date:                Fri, 27 Jan 2023   Prob (F-statistic):           1.99e-22
Time:                        13:09:00   Log-Likelihood:                -368.66
No. Observations:                 326   AIC:                             801.3
Df Residuals:                     294   BIC:                             922.5
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -3.42

In [11]:
#Remove insignificant variables
X_new = data.copy()
p_values = model.pvalues
p_values.drop([0])
print(X_new.columns)
print(p_values)
# X = X_new.drop(X_new.columns[p_values > 0.05], axis=1)

KeyError: '[0] not found in axis'

In [None]:
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.357
Model:                            OLS   Adj. R-squared:                  0.343
Method:                 Least Squares   F-statistic:                     25.27
Date:                Fri, 27 Jan 2023   Prob (F-statistic):           2.25e-27
Time:                        13:05:04   Log-Likelihood:                -390.49
No. Observations:                 326   AIC:                             797.0
Df Residuals:                     318   BIC:                             827.3
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -0.80

In [None]:
#Check multicolinearity with VIF
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

print(vif)

          VIF               features
0   14.165739                  const
1  173.430510                  nodes
2  350.192179     intersection_count
3  424.134908  street_segments_count
4    1.160794              bus_stops
5    1.071523               dem_mean
6    1.021306                lst_std
7    1.697417               pop_mean


# USA

In [None]:
#Import csv and remove non-numerical variables
df = pd.read_csv('weekly_counts_stats.csv')
df = df.drop(['city', 'name', 'year', 'week', 'latitude', 'longitude'] , axis=1)

#Select country and drop column
country = df[df['country'] == 'USA']
country = country.drop('country', axis=1)

#Count variable as stratify
target = country.loc[:,'counts_week']
data = country.drop('counts_week', axis=1)

#Normalize counter data
scaler = StandardScaler()
target = target.to_numpy().reshape(-1, 1)
y = scaler.fit_transform(target)


In [None]:
X = sm.add_constant(data)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.570
Model:                            OLS   Adj. R-squared:                  0.514
Method:                 Least Squares   F-statistic:                     10.19
Date:                Fri, 27 Jan 2023   Prob (F-statistic):           6.39e-29
Time:                        13:05:04   Log-Likelihood:                -269.05
No. Observations:                 270   AIC:                             602.1
Df Residuals:                     238   BIC:                             717.3
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    30.48

In [None]:
#Remove insignificant variables
X_new = country.copy()
p_values = model.pvalues
p_values.drop('const')
X = X_new.drop(X_new.columns[p_values > 0.05], axis=1)

In [None]:
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 2.685e+30
Date:                Fri, 27 Jan 2023   Prob (F-statistic):               0.00
Time:                        13:05:04   Log-Likelihood:                 8586.7
No. Observations:                 270   AIC:                        -1.716e+04
Df Residuals:                     262   BIC:                        -1.713e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -0.45

In [None]:
#Check multicolinearity with VIF
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

print(vif)

           VIF               features
0  8380.202108                  const
1     2.092042            counts_week
2   952.580164     intersection_count
3    21.639581      edge_length_total
4   897.561653  street_segments_count
5     1.388811              ndvi_mean
6     1.247491               dem_mean
7     1.952351               lst_mean


# UK

In [None]:
#Import csv and remove non-numerical variables
df = pd.read_csv('weekly_counts_stats.csv')
df = df.drop(['city', 'name', 'year', 'week', 'latitude', 'longitude'] , axis=1)

#Select country and drop column
country = df[(df['country'] == 'UK') | (df['country'] == 'UK (Scotland)')]
country = country.drop('country', axis=1)

#Count variable as stratify
target = country.loc[:,'counts_week']
data = country.drop('counts_week', axis=1)

#Normalize counter data
scaler = StandardScaler()
target = target.to_numpy().reshape(-1, 1)
y = scaler.fit_transform(target)

In [None]:
X = sm.add_constant(data)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                  0.184
Method:                 Least Squares   F-statistic:                     3.294
Date:                Fri, 27 Jan 2023   Prob (F-statistic):           6.79e-08
Time:                        13:05:05   Log-Likelihood:                -401.27
No. Observations:                 317   AIC:                             866.5
Df Residuals:                     285   BIC:                             986.8
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -0.26

In [None]:
#Remove insignificant variables
X_new = country.copy()
p_values = model.pvalues
p_values.drop('const')
X = X_new.drop(X_new.columns[p_values > 0.05], axis=1)

In [None]:
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.090
Model:                            OLS   Adj. R-squared:                  0.087
Method:                 Least Squares   F-statistic:                     31.01
Date:                Fri, 27 Jan 2023   Prob (F-statistic):           5.50e-08
Time:                        13:05:05   Log-Likelihood:                -434.92
No. Observations:                 317   AIC:                             873.8
Df Residuals:                     315   BIC:                             881.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2979      0.076     -3.928      0.0