In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import itertools
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import grangercausalitytests
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(r'../../data/processed/rate_panel_data.csv')
dataset = pd.read_excel("../../data/raw/File_10_-_IoD2019_Local_Authority_District_Summaries__lower-tier__.xlsx", sheet_name="IMD")

In [3]:
df = df[df['financial_year'] != 2014]
df = df.dropna(subset=['value'])

In [4]:
dataset = dataset.rename(columns={'Local Authority District name (2019)': 'local_authority'})
dataset['local_authority'] = dataset['local_authority'].str.lower().str.strip()
dataset = dataset.rename(columns={'IMD - Average score ': 'imd_score'})
dataset['local_authority'] = (
    dataset['local_authority']
    .str.lower()
    .str.replace('-', ' ', regex=False)
    .str.replace(',', ' ', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
dataset['local_authority'] = dataset['local_authority'].replace({
    #'barrow in furness': 'westmorland and furness',
    #'north somerset': 'somerset',
    'kingston upon hull city of': 'kingston upon hull',
    'herefordshire county of': 'herefordshire',
    'bristol city of': 'bristol',
    #'southend on sea': 'southend-on-sea',
    'st. helens': 'st helens', 
})
unified_map = {
    'buckinghamshire': ['aylesbury vale', 'chiltern', 'south bucks', 'wycombe','south buckinghamshire'],
    'dorset': ['weymouth and portland', 'west dorset', 'north dorset', 'east dorset', 'purbeck', 'christchurch'],
    'somerset': ['taunton deane', 'west somerset', 'mendip', 'sedgemoor', 'south somerset', 'somerset council','somerset west and taunton'],
    'cumberland': ['allerdale', 'carlisle', 'copeland', 'cumberland council'],
    'westmorland and furness': ['barrow in furness', 'barrow-in-furness', 'eden', 'south lakeland'],
    'north yorkshire': ['craven', 'hambleton', 'harrogate', 'richmondshire', 'ryedale', 'scarborough', 'selby', 'north yorkshire council'],
    'bournemouth christchurch and poole': ['bournemouth', 'christchurch', 'poole'],
    'west suffolk': ['forest heath', 'st edmundsbury'],
    'east suffolk': ['suffolk coastal', 'waveney'],
    'bath and north east somerset': ['bath and ne somerset'],
    'southend-on-sea': ['southend on sea'],
    'leicester': ['leicester city'],
    'medway': ['medway towns'],
    'derby': ['derby city'],
    'folkestone and hythe': ['shepway'],
    'county durham': ['durham'],
    "king's lynn and west norfolk": ['kings lynn and west norfolk'],
    'north northamptonshire': ['wellingborough', 'east northamptonshire', 'kettering', 'corby'],
    'west northamptonshire': ['northampton', 'south northamptonshire', 'daventry'],
}
# Create reverse mapping: value → key
reverse_map = {
    alt_name: unified for unified, alts in unified_map.items() for alt_name in alts
}

# Replace values in dataset['local_authority'] using the reverse_map
dataset['local_authority'] = dataset['local_authority'].replace(reverse_map)

dataset = (
    dataset.groupby('local_authority', as_index=False)['imd_score']
    .mean()
)
df = df.merge(
    dataset[['local_authority', 'imd_score']],
    on='local_authority',
    how='left'
)

In [5]:
df['removal_rate_100'] = df['removal_rate']*100
df['removal_rate'] = df['removal_rate'].fillna(0)


In [14]:
# Run OLS regression with fixed effects for LA and year
model = smf.ols(
    formula='removal_rate_100 ~ value + imd_score + C(financial_year) + C(size_category) + value:C(size_category)',
    data=df
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:       removal_rate_100   R-squared:                       0.171
Model:                            OLS   Adj. R-squared:                  0.170
Method:                 Least Squares   F-statistic:                     117.3
Date:                Thu, 31 Jul 2025   Prob (F-statistic):          1.08e-310
Time:                        22:09:57   Log-Likelihood:                -17892.
No. Observations:                7953   AIC:                         3.581e+04
Df Residuals:                    7938   BIC:                         3.592e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

In [7]:
df['log_value'] = np.log1p(df['value'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
# Run OLS regression with fixed effects for LA and year
model = smf.ols(
    formula='removal_rate_100 ~ log_value + imd_score + C(financial_year) + C(size_category) + log_value:C(size_category)',
    data=df
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:       removal_rate_100   R-squared:                       0.173
Model:                            OLS   Adj. R-squared:                  0.172
Method:                 Least Squares   F-statistic:                     118.6
Date:                Thu, 31 Jul 2025   Prob (F-statistic):          4.79e-314
Time:                        22:08:13   Log-Likelihood:                -17861.
No. Observations:                7947   AIC:                         3.575e+04
Df Residuals:                    7932   BIC:                         3.586e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [None]:
# Run OLS regression with fixed effects for LA and year
model = smf.ols(
    formula='removal_rate_100 ~ log_value + imd_score + C(financial_year) + C(size_category) + log_value:C(size_category)',
    data=df
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:       removal_rate_100   R-squared:                       0.172
Model:                            OLS   Adj. R-squared:                  0.170
Method:                 Least Squares   F-statistic:                     126.5
Date:                Thu, 31 Jul 2025   Prob (F-statistic):          4.87e-312
Time:                        22:14:24   Log-Likelihood:                -17868.
No. Observations:                7947   AIC:                         3.576e+04
Df Residuals:                    7933   BIC:                         3.586e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [9]:
for lag in [1, 2, 3]:
    df[f'log_value_lag{lag}'] = df.groupby(['local_authority', 'size_category'])['log_value'].shift(lag)

In [13]:
model = smf.ols(
    formula=(
        'removal_rate ~ '
        'log_value + log_value_lag1 + log_value_lag2 + log_value_lag3 + '
        'C(financial_year) + C(size_category) + imd_score +'
        'log_value:C(size_category) + '
        'log_value_lag1:C(size_category) + '
        'log_value_lag2:C(size_category) + '
        'log_value_lag3:C(size_category)'
    ),
    data=df
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.258
Model:                            OLS   Adj. R-squared:                  0.255
Method:                 Least Squares   F-statistic:                     91.89
Date:                Thu, 31 Jul 2025   Prob (F-statistic):          2.96e-323
Time:                        22:09:08   Log-Likelihood:                 13302.
No. Observations:                5304   AIC:                        -2.656e+04
Df Residuals:                    5283   BIC:                        -2.642e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------