In [1]:
import pandas as pd
import statsmodels.formula.api as smf

In [2]:
# Raw inputs: the "IMD" sheet of the IoD 2019 local-authority summary workbook,
# and the previously processed panel that the models below are fitted on.
dataset = pd.read_excel(
    "../../data/raw/File_10_-_IoD2019_Local_Authority_District_Summaries__lower-tier__.xlsx",
    sheet_name="IMD",
)
filtered_panel = pd.read_csv('../../data/processed/final_panel_data.csv')

In [3]:
# Build the IMD lookup table: one row per local authority with its IMD score.

# Give the two columns used below short names. The score header in the workbook
# carries a trailing space, so it is matched verbatim.
dataset = dataset.rename(columns={
    'Local Authority District name (2019)': 'local_authority',
    'IMD - Average score ': 'imd_score',
})

# `rename` silently ignores labels it cannot find, so fail with a clear message
# if the workbook layout is not what this cell expects.
missing_cols = {'local_authority', 'imd_score'} - set(dataset.columns)
if missing_cols:
    raise KeyError(
        f"Expected column(s) not found in the IMD sheet after renaming: {sorted(missing_cols)}"
    )

# Normalise authority names: lower-case, hyphens and commas become spaces,
# runs of whitespace collapse to one space, and the ends are trimmed.
dataset['local_authority'] = (
    dataset['local_authority']
    .str.lower()
    .str.replace('-', ' ', regex=False)
    .str.replace(',', ' ', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Names whose normalised form still carries an "X, City of" / "X, County of"
# style suffix or punctuation.
dataset['local_authority'] = dataset['local_authority'].replace({
    'kingston upon hull city of': 'kingston upon hull',
    'herefordshire county of': 'herefordshire',
    'bristol city of': 'bristol',
    'st. helens': 'st helens',
})

# Target authority name -> alternative / predecessor names that should be
# folded into it. Every alias must appear under exactly one target.
# 'christchurch' is listed only under 'bournemouth christchurch and poole',
# which is where the previous (duplicated) listing already resolved to.
# Aliases containing hyphens (e.g. 'barrow-in-furness') cannot occur in
# `dataset` after the normalisation above; they are kept in case the map is
# applied to un-normalised names elsewhere.
unified_map = {
    'buckinghamshire': ['aylesbury vale', 'chiltern', 'south bucks', 'wycombe', 'south buckinghamshire'],
    'dorset': ['weymouth and portland', 'west dorset', 'north dorset', 'east dorset', 'purbeck'],
    'somerset': ['taunton deane', 'west somerset', 'mendip', 'sedgemoor', 'south somerset', 'somerset council', 'somerset west and taunton'],
    'cumberland': ['allerdale', 'carlisle', 'copeland', 'cumberland council'],
    'westmorland and furness': ['barrow in furness', 'barrow-in-furness', 'eden', 'south lakeland'],
    'north yorkshire': ['craven', 'hambleton', 'harrogate', 'richmondshire', 'ryedale', 'scarborough', 'selby', 'north yorkshire council'],
    'bournemouth christchurch and poole': ['bournemouth', 'christchurch', 'poole'],
    'west suffolk': ['forest heath', 'st edmundsbury'],
    'east suffolk': ['suffolk coastal', 'waveney'],
    'bath and north east somerset': ['bath and ne somerset'],
    'southend-on-sea': ['southend on sea'],
    'leicester': ['leicester city'],
    'medway': ['medway towns'],
    'derby': ['derby city'],
    'folkestone and hythe': ['shepway'],
    'county durham': ['durham'],
    "king's lynn and west norfolk": ['kings lynn and west norfolk'],
    'north northamptonshire': ['wellingborough', 'east northamptonshire', 'kettering', 'corby'],
    'west northamptonshire': ['northampton', 'south northamptonshire', 'daventry'],
}

# Create reverse mapping: alias -> target. A plain dict comprehension would let
# a later target silently overwrite an earlier one, so conflicts are rejected.
reverse_map = {}
for unified, alts in unified_map.items():
    for alt_name in alts:
        if reverse_map.setdefault(alt_name, unified) != unified:
            raise ValueError(
                f"Alias {alt_name!r} is mapped to both "
                f"{reverse_map[alt_name]!r} and {unified!r} in unified_map"
            )

# Replace values in dataset['local_authority'] using the reverse_map
dataset['local_authority'] = dataset['local_authority'].replace(reverse_map)

# Collapse to one row per authority by averaging the scores of the rows that
# now share a name.
# NOTE(review): this is an unweighted mean of the district scores; if the
# merged districts differ a lot in population, a population-weighted mean may
# be more appropriate — confirm.
dataset = (
    dataset.groupby('local_authority', as_index=False)['imd_score']
    .mean()
)


In [4]:
# Attach each authority's IMD score to the panel rows.
# Columns left behind by a previous run of this cell are dropped first;
# otherwise a second merge would produce imd_score_x / imd_score_y and the
# lookups of 'imd_score' below would raise KeyError.
filtered_panel = (
    filtered_panel
    .drop(columns=['imd_score', 'imd_score_sq'], errors='ignore')
    .merge(
        dataset[['local_authority', 'imd_score']],
        on='local_authority',
        how='left',
        # The lookup must hold one row per authority; raise instead of
        # silently duplicating panel rows if it does not.
        validate='many_to_one',
    )
)

# Squared score, used by the specifications that include imd_score_sq.
filtered_panel["imd_score_sq"] = filtered_panel["imd_score"] ** 2

# Authorities in the panel that found no IMD match. Kept as the last
# expression of the cell so the notebook actually displays it.
unmatched_authorities = filtered_panel.loc[
    filtered_panel['imd_score'].isna(), 'local_authority'
].unique()
unmatched_authorities

In [6]:
# OLS of removals on value and imd_score, with categorical financial_year and
# size_category terms and a value-by-size_category interaction.
model = smf.ols(
    formula='removals ~ ' + ' + '.join([
        'value',
        'imd_score',
        'C(financial_year)',
        'C(size_category)',
        'value:C(size_category)',
    ]),
    data=filtered_panel,
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.439
Model:                            OLS   Adj. R-squared:                  0.438
Method:                 Least Squares   F-statistic:                     446.1
Date:                Fri, 01 Aug 2025   Prob (F-statistic):               0.00
Time:                        17:00:59   Log-Likelihood:                -24180.
No. Observations:                7989   AIC:                         4.839e+04
Df Residuals:                    7974   BIC:                         4.849e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

In [9]:
# Same specification as the single-period model, extended with three lags of
# value; each lag is also interacted with size_category.
model = smf.ols(
    formula='removals ~ ' + ' + '.join([
        # current and lagged value
        'value', 'value_lag1', 'value_lag2', 'value_lag3',
        # controls
        'imd_score', 'C(financial_year)', 'C(size_category)',
        # value (and its lags) by size category
        'value:C(size_category)',
        'value_lag1:C(size_category)',
        'value_lag2:C(size_category)',
        'value_lag3:C(size_category)',
    ]),
    data=filtered_panel,
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.500
Model:                            OLS   Adj. R-squared:                  0.498
Method:                 Least Squares   F-statistic:                     264.9
Date:                Fri, 01 Aug 2025   Prob (F-statistic):               0.00
Time:                        17:37:08   Log-Likelihood:                -15969.
No. Observations:                5322   AIC:                         3.198e+04
Df Residuals:                    5301   BIC:                         3.212e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score, mean_squared_error

# Hold-out evaluation of the single-period specification: split the panel
# 70/30 within each financial year, fit on the training part, and report
# R-squared / RMSE on the test part.

# ----------------------------
# Step 1: Ensure consistent categorical dtype
# ----------------------------
# Fix the level set on the full panel so every frame derived from it shares the
# same categories. Missing values are excluded before sorting: they cannot be
# used as a category and cannot be ordered against the other values. Rows with
# a missing value simply stay missing in the resulting categorical.
for col in ['financial_year', 'size_category']:
    filtered_panel[col] = pd.Categorical(
        filtered_panel[col],
        categories=sorted(filtered_panel[col].dropna().unique())
    )

# ----------------------------
# Step 2: Stratified 70/30 split within each financial year
# ----------------------------
train_list = []
test_list = []

for year in filtered_panel['financial_year'].cat.categories:
    year_data = filtered_panel[filtered_panel['financial_year'] == year]
    if len(year_data) > 1:
        # Fixed seed so the split is reproducible across runs.
        train_split, test_split = train_test_split(
            year_data, test_size=0.3, random_state=42
        )
        train_list.append(train_split)
        test_list.append(test_split)
    else:
        # Too few rows to split: keep the year in the training data only.
        train_list.append(year_data)

if not test_list:
    raise ValueError(
        "No financial year has more than one row, so no test set could be built."
    )

train_data = pd.concat(train_list)
test_data = pd.concat(test_list)

# ----------------------------
# Step 3: Clean test set — drop rows with missing predictors or outcome
# ----------------------------
required_vars = ['value', 'imd_score', 'financial_year', 'size_category']
# The outcome must be present as well: the formula fit drops incomplete
# training rows on its own, but r2_score / mean_squared_error reject NaN in
# y_true, so an unobserved `removals` in the test set would abort the
# evaluation.
test_data_clean = test_data.dropna(subset=required_vars + ['removals']).copy()

# Ensure categorical levels match
for col in ['financial_year', 'size_category']:
    test_data_clean[col] = pd.Categorical(
        test_data_clean[col],
        categories=train_data[col].cat.categories
    )

# ----------------------------
# Step 4: Fit the model
# ----------------------------
train_model = smf.ols(
    formula=(
        'removals ~ '
        'value + '
        'imd_score + C(financial_year) + C(size_category) + '
        'value:C(size_category)'
    ),
    data=train_data
).fit()

# ----------------------------
# Step 5: Predict and evaluate
# ----------------------------
y_pred = train_model.predict(test_data_clean)
y_true = test_data_clean['removals']

print("Test R-squared:", r2_score(y_true, y_pred))
print("Test RMSE:", mean_squared_error(y_true, y_pred) ** 0.5)
print("Training R-squared:", train_model.rsquared)


In [11]:
# Single-period specification with a quadratic in the IMD score
# (imd_score plus the precomputed imd_score_sq column).
model = smf.ols(
    formula=(
        'removals ~ value + imd_score  + imd_score_sq'
        ' + C(financial_year) + C(size_category)'
        ' + value:C(size_category)'
    ),
    data=filtered_panel,
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.440
Model:                            OLS   Adj. R-squared:                  0.439
Method:                 Least Squares   F-statistic:                     417.3
Date:                Fri, 01 Aug 2025   Prob (F-statistic):               0.00
Time:                        17:39:58   Log-Likelihood:                -24175.
No. Observations:                7989   AIC:                         4.838e+04
Df Residuals:                    7973   BIC:                         4.849e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

In [13]:
# Lagged specification (value and three lags, each interacted with
# size_category) with the quadratic IMD term added.
model = smf.ols(
    formula='removals ~ ' + ' + '.join([
        # current and lagged value
        'value', 'value_lag1', 'value_lag2', 'value_lag3',
        # controls, including the squared IMD score
        'imd_score', 'imd_score_sq', 'C(financial_year)', 'C(size_category)',
        # value (and its lags) by size category
        'value:C(size_category)',
        'value_lag1:C(size_category)',
        'value_lag2:C(size_category)',
        'value_lag3:C(size_category)',
    ]),
    data=filtered_panel,
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.500
Model:                            OLS   Adj. R-squared:                  0.498
Method:                 Least Squares   F-statistic:                     252.6
Date:                Fri, 01 Aug 2025   Prob (F-statistic):               0.00
Time:                        17:41:26   Log-Likelihood:                -15967.
No. Observations:                5322   AIC:                         3.198e+04
Df Residuals:                    5300   BIC:                         3.212e+04
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------