In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import re
import itertools

In [2]:
council_housing = pd.read_csv(r'../../data/processed/council_housing_cleaned.csv')
dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')

In [3]:
# Step 1: Create full panel of all combinations (excluding NaN size categories)
authorities = dataset['local_authority'].unique()
years = dataset['removal_fy'].unique()
sizes = dataset['size_category'].dropna().unique()

panel = pd.DataFrame(
    itertools.product(authorities, years, sizes),
    columns=['local_authority', 'financial_year', 'size_category']
)

# Step 2: Count removals
removals = (
    dataset
    .groupby(['local_authority', 'removal_fy', 'size_category'])
    .size()
    .reset_index(name='removals')
    .rename(columns={'removal_fy': 'financial_year'})
)

# Step 3: Merge with full panel and fill missing with 0
panel = panel.merge(removals, on=['local_authority', 'financial_year', 'size_category'], how='left').fillna({'removal': 0})

# Step 4: Add category-level totals
category = (
    dataset
    .groupby(['local_authority', 'removal_fy'])[
        ['Housing_And_Infrastructure', 'Community_And_Social_Welfare', 'Grantmaking_And_Financial_Support']
    ]
    .sum()
    .reset_index()
    .rename(columns={'removal_fy': 'financial_year'})
)
panel = panel.merge(category, on=['local_authority', 'financial_year'], how='left')
panel = panel[(panel['financial_year'] >= 2015) & (panel['financial_year'] <= 2023)]

# Step 5: Merge with council housing data
council_housing = council_housing.rename(columns={
    'Local authority': 'local_authority', 'Financial_Year': 'financial_year'
})
panel = panel.merge(council_housing, on=['local_authority', 'financial_year'], how='left')

# Step 6: Process sale columns
sale_cols = [
    'Right_to_Buy_total_number_of_dwellings', 'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings', 'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs', 'Sales_of_Shared_Ownership_number_of_dwellings'
]
panel[sale_cols] = panel[sale_cols].apply(pd.to_numeric, errors='coerce')
panel = panel.dropna(subset=['Right_to_Buy_total_number_of_dwellings'])
panel[sale_cols[1:]] = panel[sale_cols[1:]].fillna(0)
panel['total_sales'] = panel[sale_cols].sum(axis=1)

# Step 7: Generate lag variables
panel['financial_year'] = panel['financial_year'].astype(int)
panel.sort_values(by=['local_authority', 'size_category', 'financial_year'], inplace=True)
for lag in [1, 2, 3]:
    panel[f'totalsales_lag{lag}'] = panel.groupby(['local_authority', 'size_category'])['total_sales'].shift(lag)

# Step 8: Format categories
panel['local_authority'] = panel['local_authority'].astype('category')
panel['financial_year'] = panel['financial_year'].astype('category')
panel['size_category'] = panel['size_category'].str.capitalize().astype('category')

In [4]:
panel

Unnamed: 0,local_authority,financial_year,size_category,removals,Housing_And_Infrastructure,Community_And_Social_Welfare,Grantmaking_And_Financial_Support,Right_to_Buy_total_number_of_dwellings,Social_Homebuy_number_of_dwellings,Other_sales_to_sitting_tenants_number_of_dwellings,Other_sales_number_of_dwellings,Transfers_to_PRPs,Sales_of_Shared_Ownership_number_of_dwellings,total_sales,totalsales_lag1,totalsales_lag2,totalsales_lag3
6056,Adur,2015,Large,,0.0,2.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,9.0,,,
6041,Adur,2016,Large,,3.0,4.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,7.0,9.0,,
6044,Adur,2017,Large,,1.0,4.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,8.0,7.0,9.0,
6038,Adur,2018,Large,,0.0,2.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,9.0,8.0,7.0,9.0
6035,Adur,2019,Large,,0.0,1.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,12.0,9.0,8.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2956,York,2019,Small,24.0,6.0,25.0,15.0,58.0,0.0,0.0,0.0,0.0,24.0,82.0,68.0,73.0,79.0
2971,York,2020,Small,11.0,4.0,13.0,4.0,46.0,0.0,0.0,0.0,0.0,27.0,73.0,82.0,68.0,73.0
2968,York,2021,Small,19.0,8.0,18.0,7.0,74.0,0.0,0.0,0.0,0.0,19.0,93.0,73.0,82.0,68.0
2953,York,2022,Small,17.0,8.0,13.0,11.0,52.0,0.0,0.0,0.0,0.0,13.0,65.0,93.0,73.0,82.0


In [5]:
# Run regression
model = smf.ols(
    'removals ~ total_sales + totalsales_lag1 + totalsales_lag2 + totalsales_lag3 + C(local_authority) + C(financial_year)',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.329
Model:                            OLS   Adj. R-squared:                  0.262
Method:                 Least Squares   F-statistic:                     4.923
Date:                Wed, 09 Jul 2025   Prob (F-statistic):          1.84e-114
Time:                        11:56:20   Log-Likelihood:                -10048.
No. Observations:                3228   AIC:                         2.068e+04
Df Residuals:                    2935   BIC:                         2.246e+04
Df Model:                         292                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [6]:
# Step 4: Run regression with interactions and fixed effects, clustered SEs
model = smf.ols(
    'removals ~ total_sales + C(size_category) + C(local_authority) + C(financial_year) + total_sales:C(size_category)',
    data=panel
).fit()

# Step 5: Print regression summary
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.674
Model:                            OLS   Adj. R-squared:                  0.653
Method:                 Least Squares   F-statistic:                     31.72
Date:                Wed, 09 Jul 2025   Prob (F-statistic):               0.00
Time:                        11:56:21   Log-Likelihood:                -13625.
No. Observations:                4996   AIC:                         2.786e+04
Df Residuals:                    4689   BIC:                         2.986e+04
Df Model:                         306                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [7]:
# Step 4: Run regression with lag
model = smf.ols(
    'removals ~ C(size_category) + total_sales + totalsales_lag1 + totalsales_lag2 + C(local_authority) + C(financial_year) + C(size_category):(total_sales + totalsales_lag1 + totalsales_lag2)',
    data=panel 
).fit()

# Step 5: Print results
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.679
Model:                            OLS   Adj. R-squared:                  0.651
Method:                 Least Squares   F-statistic:                     24.30
Date:                Wed, 09 Jul 2025   Prob (F-statistic):               0.00
Time:                        11:56:21   Log-Likelihood:                -10448.
No. Observations:                3822   AIC:                         2.151e+04
Df Residuals:                    3515   BIC:                         2.343e+04
Df Model:                         306                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [8]:
# Step 4: Run regression with lag
model = smf.ols(
    'removals ~ C(size_category) + total_sales + totalsales_lag1 + totalsales_lag2 + totalsales_lag3 + C(local_authority) + C(financial_year) + C(size_category):(total_sales + totalsales_lag1 + totalsales_lag2 + totalsales_lag3)',
    data=panel
).fit()

# Step 5: Print results
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.680
Model:                            OLS   Adj. R-squared:                  0.647
Method:                 Least Squares   F-statistic:                     20.60
Date:                Wed, 09 Jul 2025   Prob (F-statistic):               0.00
Time:                        11:56:21   Log-Likelihood:                -8851.6
No. Observations:                3228   AIC:                         1.831e+04
Df Residuals:                    2925   BIC:                         2.015e+04
Df Model:                         302                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [9]:
# Run regression
model = smf.ols(
    'removals ~ total_sales + totalsales_lag1 + totalsales_lag2 + totalsales_lag3 + C(local_authority) + C(financial_year) + Housing_And_Infrastructure + Community_And_Social_Welfare + Grantmaking_And_Financial_Support',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.377
Model:                            OLS   Adj. R-squared:                  0.315
Method:                 Least Squares   F-statistic:                     6.026
Date:                Wed, 09 Jul 2025   Prob (F-statistic):          1.37e-152
Time:                        11:56:21   Log-Likelihood:                -9926.5
No. Observations:                3228   AIC:                         2.044e+04
Df Residuals:                    2932   BIC:                         2.224e+04
Df Model:                         295                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------