In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import re

In [2]:
council_housing = pd.read_csv(r'../../data/processed/council_housing_cleaned.csv')
dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')

In [3]:
removals = (
    dataset
    .groupby(['local_authority', 'removal_fy'])
    .size()
    .reset_index(name='removals')
    .rename(columns={'removal_fy': 'financial_year'})
)

removals_by_size = (
    dataset
    .groupby(['local_authority', 'removal_fy', 'size_category'])
    .size()
    .unstack(fill_value=0)  # Pivot size_category to columns
    .reset_index()
    .rename(columns={'removal_fy': 'financial_year'})
)

removals_by_size = removals_by_size.rename(columns={
    'Small': 'removals_small',
    'Medium': 'removals_medium',
    'Large': 'removals_large'  # if exists
})

category = (
    dataset
    .groupby(['local_authority', 'removal_fy'])[['Housing_And_Infrastructure', 'Community_And_Social_Welfare', 'Grantmaking_And_Financial_Support']]
    .sum()
    .reset_index()
    .rename(columns={'removal_fy': 'financial_year'})
)

council_housing = council_housing.rename(columns={'Local authority': 'local_authority'})
council_housing = council_housing.rename(columns={'Financial_Year': 'financial_year'})

# Merge datasets
panel = pd.merge(removals, removals_by_size, on=['local_authority', 'financial_year'], how='left')
panel = pd.merge(panel, category , on=['local_authority', 'financial_year'], how='left')
panel = pd.merge(panel, council_housing , on=['local_authority', 'financial_year'], how='left')
panel = panel[(panel['financial_year'] >= 2015) & (panel['financial_year'] <= 2023)]

# Treat as category
panel['local_authority'] = panel['local_authority'].astype('category')

# Sale columns
sale_cols = [
    'Right_to_Buy_total_number_of_dwellings',
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]

# Convert to numeric
for col in sale_cols:
    panel[col] = pd.to_numeric(panel[col], errors='coerce')

# Drop rows where 'Right_to_Buy_total_number_of_dwellings' is NaN
panel = panel.dropna(subset=['Right_to_Buy_total_number_of_dwellings'])

# Fill NaN with 0 in the remaining sale columns
other_sale_cols = [
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]
panel[other_sale_cols] = panel[other_sale_cols].fillna(0)
# Calculate total sales
panel['total_sales'] = panel[sale_cols].sum(axis=1)
panel['totalsales_lag1'] = panel.groupby(['local_authority'])['total_sales'].shift(1)
panel['totalsales_lag2'] = panel.groupby(['local_authority'])['total_sales'].shift(2)
panel['totalsales_lag3'] = panel.groupby(['local_authority'])['total_sales'].shift(3)

# Step 1: Melt removals into long format by size
panel_long = panel.melt(
    id_vars=['local_authority', 'financial_year', 'total_sales','totalsales_lag1','totalsales_lag2', 'totalsales_lag3'],
    value_vars=['removals_small', 'removals_medium', 'removals_large'],
    var_name='size_category',
    value_name='removal'
)

# Step 2: Clean size_category labels
panel_long['size_category'] = panel_long['size_category'].str.replace('removals_', '').str.capitalize()

# Step 3: Convert categorical variables if not already
panel_long['local_authority'] = panel_long['local_authority'].astype('category')
panel_long['financial_year'] = panel_long['financial_year'].astype('category')
panel_long['size_category'] = panel_long['size_category'].astype('category')


  panel['totalsales_lag1'] = panel.groupby(['local_authority'])['total_sales'].shift(1)
  panel['totalsales_lag2'] = panel.groupby(['local_authority'])['total_sales'].shift(2)
  panel['totalsales_lag3'] = panel.groupby(['local_authority'])['total_sales'].shift(3)


In [4]:
# Run regression
model = smf.ols(
    'removals ~ total_sales + totalsales_lag1 + totalsales_lag2 + totalsales_lag3 + C(local_authority) + C(financial_year)',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.861
Model:                            OLS   Adj. R-squared:                  0.830
Method:                 Least Squares   F-statistic:                     28.07
Date:                Mon, 07 Jul 2025   Prob (F-statistic):               0.00
Time:                        15:46:53   Log-Likelihood:                -4545.1
No. Observations:                1615   AIC:                             9674.
Df Residuals:                    1323   BIC:                         1.125e+04
Df Model:                         291                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [5]:
# Step 4: Run regression with interactions and fixed effects, clustered SEs
model = smf.ols(
    'removal ~ total_sales + C(size_category) + C(local_authority) + C(financial_year) + total_sales:C(size_category)',
    data=panel_long
).fit()

# Step 5: Print regression summary
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                removal   R-squared:                       0.686
Model:                            OLS   Adj. R-squared:                  0.673
Method:                 Least Squares   F-statistic:                     51.14
Date:                Mon, 07 Jul 2025   Prob (F-statistic):               0.00
Time:                        15:46:53   Log-Likelihood:                -19721.
No. Observations:                7464   AIC:                         4.006e+04
Df Residuals:                    7157   BIC:                         4.218e+04
Df Model:                         306                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [30]:
# Step 4: Run regression with lag
model = smf.ols(
    'removal ~ C(size_category) + total_sales + totalsales_lag1 + totalsales_lag2 + C(local_authority) + C(financial_year) + C(size_category):(total_sales + totalsales_lag1 + totalsales_lag2)',
    data=panel_long
).fit()

# Step 5: Print results
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                removal   R-squared:                       0.693
Model:                            OLS   Adj. R-squared:                  0.676
Method:                 Least Squares   F-statistic:                     39.92
Date:                Mon, 07 Jul 2025   Prob (F-statistic):               0.00
Time:                        16:05:39   Log-Likelihood:                -15096.
No. Observations:                5712   AIC:                         3.081e+04
Df Residuals:                    5405   BIC:                         3.285e+04
Df Model:                         306                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [7]:
# Step 4: Run regression with lag
model = smf.ols(
    'removal ~ C(size_category) + total_sales + totalsales_lag1 + totalsales_lag2 + totalsales_lag3 + C(local_authority) + C(financial_year) + C(size_category):(total_sales + totalsales_lag1 + totalsales_lag2 + totalsales_lag3)',
    data=panel_long
).fit()

# Step 5: Print results
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                removal   R-squared:                       0.696
Model:                            OLS   Adj. R-squared:                  0.675
Method:                 Least Squares   F-statistic:                     34.47
Date:                Mon, 07 Jul 2025   Prob (F-statistic):               0.00
Time:                        15:46:54   Log-Likelihood:                -12819.
No. Observations:                4842   AIC:                         2.624e+04
Df Residuals:                    4540   BIC:                         2.820e+04
Df Model:                         301                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [16]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
import pandas as pd

# Define a simplified formula (no interactions or fixed effects)
formula = 'removal ~ total_sales + totalsales_lag1 + totalsales_lag2 + totalsales_lag3'

# Create design matrices
y, X = dmatrices(formula, data=panel_long)

# Convert X to a proper DataFrame with column names
X = pd.DataFrame(X, columns=X.design_info.column_names)

# Calculate VIF
vif_df = pd.DataFrame({
    'variable': X.columns,
    'VIF': [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
})

print(vif_df)


          variable        VIF
0        Intercept   1.263504
1      total_sales  10.321284
2  totalsales_lag1  13.146137
3  totalsales_lag2  14.325333
4  totalsales_lag3   9.816155


In [8]:
# Run regression
model = smf.ols(
    'removals ~ total_sales + totalsales_lag1 + totalsales_lag2 + totalsales_lag3 + C(local_authority) + C(financial_year) + Housing_And_Infrastructure + Community_And_Social_Welfare + Grantmaking_And_Financial_Support',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.985
Model:                            OLS   Adj. R-squared:                  0.982
Method:                 Least Squares   F-statistic:                     297.3
Date:                Mon, 07 Jul 2025   Prob (F-statistic):               0.00
Time:                        15:46:54   Log-Likelihood:                -2738.6
No. Observations:                1615   AIC:                             6067.
Df Residuals:                    1320   BIC:                             7656.
Df Model:                         294                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------