In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import re

In [33]:
council_housing = pd.read_csv(r'../../data/processed/council_housing_cleaned.csv')
dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')
gbp = pd.read_csv(r'../../data/raw/UK_monthly_GDP.csv', skiprows = 6)

In [34]:
gbp

Unnamed: 0,Date,Monthly GDP
0,Jan 2007,82.9
1,Feb 2007,83.2
2,Mar 2007,83.3
3,Apr 2007,83.5
4,May 2007,83.9
...,...,...
215,Dec 2024,101.9
216,Jan 2025,101.9
217,Feb 2025,102.4
218,Mar 2025,102.6


In [None]:
# Step 1: Active charities at end of each year (up to that FY)
active_counts = (
    dataset[(dataset['charity_status'] == 'active') | (dataset['removal_fy'].isna())]
    .assign(reg_fy=dataset['registration_fy'].astype('Int64'))
    .groupby(['local_authority', 'reg_fy'])
    .size()
    .groupby(level=0).cumsum()  # cumulative active up to that year
    .reset_index(name='cumulative_active')
    .rename(columns={'reg_fy': 'financial_year'})
)

# Step 2: Removals per financial year
# Filter to small and medium inactive charities
sm_removals = dataset[
    (dataset['charity_status'] == 'inactive') &
    (dataset['size_category'].isin(['Small', 'Medium']))
]

# Group by local authority, financial year, and size_category
removal_counts = (
    sm_removals
    .groupby(['local_authority', 'removal_fy', 'size_category'])
    .size()
    .reset_index(name='removals')
    .rename(columns={'removal_fy': 'financial_year'})
)

# Step 3: Merge and compute removal rate
df = pd.merge(removal_counts, active_counts, on=['local_authority', 'financial_year'], how='left')

# Step 4: Shift cumulative_active to previous year for correct denominator
df['prev_year'] = df['financial_year'] - 1
prev_active = active_counts.rename(columns={
    'financial_year': 'prev_year',
    'cumulative_active': 'active_last_year'
})

df = df.merge(prev_active, on=['local_authority', 'prev_year'], how='left')

# Step 5: Calculate removal rate
df['removal_rate'] = df['removals'] / df['active_last_year']

In [5]:
# Standardise column names
df['financial_year'] = df['financial_year'].astype(int)
council_housing = council_housing.rename(columns={'Local authority': 'local_authority'})
council_housing = council_housing.rename(columns={'Financial_Year': 'financial_year'})

# Merge datasets
panel = pd.merge(df, council_housing, on=['local_authority', 'financial_year'], how='right')


In [None]:
panel = panel[(panel['financial_year'] >= 2014) & (panel['financial_year'] <= 2023)]

panel


In [17]:
# Treat as category
panel['local_authority'] = panel['local_authority'].astype('category')

# Sale columns
sale_cols = [
    'Right_to_Buy_total_number_of_dwellings',
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]

# Convert to numeric and calculate total sales
for col in sale_cols:
    panel[col] = pd.to_numeric(panel[col], errors='coerce')

panel['total_sales'] = panel[sale_cols].fillna(0).sum(axis=1)

# Add size dummies
panel['is_small'] = (panel['size_category'] == 'Small').astype(int)
panel['is_medium'] = (panel['size_category'] == 'Medium').astype(int)


# Run regression
model = smf.ols(
    'removal_rate ~ total_sales + is_small + is_medium + C(local_authority) + C(financial_year)',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.556
Model:                            OLS   Adj. R-squared:                  0.526
Method:                 Least Squares   F-statistic:                     18.96
Date:                Fri, 04 Jul 2025   Prob (F-statistic):               0.00
Time:                        22:19:25   Log-Likelihood:                 17090.
No. Observations:                4915   AIC:                        -3.357e+04
Df Residuals:                    4610   BIC:                        -3.159e+04
Df Model:                         304                                         
Covariance Type:            nonrobust                                         
                                                                                                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [20]:
panel['total_sales_lag1'] = panel.groupby('local_authority')['total_sales'].shift(1)
# Run regression
model = smf.ols(
    'removal_rate ~ total_sales_lag1 + is_small + is_medium + C(local_authority) + C(financial_year)',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

  panel['total_sales_lag1'] = panel.groupby('local_authority')['total_sales'].shift(1)


                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.559
Model:                            OLS   Adj. R-squared:                  0.528
Method:                 Least Squares   F-statistic:                     18.09
Date:                Fri, 04 Jul 2025   Prob (F-statistic):               0.00
Time:                        22:22:05   Log-Likelihood:                 16155.
No. Observations:                4650   AIC:                        -3.170e+04
Df Residuals:                    4345   BIC:                        -2.973e+04
Df Model:                         304                                         
Covariance Type:            nonrobust                                         
                                                                                                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [None]:
# Example: National-level aggregation by financial year
ts_df = panel.groupby('financial_year')[['total_sales', 'removal_rate']].mean().dropna()

# Check for stationarity using ADF test
from statsmodels.tsa.stattools import adfuller
for col in ['total_sales', 'removal_rate']:
    result = adfuller(ts_df[col])
    print(f"{col} ADF p-value: {result[1]}")

# Engle-Granger cointegration test
from statsmodels.tsa.stattools import coint

score, pvalue, _ = coint(ts_df['removal_rate'], ts_df['total_sales'])
print(f"Cointegration test statistic: {score}")
print(f"P-value: {pvalue}")


total_sales ADF p-value: 0.24241185967325674
removal_rate ADF p-value: 8.084438840754101e-05
Cointegration test statistic: -0.0
P-value: 0.9859002580259643


In [30]:
# Ensure council is treated as a categorical variable
panel['local_authority'] = panel['local_authority'].astype('category')
sale_cols = [
    'Right_to_Buy_total_number_of_dwellings',
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]

# Convert to numeric safely
for col in sale_cols:
    panel[col] = pd.to_numeric(panel[col], errors='coerce')

# Then sum safely
panel['total_sales'] = panel[sale_cols].fillna(0).sum(axis=1)

# Run OLS with council fixed effects
model = smf.ols(
    'removal_rate ~ total_sales + C(local_authority) + C(financial_year)', 
    data=panel
).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.168
Model:                            OLS   Adj. R-squared:                  0.113
Method:                 Least Squares   F-statistic:                     3.076
Date:                Fri, 04 Jul 2025   Prob (F-statistic):           4.71e-57
Time:                        22:28:42   Log-Likelihood:                 15548.
No. Observations:                4915   AIC:                        -3.049e+04
Df Residuals:                    4611   BIC:                        -2.851e+04
Df Model:                         303                                         
Covariance Type:            nonrobust                                         
                                                                                                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [29]:
model = smf.ols(
    'removal_rate ~ Right_to_Buy_total_number_of_dwellings + '
    'Social_Homebuy_number_of_dwellings + '
    'Other_sales_to_sitting_tenants_number_of_dwellings + '
    'Other_sales_number_of_dwellings + '
    'Transfers_to_PRPs + '
    'Sales_of_Shared_Ownership_number_of_dwellings + '
    'C(local_authority)',
    data=panel
).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.111
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     1.067
Date:                Fri, 04 Jul 2025   Prob (F-statistic):              0.219
Time:                        22:28:32   Log-Likelihood:                 8809.3
No. Observations:                2833   AIC:                        -1.702e+04
Df Residuals:                    2534   BIC:                        -1.524e+04
Df Model:                         298                                         
Covariance Type:            nonrobust                                         
                                                                                                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------