In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import re

In [None]:
council_housing = pd.read_csv(r'../../data/processed/council_housing_cleaned.csv')
dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')
gbp = pd.read_csv(r'../../data/raw/UK_monthly_GDP.csv', skiprows = 6)
quarterly = pd.read_excel(r'../../data/raw/Quarterly_right_to_buy.ods', engine = 'odf', skiprows = 2)

In [3]:
# Replace empty strings or whitespace with actual NaN
quarterly['Lower and Single Tier Authority Data'] = quarterly['Lower and Single Tier Authority Data'].replace(r'^\s*$', pd.NA, regex=True)

# Drop rows where the key column is NaN
quarterly = quarterly.dropna(subset=['Lower and Single Tier Authority Data'])
quarterly = quarterly.drop(columns=['CurrentONS code', 'Unnamed: 0'], errors='ignore')

# Rename the first column
quarterly = quarterly.rename(columns={'Lower and Single Tier Authority Data': 'local_authority'})


In [4]:
# Keep only relevant columns (2010-11 Q1 to 2019-20 Q4)
relevant_cols = ['local_authority'] + [
    col for col in quarterly.columns 
    if col.startswith(('2010-', '2011-', '2012-', '2013-', '2014-', '2015-', '2016-', '2017-', '2018-', '2019-'))
]
quarterly = quarterly[relevant_cols]

# Melt to long format
long_quarterly = quarterly.melt(
    id_vars='local_authority', 
    var_name='quarter', 
    value_name='sales'
)

# Parse to period[Q] directly
def parse_to_period(q):
    try:
        fy, q_part = q.split(' ')
        start_year = int(fy.split('-')[0])
        quarter = int(q_part[1])
        year = start_year if quarter in [1, 2, 3] else start_year + 1
        return f"{year}Q{quarter}"
    except:
        return np.nan

# Apply and convert to period
long_quarterly['quarter'] = long_quarterly['quarter'].apply(parse_to_period)
long_quarterly = long_quarterly.dropna(subset=['quarter'])
long_quarterly['quarter'] = long_quarterly['quarter'].astype('period[Q]')

# Sort for consistency
long_quarterly = long_quarterly.sort_values(['local_authority', 'quarter']).reset_index(drop=True)


In [5]:
# Ensure datetime
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')
dataset['date_of_removal'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce')

# Step 1: Assign quarters
dataset['registration_quarter'] = dataset['date_of_registration'].dt.to_period('Q')
dataset['removal_quarter'] = dataset['date_of_removal'].dt.to_period('Q')

# Step 2: Active charities up to each quarter
active_counts = (
    dataset[(dataset['charity_status'] == 'active') | (dataset['date_of_removal'].isna())]
    .groupby(['local_authority', 'registration_quarter'])
    .size()
    .groupby(level=0).cumsum()
    .reset_index(name='cumulative_active')
    .rename(columns={'registration_quarter': 'quarter'})
)

# Step 3: Removals per quarter (small + medium only)
sm_removals = dataset[
    (dataset['charity_status'] == 'inactive')
    # & (dataset['size_category'].isin(['Small', 'Medium']))
].copy()

removal_counts = (
    sm_removals
    .groupby(['local_authority', 'removal_quarter', 'size_category'])
    .size()
    .reset_index(name='removals')
    .rename(columns={'removal_quarter': 'quarter'})
)

# Step 4: Merge active and removal
df = pd.merge(removal_counts, active_counts, on=['local_authority', 'quarter'], how='left')

# Step 5: Shift active count by 1 quarter
active_counts = active_counts.rename(columns={
    'quarter': 'prev_quarter',
    'cumulative_active': 'active_last_quarter'
})
df['prev_quarter'] = (df['quarter'] - 1).astype('str').apply(pd.Period, freq='Q')
df = df.merge(active_counts, on=['local_authority', 'prev_quarter'], how='left')

# Step 6: Calculate removal rate
df['removal_rate'] = df['removals'] / df['active_last_quarter']


In [6]:
# Merge on local authority and quarter
merged_panel = pd.merge(df,long_quarterly, on=['local_authority', 'quarter'], how='outer')
merged_panel

Unnamed: 0,local_authority,quarter,size_category,removals,cumulative_active,prev_quarter,active_last_quarter,removal_rate,sales
0,Aberdeen City,2017Q1,Medium,1.0,,2016Q4,,,
1,Aberdeen City,2017Q1,Small,1.0,,2016Q4,,,
2,Aberdeen City,2018Q1,Medium,1.0,,2017Q4,,,
3,Aberdeen City,2018Q3,Medium,1.0,,2018Q2,,,
4,Aberdeen City,2021Q4,Small,1.0,,2021Q3,,,
...,...,...,...,...,...,...,...,...,...
26985,York,2024Q4,Small,4.0,687.0,2024Q3,685.0,0.005839,
26986,York,2025Q1,Medium,1.0,696.0,2024Q4,687.0,0.001456,
26987,York,2025Q1,Small,7.0,696.0,2024Q4,687.0,0.010189,
26988,York,2025Q2,Medium,2.0,699.0,2025Q1,696.0,0.002874,


In [7]:
merged_panel

Unnamed: 0,local_authority,quarter,size_category,removals,cumulative_active,prev_quarter,active_last_quarter,removal_rate,sales
0,Aberdeen City,2017Q1,Medium,1.0,,2016Q4,,,
1,Aberdeen City,2017Q1,Small,1.0,,2016Q4,,,
2,Aberdeen City,2018Q1,Medium,1.0,,2017Q4,,,
3,Aberdeen City,2018Q3,Medium,1.0,,2018Q2,,,
4,Aberdeen City,2021Q4,Small,1.0,,2021Q3,,,
...,...,...,...,...,...,...,...,...,...
26985,York,2024Q4,Small,4.0,687.0,2024Q3,685.0,0.005839,
26986,York,2025Q1,Medium,1.0,696.0,2024Q4,687.0,0.001456,
26987,York,2025Q1,Small,7.0,696.0,2024Q4,687.0,0.010189,
26988,York,2025Q2,Medium,2.0,699.0,2025Q1,696.0,0.002874,


In [8]:
merged_panel.dropna(subset=["sales", "removals"], inplace=True)

In [9]:
# Ensure correct types
merged_panel['local_authority'] = merged_panel['local_authority'].astype('category')
merged_panel['quarter'] = merged_panel['quarter'].astype('category')

# Convert quarterly_sales to numeric if needed
merged_panel['sales'] = pd.to_numeric(merged_panel['sales'], errors='coerce')

# Step 3: Create dummies for Small and Medium
# merged_panel['is_small'] = (merged_panel['size_category'] == 'Small').astype(int)
# merged_panel['is_medium'] = (merged_panel['size_category'] == 'Medium').astype(int)


# Define and fit the regression #
model = smf.ols(
    'removals ~ sales + C(local_authority) + C(quarter)'
,
    data=merged_panel
).fit()

# Print summary (non-categorical parts shown at top)
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.292
Model:                            OLS   Adj. R-squared:                  0.264
Method:                 Least Squares   F-statistic:                     10.57
Date:                Sun, 06 Jul 2025   Prob (F-statistic):               0.00
Time:                        16:23:08   Log-Likelihood:                -14906.
No. Observations:                7379   AIC:                         3.037e+04
Df Residuals:                    7101   BIC:                         3.229e+04
Df Model:                         277                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [10]:
# Sort first to ensure proper lagging
merged_panel = merged_panel.sort_values(['local_authority', 'quarter'])
# Create lagged sales
merged_panel['sales_lag1'] = (
    merged_panel.groupby('local_authority')['sales']
    .shift(1)
)
model = smf.ols(
    'removal_rate ~ sales_lag1 + C(local_authority)',
    data=merged_panel
).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.222
Model:                            OLS   Adj. R-squared:                  0.190
Method:                 Least Squares   F-statistic:                     6.930
Date:                Sun, 06 Jul 2025   Prob (F-statistic):          1.91e-191
Time:                        16:23:08   Log-Likelihood:                 26871.
No. Observations:                6489   AIC:                        -5.323e+04
Df Residuals:                    6232   BIC:                        -5.149e+04
Df Model:                         256                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

  merged_panel.groupby('local_authority')['sales']


In [11]:
merged_panel['quarter'] = merged_panel['quarter'].astype('period[Q]')
# Lag 6 months (2 quarters)
merged_panel['sales_lag2'] = merged_panel.groupby('local_authority')['sales'].shift(2)
merged_panel['sales_lag1'] = merged_panel.groupby('local_authority')['sales'].shift(1)
merged_panel['sales_lag3'] = merged_panel.groupby('local_authority')['sales'].shift(3)
merged_panel['sales_lag4'] = merged_panel.groupby('local_authority')['sales'].shift(4)
# Lead 6 months (2 quarters)
merged_panel['sales_lead2'] = merged_panel.groupby('local_authority')['sales'].shift(-2)

# Fit the model
model = smf.ols(
    'removal_rate ~ sales_lag1+sales_lag2+ sales_lag3 + sales_lag4 + C(local_authority)',
    data=merged_panel
).fit()

# Print the summary
print(model.summary())


  merged_panel['sales_lag2'] = merged_panel.groupby('local_authority')['sales'].shift(2)
  merged_panel['sales_lag1'] = merged_panel.groupby('local_authority')['sales'].shift(1)
  merged_panel['sales_lag3'] = merged_panel.groupby('local_authority')['sales'].shift(3)
  merged_panel['sales_lag4'] = merged_panel.groupby('local_authority')['sales'].shift(4)
  merged_panel['sales_lead2'] = merged_panel.groupby('local_authority')['sales'].shift(-2)


                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.226
Model:                            OLS   Adj. R-squared:                  0.190
Method:                 Least Squares   F-statistic:                     6.304
Date:                Sun, 06 Jul 2025   Prob (F-statistic):          3.48e-167
Time:                        16:23:08   Log-Likelihood:                 24265.
No. Observations:                5843   AIC:                        -4.801e+04
Df Residuals:                    5583   BIC:                        -4.628e+04
Df Model:                         259                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [12]:
merged_panel[merged_panel['local_authority'] == 'York']

Unnamed: 0,local_authority,quarter,size_category,removals,cumulative_active,prev_quarter,active_last_quarter,removal_rate,sales,sales_lag1,sales_lag2,sales_lag3,sales_lag4,sales_lead2
26915,York,2014Q4,Small,1.0,500.0,2014Q3,,,17.0,,,,,14.0
26916,York,2015Q1,Medium,2.0,502.0,2014Q4,500.0,0.004,14.0,17.0,,,,18.0
26917,York,2015Q1,Small,1.0,502.0,2014Q4,500.0,0.002,14.0,14.0,17.0,,,18.0
26918,York,2015Q2,Medium,5.0,509.0,2015Q1,502.0,0.00996,18.0,14.0,14.0,17.0,,20.0
26919,York,2015Q2,Small,5.0,509.0,2015Q1,502.0,0.00996,18.0,18.0,14.0,14.0,17.0,16.0
26920,York,2015Q3,Small,2.0,512.0,2015Q2,509.0,0.003929,20.0,18.0,18.0,14.0,14.0,23.0
26922,York,2016Q1,Small,4.0,519.0,2015Q4,513.0,0.007797,16.0,20.0,18.0,18.0,14.0,23.0
26923,York,2016Q2,Medium,1.0,524.0,2016Q1,519.0,0.001927,23.0,16.0,20.0,18.0,18.0,18.0
26924,York,2016Q2,Small,1.0,524.0,2016Q1,519.0,0.001927,23.0,23.0,16.0,20.0,18.0,18.0
26925,York,2016Q3,Medium,3.0,529.0,2016Q2,524.0,0.005725,18.0,23.0,23.0,16.0,20.0,16.0


YEARLY

In [13]:
# Step 1: Active charities at end of each year (up to that FY)
active_counts = (
    dataset[(dataset['charity_status'] == 'active') | (dataset['removal_fy'].isna())]
    .assign(reg_fy=dataset['registration_fy'].astype('Int64'))
    .groupby(['local_authority', 'reg_fy'])
    .size()
    .groupby(level=0).cumsum()  # cumulative active up to that year
    .reset_index(name='cumulative_active')
    .rename(columns={'reg_fy': 'financial_year'})
)

# Step 2: Removals per financial year
# Filter to small and medium inactive charities
sm_removals = dataset[
    (dataset['charity_status'] == 'inactive') 
    # & (dataset['size_category'].isin(['Small', 'Medium']))
]

# Group by local authority, financial year, and size_category
removal_counts = (
    sm_removals
    .groupby(['local_authority', 'removal_fy'])
    .size()
    .reset_index(name='removals')
    .rename(columns={'removal_fy': 'financial_year'})
)

# Step 3: Merge and compute removal rate
df = pd.merge(removal_counts, active_counts, on=['local_authority', 'financial_year'], how='left')

# Step 4: Shift cumulative_active to previous year for correct denominator
df['prev_year'] = df['financial_year'] - 1
prev_active = active_counts.rename(columns={
    'financial_year': 'prev_year',
    'cumulative_active': 'active_last_year'
})

df = df.merge(prev_active, on=['local_authority', 'prev_year'], how='left')

# Step 5: Calculate removal rate
df['removal_rate'] = df['removals'] / df['active_last_year']

In [14]:
# Standardise column names
df['financial_year'] = df['financial_year'].astype(int)
council_housing = council_housing.rename(columns={'Local authority': 'local_authority'})
council_housing = council_housing.rename(columns={'Financial_Year': 'financial_year'})

# Merge datasets
panel = pd.merge(df, council_housing, on=['local_authority', 'financial_year'], how='right')


In [15]:
panel = panel[(panel['financial_year'] >= 2014) & (panel['financial_year'] <= 2023)]

panel


Unnamed: 0,local_authority,financial_year,removals,cumulative_active,prev_year,active_last_year,removal_rate,Right_to_Buy_total_number_of_dwellings,Social_Homebuy_number_of_dwellings,Other_sales_to_sitting_tenants_number_of_dwellings,Other_sales_number_of_dwellings,Transfers_to_PRPs,Sales_of_Shared_Ownership_number_of_dwellings
0,County Durham,2023,33.0,1007.0,2022.0,971.0,0.033986,0.0,0,0,0,0,0
1,Darlington,2023,4.0,173.0,2022.0,165.0,0.024242,20.0,0,0,0,0,0
2,Gateshead,2023,10.0,426.0,2022.0,400.0,0.025000,117.0,0,0,0,0,7
3,Hartlepool,2023,6.0,126.0,2022.0,119.0,0.050420,0.0,0,0,0,0,0
4,Middlesbrough,2023,4.0,200.0,2022.0,191.0,0.020942,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3326,Malvern Hills,2014,5.0,306.0,2013.0,301.0,0.016611,0.0,0.0,0.0,0.0,0.0,
3327,Redditch,2014,1.0,103.0,2013.0,101.0,0.009901,41.0,0.0,0.0,0.0,0.0,
3328,Worcester,2014,,,,,,0.0,0.0,0.0,0.0,0.0,
3329,Wychavon,2014,2.0,362.0,2013.0,350.0,0.005714,0.0,0.0,0.0,0.0,0.0,


In [23]:
# Treat as category
panel['local_authority'] = panel['local_authority'].astype('category')

# Sale columns
sale_cols = [
    'Right_to_Buy_total_number_of_dwellings',
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]

# Convert to numeric and calculate total sales
for col in sale_cols:
    panel[col] = pd.to_numeric(panel[col], errors='coerce')

panel['total_sales'] = panel[sale_cols].fillna(0).sum(axis=1)

# Add size dummies
# panel['is_small'] = (panel['size_category'] == 'Small').astype(int)
# panel['is_medium'] = (panel['size_category'] == 'Medium').astype(int)


# Run regression
model = smf.ols(
    'removals ~ total_sales + C(local_authority) + C(financial_year)',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.851
Model:                            OLS   Adj. R-squared:                  0.832
Method:                 Least Squares   F-statistic:                     45.08
Date:                Sun, 06 Jul 2025   Prob (F-statistic):               0.00
Time:                        21:45:15   Log-Likelihood:                -7625.6
No. Observations:                2701   AIC:                         1.586e+04
Df Residuals:                    2397   BIC:                         1.765e+04
Df Model:                         303                                         
Covariance Type:            nonrobust                                         
                                                                                                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [None]:
panel['total_sales_lag1'] = panel.groupby('local_authority')['total_sales'].shift(1)
# Run regression
model = smf.ols(
    'removal_rate ~ total_sales_lag1 + is_small + is_medium + C(local_authority) + C(financial_year)',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

  panel['total_sales_lag1'] = panel.groupby('local_authority')['total_sales'].shift(1)


                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.547
Model:                            OLS   Adj. R-squared:                  0.515
Method:                 Least Squares   F-statistic:                     17.15
Date:                Sun, 06 Jul 2025   Prob (F-statistic):               0.00
Time:                        15:56:03   Log-Likelihood:                 16014.
No. Observations:                4626   AIC:                        -3.142e+04
Df Residuals:                    4321   BIC:                        -2.945e+04
Df Model:                         304                                         
Covariance Type:            nonrobust                                         
                                                                                                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [None]:
# Example: National-level aggregation by financial year
ts_df = panel.groupby('financial_year')[['total_sales', 'removal_rate']].mean().dropna()

# Check for stationarity using ADF test
from statsmodels.tsa.stattools import adfuller
for col in ['total_sales', 'removal_rate']:
    result = adfuller(ts_df[col])
    print(f"{col} ADF p-value: {result[1]}")

# Engle-Granger cointegration test
from statsmodels.tsa.stattools import coint

score, pvalue, _ = coint(ts_df['removal_rate'], ts_df['total_sales'])
print(f"Cointegration test statistic: {score}")
print(f"P-value: {pvalue}")


total_sales ADF p-value: 0.24241185967325674
removal_rate ADF p-value: 8.084438840754101e-05
Cointegration test statistic: -0.0
P-value: 0.9859002580259643


In [None]:
# Ensure council is treated as a categorical variable
panel['local_authority'] = panel['local_authority'].astype('category')
sale_cols = [
    'Right_to_Buy_total_number_of_dwellings',
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]

# Convert to numeric safely
for col in sale_cols:
    panel[col] = pd.to_numeric(panel[col], errors='coerce')

# Then sum safely
panel['total_sales'] = panel[sale_cols].fillna(0).sum(axis=1)

# Run OLS with council fixed effects
model = smf.ols(
    'removal_rate ~ total_sales + C(local_authority) + C(financial_year)', 
    data=panel
).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.168
Model:                            OLS   Adj. R-squared:                  0.113
Method:                 Least Squares   F-statistic:                     3.076
Date:                Sun, 06 Jul 2025   Prob (F-statistic):           4.71e-57
Time:                        15:56:03   Log-Likelihood:                 15548.
No. Observations:                4915   AIC:                        -3.049e+04
Df Residuals:                    4611   BIC:                        -2.851e+04
Df Model:                         303                                         
Covariance Type:            nonrobust                                         
                                                                                                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [None]:
model = smf.ols(
    'removal_rate ~ Right_to_Buy_total_number_of_dwellings + '
    'Social_Homebuy_number_of_dwellings + '
    'Other_sales_to_sitting_tenants_number_of_dwellings + '
    'Other_sales_number_of_dwellings + '
    'Transfers_to_PRPs + '
    'Sales_of_Shared_Ownership_number_of_dwellings + '
    'C(local_authority)',
    data=panel
).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:           removal_rate   R-squared:                       0.111
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     1.067
Date:                Sun, 06 Jul 2025   Prob (F-statistic):              0.219
Time:                        15:56:03   Log-Likelihood:                 8809.3
No. Observations:                2833   AIC:                        -1.702e+04
Df Residuals:                    2534   BIC:                        -1.524e+04
Df Model:                         298                                         
Covariance Type:            nonrobust                                         
                                                                                                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------