In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import re

In [2]:
council_housing = pd.read_csv(r'../../data/processed/council_housing_cleaned.csv')
dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')

In [3]:
# Group by local authority, financial year, and size_category
removal_counts = (
    dataset
    .groupby(['local_authority', 'removal_fy'])
    .size()
    .reset_index(name='removals')
    .rename(columns={'removal_fy': 'financial_year'})
)

# Drop 2014
removal_counts = removal_counts[removal_counts['financial_year'] >= 2015]

df = removal_counts

In [4]:
council_housing = council_housing.rename(columns={'Local authority': 'local_authority'})
council_housing = council_housing.rename(columns={'Financial_Year': 'financial_year'})

# Merge datasets
panel = pd.merge(df, council_housing, on=['local_authority', 'financial_year'], how='left')

In [5]:
panel = panel[(panel['financial_year'] >= 2015) & (panel['financial_year'] <= 2023)]

panel

Unnamed: 0,local_authority,financial_year,removals,Right_to_Buy_total_number_of_dwellings,Social_Homebuy_number_of_dwellings,Other_sales_to_sitting_tenants_number_of_dwellings,Other_sales_number_of_dwellings,Transfers_to_PRPs,Sales_of_Shared_Ownership_number_of_dwellings
0,Aberdeen City,2016.0,2,,,,,,
1,Aberdeen City,2017.0,1,,,,,,
2,Aberdeen City,2018.0,1,,,,,,
3,Aberdeen City,2021.0,1,,,,,,
5,Aberdeenshire,2016.0,1,,,,,,
...,...,...,...,...,...,...,...,...,...
3499,York,2019.0,34,58.0,0,0,0,0,24
3500,York,2020.0,14,46.0,0,.,.,.,27
3501,York,2021.0,27,74.0,0,0,0,0,19
3502,York,2022.0,24,52.0,0,0,0,0,13


In [6]:
# Treat as category
panel['local_authority'] = panel['local_authority'].astype('category')

# Sale columns
sale_cols = [
    'Right_to_Buy_total_number_of_dwellings',
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]

# Convert to numeric
for col in sale_cols:
    panel[col] = pd.to_numeric(panel[col], errors='coerce')

# Drop rows where 'Right_to_Buy_total_number_of_dwellings' is NaN
panel = panel.dropna(subset=['Right_to_Buy_total_number_of_dwellings'])

# Fill NaN with 0 in the remaining sale columns
other_sale_cols = [
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]
panel[other_sale_cols] = panel[other_sale_cols].fillna(0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  panel[other_sale_cols] = panel[other_sale_cols].fillna(0)


In [7]:
# Calculate total sales
panel['total_sales'] = panel[sale_cols].sum(axis=1)

# Run regression
model = smf.ols(
    'removals ~ total_sales + C(local_authority) + C(financial_year)',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.862
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     45.22
Date:                Sun, 06 Jul 2025   Prob (F-statistic):               0.00
Time:                        17:12:15   Log-Likelihood:                -6953.9
No. Observations:                2489   AIC:                         1.451e+04
Df Residuals:                    2186   BIC:                         1.628e+04
Df Model:                         302                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  panel['total_sales'] = panel[sale_cols].sum(axis=1)


In [16]:
panel[panel['local_authority']=="Birmingham"]

Unnamed: 0,local_authority,financial_year,removals,Right_to_Buy_total_number_of_dwellings,Social_Homebuy_number_of_dwellings,Other_sales_to_sitting_tenants_number_of_dwellings,Other_sales_number_of_dwellings,Transfers_to_PRPs,Sales_of_Shared_Ownership_number_of_dwellings,total_sales,total_sales_lag1,total_sales_lag2,total_sales_lag3,total_sales_lag4
176,Birmingham,2015.0,52,449.0,0.0,0.0,0.0,0.0,0.0,449.0,,,,
177,Birmingham,2016.0,56,632.0,0.0,0.0,0.0,0.0,0.0,632.0,449.0,,,
178,Birmingham,2017.0,57,782.0,0.0,0.0,0.0,0.0,0.0,782.0,632.0,449.0,,
179,Birmingham,2018.0,72,683.0,0.0,0.0,0.0,0.0,3.0,686.0,782.0,632.0,449.0,
180,Birmingham,2019.0,89,693.0,0.0,0.0,0.0,0.0,1.0,694.0,686.0,782.0,632.0,449.0
181,Birmingham,2020.0,38,415.0,0.0,0.0,0.0,0.0,0.0,415.0,694.0,686.0,782.0,632.0
182,Birmingham,2021.0,67,675.0,0.0,0.0,0.0,0.0,0.0,675.0,415.0,694.0,686.0,782.0
183,Birmingham,2022.0,60,614.0,0.0,0.0,0.0,0.0,0.0,614.0,675.0,415.0,694.0,686.0
184,Birmingham,2023.0,43,584.0,0.0,0.0,0.0,0.0,0.0,584.0,614.0,675.0,415.0,694.0


In [14]:
panel = panel.sort_values(['local_authority', 'financial_year'])

panel['total_sales_lag1'] = panel.groupby('local_authority')['total_sales'].shift(1)
panel['total_sales_lag2'] = panel.groupby('local_authority')['total_sales'].shift(2)
panel['total_sales_lag3'] = panel.groupby('local_authority')['total_sales'].shift(3)

# Step 4: Run regression with lag
model = smf.ols(
    'removals ~ total_sales + total_sales_lag1 + total_sales_lag2 + total_sales_lag3 + C(local_authority) + C(financial_year)',
    data=panel
).fit()

# Step 5: Print results
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.861
Model:                            OLS   Adj. R-squared:                  0.830
Method:                 Least Squares   F-statistic:                     28.07
Date:                Sun, 06 Jul 2025   Prob (F-statistic):               0.00
Time:                        17:17:51   Log-Likelihood:                -4545.1
No. Observations:                1615   AIC:                             9674.
Df Residuals:                    1323   BIC:                         1.125e+04
Df Model:                         291                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

  panel['total_sales_lag1'] = panel.groupby('local_authority')['total_sales'].shift(1)
  panel['total_sales_lag2'] = panel.groupby('local_authority')['total_sales'].shift(2)
  panel['total_sales_lag3'] = panel.groupby('local_authority')['total_sales'].shift(3)
