In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import re

In [2]:
council_housing = pd.read_csv(r'../../data/processed/council_housing_cleaned.csv')
dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')

In [3]:
# Group by local authority, financial year, and size_category
removal_counts = (
    dataset
    .groupby(['local_authority', 'size_category','removal_fy'])
    .size()
    .reset_index(name='removals')
    .rename(columns={'removal_fy': 'financial_year'})
)

# Drop 2014
removal_counts = removal_counts[removal_counts['financial_year'] >= 2015]

df = removal_counts

In [4]:
council_housing = council_housing.rename(columns={'Local authority': 'local_authority'})
council_housing = council_housing.rename(columns={'Financial_Year': 'financial_year'})

# Merge datasets
panel = pd.merge(df, council_housing, on=['local_authority', 'financial_year'], how='left')

In [5]:
panel = panel[(panel['financial_year'] >= 2015) & (panel['financial_year'] <= 2023)]

panel

Unnamed: 0,local_authority,size_category,financial_year,removals,Right_to_Buy_total_number_of_dwellings,Social_Homebuy_number_of_dwellings,Other_sales_to_sitting_tenants_number_of_dwellings,Other_sales_number_of_dwellings,Transfers_to_PRPs,Sales_of_Shared_Ownership_number_of_dwellings
0,Aberdeen City,Medium,2016.0,1,,,,,,
1,Aberdeen City,Medium,2017.0,1,,,,,,
2,Aberdeen City,Medium,2018.0,1,,,,,,
3,Aberdeen City,Small,2016.0,1,,,,,,
4,Aberdeen City,Small,2021.0,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...
6768,York,Small,2019.0,24,58.0,0,0,0,0,24
6769,York,Small,2020.0,11,46.0,0,.,.,.,27
6770,York,Small,2021.0,19,74.0,0,0,0,0,19
6771,York,Small,2022.0,17,52.0,0,0,0,0,13


In [6]:
# Treat as category
panel['local_authority'] = panel['local_authority'].astype('category')
panel['size_category'] = panel['size_category'].astype('category')

# Sale columns
sale_cols = [
    'Right_to_Buy_total_number_of_dwellings',
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]

# Convert to numeric
for col in sale_cols:
    panel[col] = pd.to_numeric(panel[col], errors='coerce')

# Drop rows where 'Right_to_Buy_total_number_of_dwellings' is NaN
panel = panel.dropna(subset=['Right_to_Buy_total_number_of_dwellings'])

# Fill NaN with 0 in the remaining sale columns
other_sale_cols = [
    'Social_Homebuy_number_of_dwellings',
    'Other_sales_to_sitting_tenants_number_of_dwellings',
    'Other_sales_number_of_dwellings',
    'Transfers_to_PRPs',
    'Sales_of_Shared_Ownership_number_of_dwellings'
]
panel[other_sale_cols] = panel[other_sale_cols].fillna(0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  panel[other_sale_cols] = panel[other_sale_cols].fillna(0)


In [12]:
# Calculate total sales
panel['total_sales'] = panel[sale_cols].sum(axis=1)

# Run regression
model = smf.ols(
    'removals ~ total_sales + C(size_category) + C(local_authority) + C(financial_year) + total_sales:C(size_category)',
    data=panel
).fit()

model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.674
Model:                            OLS   Adj. R-squared:                  0.653
Method:                 Least Squares   F-statistic:                     31.72
Date:                Sun, 06 Jul 2025   Prob (F-statistic):               0.00
Time:                        22:56:34   Log-Likelihood:                -13625.
No. Observations:                4996   AIC:                         2.786e+04
Df Residuals:                    4689   BIC:                         2.986e+04
Df Model:                         306                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [9]:
panel

Unnamed: 0,local_authority,size_category,financial_year,removals,Right_to_Buy_total_number_of_dwellings,Social_Homebuy_number_of_dwellings,Other_sales_to_sitting_tenants_number_of_dwellings,Other_sales_number_of_dwellings,Transfers_to_PRPs,Sales_of_Shared_Ownership_number_of_dwellings,total_sales,total_sales_lag1,total_sales_lag2,total_sales_lag3
15,Adur,Small,2015.0,2,9.0,0.0,0.0,0.0,0.0,0.0,9.0,,,
8,Adur,Medium,2016.0,3,7.0,0.0,0.0,0.0,0.0,0.0,7.0,,,
16,Adur,Small,2016.0,2,7.0,0.0,0.0,0.0,0.0,0.0,7.0,9.0,,
9,Adur,Medium,2017.0,2,8.0,0.0,0.0,0.0,0.0,0.0,8.0,7.0,,
17,Adur,Small,2017.0,1,8.0,0.0,0.0,0.0,0.0,0.0,8.0,7.0,9.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6770,York,Small,2021.0,19,74.0,0.0,0.0,0.0,0.0,19.0,93.0,73.0,82.0,68.0
6760,York,Medium,2022.0,7,52.0,0.0,0.0,0.0,0.0,13.0,65.0,93.0,73.0,82.0
6771,York,Small,2022.0,17,52.0,0.0,0.0,0.0,0.0,13.0,65.0,93.0,73.0,82.0
6761,York,Medium,2023.0,3,50.0,0.0,0.0,0.0,0.0,25.0,75.0,65.0,93.0,73.0


In [8]:
panel = panel.sort_values(['local_authority', 'financial_year'])

panel['total_sales_lag1'] = panel.groupby(['local_authority', 'size_category'])['total_sales'].shift(1)
panel['total_sales_lag2'] = panel.groupby(['local_authority', 'size_category'])['total_sales'].shift(2)
panel['total_sales_lag3'] = panel.groupby(['local_authority', 'size_category'])['total_sales'].shift(3)

# Step 4: Run regression with lag
model = smf.ols(
    'removals ~ total_sales + C(size_category) + total_sales_lag1 + total_sales_lag2 + total_sales_lag3 + C(local_authority) + C(financial_year)',
    data=panel
).fit()

# Step 5: Print results
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.684
Model:                            OLS   Adj. R-squared:                  0.649
Method:                 Least Squares   F-statistic:                     19.56
Date:                Sun, 06 Jul 2025   Prob (F-statistic):               0.00
Time:                        22:48:38   Log-Likelihood:                -8120.2
No. Observations:                2946   AIC:                         1.683e+04
Df Residuals:                    2652   BIC:                         1.859e+04
Df Model:                         293                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

  panel['total_sales_lag1'] = panel.groupby(['local_authority', 'size_category'])['total_sales'].shift(1)
  panel['total_sales_lag2'] = panel.groupby(['local_authority', 'size_category'])['total_sales'].shift(2)
  panel['total_sales_lag3'] = panel.groupby(['local_authority', 'size_category'])['total_sales'].shift(3)
