In [46]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import itertools
from statsmodels.tsa.stattools import grangercausalitytests

In [2]:
DISPOSAL_FILEPATH = "../../data/raw/council_disposal_receipts.xlsx"

YEAR_SKIP_MAPPING = {
    "2014": 0,
    "2015": 0,
    "2016": 2,
    "2017": 3,
    "2018": 3,
    "2019": 4,
    "2020": 4,
    "2021": 4,
    "2022": 4,
    "2023": 4,
}

dfs = []
for year, skip_row in YEAR_SKIP_MAPPING.items():
    df = pd.read_excel(DISPOSAL_FILEPATH, skiprows=skip_row, sheet_name=year)
    dfs.append(df)

dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')
postcode_msoa = pd.read_csv(r'../../data/raw/PCD_OA21_LSOA21_MSOA21_LAD_FEB25_UK_LU.csv')

  postcode_msoa = pd.read_csv(r'../../data/raw/PCD_OA21_LSOA21_MSOA21_LAD_FEB25_UK_LU.csv')


In [3]:
# Step 1: Ensure postcode columns are uppercase and stripped
dataset['postcode'] = dataset['postcode'].str.upper().str.strip()
postcode_msoa['pcds'] = postcode_msoa['pcds'].str.upper().str.strip()

# Step 2: Merge on postcode
dataset = dataset.merge(
    postcode_msoa[['pcds', 'msoa21nm']],
    left_on='postcode',
    right_on='pcds',
    how='left'
)

# Step 3: Rename to 'msoa'
dataset = dataset.rename(columns={'msoa21nm': 'msoa'})

# Step 4: Drop the duplicate postcode column if needed
dataset = dataset.drop(columns='pcds')

In [4]:
def clean_sheet(df, rename_from_col=3, header_row=1):
    """
    Cleans a DataFrame read from Excel:
    - Replaces values in `header_row` with column names starting from `rename_from_col`
    - Keeps all columns up to `rename_from_col`
    - Keeps only renamed columns after `rename_from_col`
    - Sets row `header_row` as the header and drops all rows above it
    """
    # Step 1: Identify renamed columns
    renamed_mask = [not str(col).startswith("Unnamed:") for col in df.columns]

    # Step 2: Determine which columns to keep
    cols_to_keep = list(range(rename_from_col + 1))  # Always keep up to and including `rename_from_col`
    for i in range(rename_from_col + 1, len(df.columns)):
        if renamed_mask[i]:
            cols_to_keep.append(i)

    # Step 3: Overwrite header_row with column names for renamed columns
    for i in range(rename_from_col, len(df.columns)):
        if renamed_mask[i]:
            df.iloc[header_row, i] = df.columns[i]

    # Step 4: Set row `header_row` as header and drop rows above
    df.columns = df.iloc[header_row]
    df = df.drop(index=list(range(header_row + 1))).reset_index(drop=True)

    # Step 5: Keep only selected columns
    df = df.iloc[:, cols_to_keep]

    return df

def rename_and_filter_disposal(df, start_col=5, keyword=": Disposal of tangible fixed assets"):
    """
    Keeps:
    - All columns before `start_col`
    - Columns from `start_col` onward that contain `keyword`
    
    Renames matching columns by removing the keyword from their name.
    """
    cols_to_keep = list(range(start_col))  # Keep early columns as-is

    new_columns = df.columns.tolist()  # Copy of column names

    for i in range(start_col, len(df.columns)):
        col = str(df.columns[i])
        if keyword in col:
            # Rename column by removing the keyword
            new_columns[i] = col.replace(keyword, "")
            cols_to_keep.append(i)

    # Apply renaming
    df.columns = new_columns

    # Keep only selected columns
    return df.iloc[:, cols_to_keep]

dfs[0] = clean_sheet(dfs[0], rename_from_col=3, header_row=1)
dfs[1] = clean_sheet(dfs[1], rename_from_col=4, header_row=1)
dfs[2] = clean_sheet(dfs[2], rename_from_col=4, header_row=2)
dfs[3] = clean_sheet(dfs[3], rename_from_col=5, header_row=0)
dfs[4] = clean_sheet(dfs[4], rename_from_col=5, header_row=0)

dfs[5] = rename_and_filter_disposal(dfs[5], start_col=5)
dfs[6] = rename_and_filter_disposal(dfs[6], start_col=5)
dfs[7] = rename_and_filter_disposal(dfs[7], start_col=5)
dfs[8] = rename_and_filter_disposal(dfs[8], start_col=5)
dfs[9] = rename_and_filter_disposal(dfs[9], start_col=6)

In [5]:
for i in range(10):
    df = dfs[i]

    # Step 1: Standardise column names first
    df.columns = df.columns.str.strip().str.lower()
    if 'la name' in df.columns:
        df = df.rename(columns={'la name': 'local_authority'})
    
    if 'local_authority' in df.columns:
        df['local_authority'] = df['local_authority'].str.replace(r'(?i)\(ua\)|\bua\b', '', regex=True).str.strip()

    # Step 2: Drop bad values in first column
    first_col = df.columns[0]
    df = df[~df[first_col].isin([pd.NA, None, '[z]', 'la_lgf_code'])]
    df = df.dropna(subset=[first_col])

    # Step 3: Drop missing ONS codes
    if i == 0 and 'ecode' in df.columns:
        df = df.dropna(subset=['ecode'])
    elif i != 0 and 'ons code' in df.columns:
        df = df.dropna(subset=['ons code'])

    # Step 4: Drop unwanted council classes
    if 'class' in df.columns:
        df = df[~df['class'].isin(['O', 'MC','SC'])]

    # Save cleaned frame back
    dfs[i] = df

In [6]:
merge_columns = {
    "community safety": ["community safety", "community safety (cctv)"],
    'agricultural & fisheries services':['agriculture & fisheries'],
    'all services total':['total all services'],
    'early years & primary schools':['pre-primary & primary education'],
    'parking':['parking of vehicles'],
    'ports & piers':['local authority ports & piers'],
    'special schools & alternative provision':['special education'],
    'secondary schools':['secondary education'],
    'total industrial & commercial trading':['industrial & commercial trading'],
    'total environmental & regulatory services':['total environmental services','regulatory services (environmental health)'],
    'tolled roads, bridges, tunnels,ferries & public transport companies':['tolled road bridges, tunnels, ferries, public transport companies','tolled road bridges, tunnels, ferries & public transport companies'],
    'public transport (bus)':['public passenger transport - bus'],
    'public transport (rail & other)':['public passenger transport - rail & other'],
    'total housing':['housing'],
    'total police':['police'],
    'total social care':['social services','social care'],
    'total public health':['public health'],
    'roads, street lighting & road safety':['roads, street lights & safety'],
    'total fire & rescue services':['fire & rescue services'],
    'total central services':['central services (including court services)'],
    'street cleaning (not chargeable to highways)':['street cleaning not chargeable to highways'],
    'total planning & development':['total planning & development services','planning & development services'],
    'total trading services':['total trading','trading'],
    'total education':['education'],
    'total highways & transport':['total transport','highways & transport'],
    'total culture & related services':['culture & heritage']

    #'commercial housing',
    #'energy generation & supply',
    #'finance & insurance activity',
    #'hospitality & catering',
    #'lgf code',
    #'ons code',
    #'other commercial activity',
    #'other real estate activities',
    #'post-16 provision & other education',
    #'subclass',
    #'total digital infrastructure',
    #'water supply, sewerage & remediation'
}

# Standardise and merge alias columns across all dfs
for idx in range(len(dfs)):
    df = dfs[idx].copy()
    df.columns = [
        str(col).strip().lower()
        .replace(" and ", " & ")
        for col in df.columns
    ]

    dfs[idx] = df  # Save the cleaned version

    for standard_col, aliases in merge_columns.items():
        # Find which alias columns are present
        present_cols = [col for col in aliases if col in df.columns]
        if not present_cols:
            continue  # Nothing to merge for this category

        # Sum across all present columns
        df[standard_col] = df[present_cols].sum(axis=1, skipna=True)

        # Drop duplicates, except the one we’re keeping
        cols_to_drop = [col for col in present_cols if col != standard_col]
        df.drop(columns=cols_to_drop, inplace=True)

    dfs[idx] = df  # Save the cleaned DataFrame back

In [7]:
# Combined mapping: new_name → list of old names
unified_map = {
    'buckinghamshire': ['aylesbury vale', 'chiltern', 'south bucks', 'wycombe','south buckinghamshire'],
    'dorset': ['weymouth and portland', 'west dorset', 'north dorset', 'east dorset', 'purbeck', 'christchurch'],
    'somerset': ['taunton deane', 'west somerset', 'mendip', 'sedgemoor', 'south somerset', 'somerset council','somerset west and taunton'],
    'cumberland': ['allerdale', 'carlisle', 'copeland', 'cumberland council'],
    'westmorland and furness': ['barrow in furness', 'barrow-in-furness', 'eden', 'south lakeland'],
    'north yorkshire': ['craven', 'hambleton', 'harrogate', 'richmondshire', 'ryedale', 'scarborough', 'selby', 'north yorkshire council'],
    'bournemouth christchurch and poole': ['bournemouth', 'christchurch', 'poole'],
    'west suffolk': ['forest heath', 'st edmundsbury'],
    'east suffolk': ['suffolk coastal', 'waveney'],
    'bath and north east somerset': ['bath and ne somerset'],
    'southend-on-sea': ['southend on sea'],
    'leicester': ['leicester city'],
    'medway': ['medway towns'],
    'derby': ['derby city'],
    'folkestone and hythe': ['shepway'],
    'county durham': ['durham'],
    "king's lynn and west norfolk": ['kings lynn and west norfolk'],
    'north northamptonshire': ['wellingborough', 'east northamptonshire', 'kettering', 'corby'],
    'west northamptonshire': ['northampton', 'south northamptonshire', 'daventry'],
}
# old_name → new_name (lowercase)
flat_lookup = {
    old.lower(): new.lower()
    for new, olds in unified_map.items()
    for old in olds
}
def clean_local_authority(name):
    if pd.isna(name):
        return name
    name = str(name).lower()
    name = (
        name.replace('&', 'and')
        .replace('-', ' ')
        .replace(',', '')
        .replace('.', '')
        .replace(' city of', '')
        .replace(' county of', '')
        .strip()
    )
    return flat_lookup.get(name, name)
# Apply to disposal files
for i in range(len(dfs)):
    if 'local_authority' in dfs[i].columns:
        dfs[i]['local_authority'] = dfs[i]['local_authority'].apply(clean_local_authority)

# Apply to charity dataset
if 'local_authority' in dataset.columns:
    dataset['local_authority'] = dataset['local_authority'].apply(clean_local_authority)

# List of known devolved nation council names to exclude
non_england_keywords = [
    'aberdeen', 'aberdeenshire', 'angus', 'antrim', 'ards', 'argyll', 'armagh', 'belfast', 'blaenau', 'bridgend',
    'caerphilly', 'cardiff', 'carmarthenshire', 'causeway', 'ceredigion', 'conwy', 'denbighshire', 'derry',
    'dumfries', 'dundee', 'east ayrshire', 'east dunbartonshire', 'east lothian', 'east renfrewshire', 'falkirk',
    'fermanagh', 'fife', 'flintshire', 'glasgow', 'gwynedd', 'highland', 'inverclyde', 'isle of man',
    'isle of anglesey', 'lisburn', 'merthyr', 'mid and east antrim', 'mid ulster', 'midlothian', 'monmouthshire',
    'moray', 'na h eileanan siar', 'neath', 'newport', 'newry', 'north ayrshire', 'north lanarkshire', 'orkney',
    'pembrokeshire', 'perth and kinross', 'powys', 'renfrewshire', 'rhondda', 'scottish borders', 'shetland',
    'south ayrshire', 'south lanarkshire', 'stirling', 'swansea', 'torfaen', 'vale of glamorgan', 'west dunbartonshire',
    'west lothian', 'wrexham','city of edinburgh','channel islands'
]

# Convert to lowercase and filter out rows containing these names
dataset['local_authority_lower'] = dataset['local_authority'].str.lower()
dataset = dataset[~dataset['local_authority_lower'].str.contains('|'.join(non_england_keywords), na=False)]

# Drop the helper column
dataset = dataset.drop(columns=['local_authority_lower'])

In [8]:
# Step 1: Get unique local_authority names from dfs[0–9]
all_disposal_names = set()
for i in range(10):
    if 'local_authority' in dfs[i].columns:
        all_disposal_names.update(dfs[i]['local_authority'].dropna().unique())

# Step 2: Get unique local_authority names from dataset
if 'local_authority' in dataset.columns:
    charity_names = set(dataset['local_authority'].dropna().unique())
else:
    raise KeyError("'local_authority' column not found in dataset")

# Step 3: Find mismatches
in_disposal_not_in_charity = all_disposal_names - charity_names
in_charity_not_in_disposal = charity_names - all_disposal_names

# Step 4: Print results
print("✅ In disposal (dfs[0–9]) but NOT in charity dataset:")
print(sorted(in_disposal_not_in_charity))

print("\n🔍 In charity dataset but NOT in disposal (dfs[0–9]):")
print(sorted(in_charity_not_in_disposal))


✅ In disposal (dfs[0–9]) but NOT in charity dataset:
[]

🔍 In charity dataset but NOT in disposal (dfs[0–9]):
[]


In [9]:
# Step 1: Assign financial year to each DataFrame in dfs
years = list(range(2014, 2024))
for i, year in enumerate(years):
    if not dfs[i].empty:
        dfs[i]['financial_year'] = year

# Step 2: Melt each DataFrame into long format
long_frames = []
for i, df in enumerate(dfs):
    if df.empty:
        continue
    long_df = df.melt(
        id_vars=["local_authority", "financial_year"],
        value_vars=[
            col for col in df.columns 
            if col not in ["ecode", 'lgf code', 'ons code', "class", "subclass", 
                           "local_authority", "financial_year", 'certification complete']
        ],
        var_name="category",
        value_name="value"
    )
    long_frames.append(long_df)

# Step 3: Combine all disposal long-format DataFrames
disposal_long_df = pd.concat(long_frames, ignore_index=True)

# Step 4: Count removals from charity dataset
removals = (
    dataset
    .groupby(['local_authority', 'removal_fy', 'size_category','msoa'])
    .size()
    .reset_index(name='removals')
    .rename(columns={'removal_fy': 'financial_year'})
)

# Step 5: Merge disposal data with charity removals
panel = pd.merge(
    disposal_long_df,
    removals,
    on=["financial_year", "local_authority"],
    how="outer"
)

# Step 6: Replace NaNs in 'removals' with 0
panel['removals'] = panel['removals'].fillna(0).astype(int)
panel = panel[(panel['financial_year'] >= 2015) & (panel['financial_year'] <= 2023)]
panel['value'] = pd.to_numeric(panel['value'], errors='coerce') / 1000000


In [10]:
removals

Unnamed: 0,local_authority,financial_year,size_category,msoa,removals
0,adur,2015.0,Small,Adur 005,1
1,adur,2015.0,Small,Adur 008,1
2,adur,2016.0,Medium,Adur 004,2
3,adur,2016.0,Medium,Adur 008,1
4,adur,2016.0,Small,Adur 004,1
...,...,...,...,...,...
31699,york,2024.0,Small,York 023,2
31700,york,2024.0,Small,York 024,4
31701,york,2025.0,Medium,York 013,1
31702,york,2025.0,Medium,York 017,1


In [23]:
# Step 1: Separate disposal data (no size_category or msoa)
disposal = panel[['local_authority', 'financial_year', 'category', 'value']].drop_duplicates()

# Step 2: Charity data (includes size_category and msoa)
charity = panel[['local_authority', 'financial_year', 'category', 'size_category', 'removals', 'msoa']].drop_duplicates()

# Step 3: Merge disposal (broadcast across msoa + size_category rows)
merged = pd.merge(
    charity,
    disposal,
    on=['local_authority', 'financial_year', 'category'],
    how='left'
)

# Step 4: Fill missing values
merged['value'] = pd.to_numeric(merged['value'], errors='coerce').fillna(0)
merged['removals'] = merged['removals'].fillna(0).astype(int)

# Result
panel_complete = merged
panel_complete

Unnamed: 0,local_authority,financial_year,category,size_category,removals,msoa,value
0,adur,2015.0,non-school funding,Small,1,Adur 005,0.0
1,adur,2015.0,non-school funding,Small,1,Adur 008,0.0
2,adur,2015.0,airports,Small,1,Adur 005,0.0
3,adur,2015.0,airports,Small,1,Adur 008,0.0
4,adur,2015.0,recreation & sport,Small,1,Adur 005,0.0
...,...,...,...,...,...,...,...
1784427,york,2023.0,"all services total, of which vehicles",Small,1,York 011,0.0
1784428,york,2023.0,"all services total, of which vehicles",Small,1,York 013,0.0
1784429,york,2023.0,"all services total, of which vehicles",Small,1,York 019,0.0
1784430,york,2023.0,"all services total, of which vehicles",Small,2,York 021,0.0


In [24]:
# Resulting filtered panel
filtered_panel = panel_complete[panel_complete['category']=='all services total']
# Ensure proper sorting
filtered_panel = filtered_panel.sort_values(by=['local_authority', 'size_category', 'financial_year'])

# Create lagged value columns

In [32]:
filtered_panel['removals'].max()

np.int64(29)

In [41]:
from statsmodels.tsa.stattools import adfuller

# ADF test on 'removals'
result_removals = adfuller(filtered_panel['removals'].dropna())
print("ADF Test for 'removals':")
print(f"ADF Statistic: {result_removals[0]}")
print(f"p-value: {result_removals[1]}")
print(f"Critical Values: {result_removals[4]}\n")

# ADF test on 'value'
result_value = adfuller(filtered_panel['value'].dropna())
print("ADF Test for 'value':")
print(f"ADF Statistic: {result_value[0]}")
print(f"p-value: {result_value[1]}")
print(f"Critical Values: {result_value[4]}")


ADF Test for 'removals':
ADF Statistic: -40.44493804767834
p-value: 0.0
Critical Values: {'1%': np.float64(-3.430573570146791), '5%': np.float64(-2.8616388118688993), '10%': np.float64(-2.566822594558581)}

ADF Test for 'value':
ADF Statistic: -18.76394493133423
p-value: 2.025990577181649e-30
Critical Values: {'1%': np.float64(-3.4305738456544788), '5%': np.float64(-2.861638933631144), '10%': np.float64(-2.566822659369727)}


In [42]:
from statsmodels.tsa.stattools import coint

# Run Engle-Granger cointegration test
coint_stat, p_value, crit_values = coint(filtered_panel['removals'], filtered_panel['value'])

print(f"Engle-Granger Cointegration Test")
print(f"Test Statistic: {coint_stat}")
print(f"p-value: {p_value}")
print(f"Critical Values: {crit_values}")

Engle-Granger Cointegration Test
Test Statistic: -40.41334532613555
p-value: 0.0
Critical Values: [-3.89681427 -3.33633879 -3.04459493]


In [47]:
# Format data as a two-column array: [removals, value]
data = filtered_panel[['removals', 'value']].dropna()

# Run Granger causality test with up to 3 lags
grangercausalitytests(data, maxlag=3)



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=10.1664 , p=0.0014  , df_denom=29262, df_num=1
ssr based chi2 test:   chi2=10.1674 , p=0.0014  , df=1
likelihood ratio test: chi2=10.1656 , p=0.0014  , df=1
parameter F test:         F=10.1664 , p=0.0014  , df_denom=29262, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=4.9534  , p=0.0071  , df_denom=29259, df_num=2
ssr based chi2 test:   chi2=9.9084  , p=0.0071  , df=2
likelihood ratio test: chi2=9.9068  , p=0.0071  , df=2
parameter F test:         F=4.9534  , p=0.0071  , df_denom=29259, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=5.4475  , p=0.0010  , df_denom=29256, df_num=3
ssr based chi2 test:   chi2=16.3464 , p=0.0010  , df=3
likelihood ratio test: chi2=16.3418 , p=0.0010  , df=3
parameter F test:         F=5.4475  , p=0.0010  , df_denom=29256, df_num=3


{np.int64(1): ({'ssr_ftest': (np.float64(10.166371840999677),
    np.float64(0.0014317597633544262),
    np.float64(29262.0),
    np.int64(1)),
   'ssr_chi2test': (np.float64(10.167414118202979),
    np.float64(0.0014294462069016352),
    np.int64(1)),
   'lrtest': (np.float64(10.165648316513398),
    np.float64(0.0014308159256741669),
    np.int64(1)),
   'params_ftest': (np.float64(10.16637184099323),
    np.float64(0.0014317597633565194),
    np.float64(29262.0),
    1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2cd0a7040b0>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2cd0a79cd70>,
   array([[0., 1., 0.]])]),
 np.int64(2): ({'ssr_ftest': (np.float64(4.9533745661114965),
    np.float64(0.00706546690133107),
    np.float64(29259.0),
    np.int64(2)),
   'ssr_chi2test': (np.float64(9.908442072708352),
    np.float64(0.007053572616830198),
    np.int64(2)),
   'lrtest': (np.float64(9.906765010964591),
    np.float64(0.00705948973

In [49]:
# Prepare data: ensure it's sorted and aligned
df = filtered_panel[['removals', 'value']].dropna().copy()
df = df.astype(float)

# Run Granger causality test: does 'removals' Granger-cause 'value'?
# maxlag = 3 for testing 1, 2, 3 lags
reverse_granger_results = grangercausalitytests(df[['value', 'removals']], maxlag=3, verbose=True)



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=27.7555 , p=0.0000  , df_denom=29262, df_num=1
ssr based chi2 test:   chi2=27.7584 , p=0.0000  , df=1
likelihood ratio test: chi2=27.7452 , p=0.0000  , df=1
parameter F test:         F=27.7555 , p=0.0000  , df_denom=29262, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=24.3481 , p=0.0000  , df_denom=29259, df_num=2
ssr based chi2 test:   chi2=48.7046 , p=0.0000  , df=2
likelihood ratio test: chi2=48.6641 , p=0.0000  , df=2
parameter F test:         F=24.3481 , p=0.0000  , df_denom=29259, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=16.4157 , p=0.0000  , df_denom=29256, df_num=3
ssr based chi2 test:   chi2=49.2589 , p=0.0000  , df=3
likelihood ratio test: chi2=49.2174 , p=0.0000  , df=3
parameter F test:         F=16.4157 , p=0.0000  , df_denom=29256, df_num=3




In [44]:
# Run OLS regression with fixed effects for LA and year
model = smf.ols(
    formula='removals ~ value + C(local_authority) + C(financial_year) + C(size_category) + value:C(size_category)',
    data=filtered_panel
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.193
Model:                            OLS   Adj. R-squared:                  0.184
Method:                 Least Squares   F-statistic:                     22.54
Date:                Thu, 10 Jul 2025   Prob (F-statistic):               0.00
Time:                        00:15:23   Log-Likelihood:                -33151.
No. Observations:               29254   AIC:                         6.692e+04
Df Residuals:                   28946   BIC:                         6.947e+04
Df Model:                         307                                         
Covariance Type:            nonrobust                                         
                                                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------

In [52]:
model = smf.ols(
    formula=(
        'removals ~ '
        'value + value_lag1 + value_lag2 + value_lag3 + '
        'C(local_authority) + C(financial_year) + C(size_category) + '
        'value:C(size_category) + '
        'value_lag1:C(size_category) + '
        'value_lag2:C(size_category) + '
        'value_lag3:C(size_category)'
    ),
    data=filtered_panel
).fit()

print(model.summary())


PatsyError: Error evaluating factor: NameError: name 'value_lag1' is not defined
    removals ~ value + value_lag1 + value_lag2 + value_lag3 + C(local_authority) + C(financial_year) + C(size_category) + value:C(size_category) + value_lag1:C(size_category) + value_lag2:C(size_category) + value_lag3:C(size_category)
                       ^^^^^^^^^^

In [None]:
panel_complete['category'].unique()

In [None]:
filtered_panel_1 = panel_complete[
    panel_complete['category'].isin(['non-school funding', 'airports', 'recreation & sport',
       'open spaces', 'tourism', 'library services',
       'total culture & related services',
       'cemeteries, cremation & mortuary', 'coast protection',
       'community safety', 'flood defence & land drainage',
       'regulatory services (trading standards)', 'waste collection',
       'waste disposal', 'trade waste', 'recycling', 'waste minimisation',
       'climate change costs', 'other trading',
       'agricultural & fisheries services', 'all services total',
       'early years & primary schools', 'parking', 'ports & piers',
       'special schools & alternative provision', 'secondary schools',
       'total industrial & commercial trading',
       'total environmental & regulatory services',
       'tolled roads, bridges, tunnels,ferries & public transport companies',
       'public transport (bus)', 'public transport (rail & other)',
       'total housing', 'total police', 'total social care',
       'total public health', 'roads, street lighting & road safety',
       'total fire & rescue services', 'total central services',
       'street cleaning (not chargeable to highways)',
       'total planning & development', 'total trading services',
       'total education', 'total highways & transport',
       'culture & related services',
       'environmental & regulatory services',
       'post-16 provision & other education', 'commercial housing',
       'other real estate activities', 'finance & insurance activity',
       'energy generation & supply',
       'water supply, sewerage & remediation', 'hospitality & catering',
       'other commercial activity', 'total digital infrastructure'])
]


In [51]:
filtered_panel = panel_complete[
    panel_complete['category'].str.startswith('total') &
    ~panel_complete['category'].str.contains(', of which')
]
model = smf.ols(
    formula=(
        'removals ~ value + C(local_authority) + C(financial_year) + C(category) + C(size_category) + value:C(size_category) + value:C(category)'
    ),
    data=filtered_panel
).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:               removals   R-squared:                       0.209
Model:                            OLS   Adj. R-squared:                  0.208
Method:                 Least Squares   F-statistic:                     299.0
Date:                Thu, 10 Jul 2025   Prob (F-statistic):               0.00
Time:                        18:09:43   Log-Likelihood:            -4.3188e+05
No. Observations:              378776   AIC:                         8.644e+05
Df Residuals:                  378441   BIC:                         8.681e+05
Df Model:                         334                                         
Covariance Type:            nonrobust                                         
                                                                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------