In [None]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from pandas.api.types import CategoricalDtype
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from statsmodels.iolib.summary2 import summary_col
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import grangercausalitytests

In [None]:
# Load the processed panel from CSV; the path is relative to the notebook's
# working directory.
panel_csv_path = r'../data/processed/final_panel_data.csv'
filtered_panel = pd.read_csv(panel_csv_path)

In [None]:
# Count rows per local authority and show the authorities whose count is not 27.
# NOTE(review): 27 is presumably the expected number of periods per authority
# in a balanced panel - confirm and consider naming the constant.
rows_per_authority = filtered_panel.groupby('local_authority').size()
test = rows_per_authority.reset_index(name="value")
test.loc[test["value"] != 27]

In [None]:
# Augmented Dickey-Fuller tests on 'removals' and 'value' (NaNs dropped per column).
# NOTE(review): each column is passed as one series; if the frame stacks several
# local authorities, the test runs across authority boundaries - confirm intended.

def _print_adf(column, outcome, suffix=""):
    """Print the statistic, p-value and critical values of an adfuller result.

    `outcome` is the tuple returned by `adfuller`; positions 0, 1 and 4 are
    printed. `suffix` is appended to the final line.
    """
    stat, pvalue, critical = outcome[0], outcome[1], outcome[4]
    print(f"ADF Test for '{column}':")
    print(f"ADF Statistic: {stat}")
    print(f"p-value: {pvalue}")
    print(f"Critical Values: {critical}{suffix}")


result_removals = adfuller(filtered_panel['removals'].dropna())
# The extra newline leaves a blank line between the two reports.
_print_adf('removals', result_removals, suffix="\n")

result_value = adfuller(filtered_panel['value'].dropna())
_print_adf('value', result_value)

In [None]:
# Engle-Granger cointegration test between 'removals' and 'value'.
#
# Rows missing either series are dropped *jointly* before testing, so the two
# inputs stay aligned row-for-row and no NaN reaches the test's regressions
# (the ADF and Granger cells in this notebook also drop NaNs first).
# NOTE(review): the two columns are tested as single series; if the frame
# stacks several local authorities this pools across them - confirm intended.
coint_input = filtered_panel[['removals', 'value']].dropna()
coint_stat, p_value, crit_values = coint(
    coint_input['removals'], coint_input['value']
)

print("Engle-Granger Cointegration Test")
print(f"Test Statistic: {coint_stat}")
print(f"p-value: {p_value}")
print(f"Critical Values: {crit_values}")

In [None]:
# Two-column input ordered [removals, value], keeping only complete rows.
# statsmodels tests whether the second column Granger-causes the first.
granger_columns = ['removals', 'value']
data = filtered_panel.loc[:, granger_columns].dropna()

# Lags 1 through 3; the returned results dict is left as the cell's output.
grangercausalitytests(data, maxlag=3)

In [None]:
# Reverse-direction Granger test: does 'removals' Granger-cause 'value'?
#
# Keep only rows where both series are present and cast to float.
# NOTE(review): no sorting is applied here, so the test relies on the row order
# of `filtered_panel` - confirm the rows are in the intended order.
df = filtered_panel[['removals', 'value']].dropna().astype(float)

# Column order [value, removals] makes 'removals' the candidate cause.
# Lags 1, 2 and 3 are tested. The deprecated `verbose` argument is omitted:
# results are still printed by default, without the FutureWarning.
reverse_granger_results = grangercausalitytests(df[['value', 'removals']], maxlag=3)


In [None]:
# Baseline OLS: 'removals' on 'value', with categorical terms for local
# authority, financial year and size category, plus a value x size-category
# interaction.
baseline_formula = (
    'removals ~ value + C(local_authority) + C(financial_year) + C(size_category)+value:C(size_category)'
)
baseline_spec = smf.ols(formula=baseline_formula, data=filtered_panel)
model = baseline_spec.fit()

print(model.summary())

In [None]:
# Emit the summary of the currently bound `model` as LaTeX source.
baseline_summary_latex = model.summary().as_latex()
print(baseline_summary_latex)

In [None]:
# Lagged OLS: 'value' and its three lag columns, the three categorical terms,
# and an interaction of each receipts term with size category.
# Note: this rebinds `model`, replacing the previously fitted result.
receipt_terms = ['value', 'value_lag1', 'value_lag2', 'value_lag3']
categorical_terms = ['C(local_authority)', 'C(financial_year)', 'C(size_category)']
interaction_terms = [f'{term}:C(size_category)' for term in receipt_terms]

lagged_formula = 'removals ~ ' + ' + '.join(
    receipt_terms + categorical_terms + interaction_terms
)

lagged_spec = smf.ols(formula=lagged_formula, data=filtered_panel)
model = lagged_spec.fit()

print(model.summary())


In [None]:
# Emit the summary of the currently bound `model` as LaTeX source.
lagged_summary_latex = model.summary().as_latex()
print(lagged_summary_latex)

In [None]:
# Hold-out evaluation of the lagged specification: split each financial year
# 70/30 at random, fit on the training part, score on the test part.
# (pd, smf, train_test_split, r2_score and mean_squared_error come from the
# notebook's import cell.)
# NOTE(review): rows are split at random within a year, so the same authority
# can appear on both sides of the split; the lag columns presumably carry
# earlier years' values across it - confirm this suits the validation goal.

TEST_FRACTION = 0.3   # share of each financial year held out for testing
SPLIT_SEED = 42       # fixed seed so the split is reproducible
CATEGORICAL_COLS = ['financial_year', 'local_authority', 'size_category']

# ----------------------------
# Step 1: Ensure consistent categorical dtype
# ----------------------------
# This converts the columns on `filtered_panel` itself, so later cells see the
# categorical dtype. Missing values are excluded when building the category
# list: NaN cannot be a category and may not sort against the other values.
for col in CATEGORICAL_COLS:
    filtered_panel[col] = pd.Categorical(
        filtered_panel[col],
        categories=sorted(filtered_panel[col].dropna().unique())
    )

# ----------------------------
# Step 2: 70/30 random split within each financial year
# ----------------------------
train_list = []
test_list = []

for year in filtered_panel['financial_year'].cat.categories:
    year_data = filtered_panel[filtered_panel['financial_year'] == year]
    if len(year_data) > 1:
        train_split, test_split = train_test_split(
            year_data, test_size=TEST_FRACTION, random_state=SPLIT_SEED
        )
        train_list.append(train_split)
        test_list.append(test_split)
    else:
        # A year with at most one row cannot be split; keep it for training.
        train_list.append(year_data)

if not test_list:
    # Without this, pd.concat fails with an unhelpful "No objects to concatenate".
    raise ValueError(
        "No financial year has more than one row, so no test set can be built."
    )

train_data = pd.concat(train_list)
test_data = pd.concat(test_list)

# ----------------------------
# Step 3: Clean test set - drop rows with missing predictors or target
# ----------------------------
required_vars = [
    'value', 'value_lag1', 'value_lag2', 'value_lag3',
    'local_authority', 'financial_year', 'size_category'
]

# 'removals' is included so a missing target cannot reach the metrics below,
# which reject NaN input.
test_data_clean = test_data.dropna(subset=required_vars + ['removals']).copy()

# Match categories between train and test
for col in CATEGORICAL_COLS:
    test_data_clean[col] = pd.Categorical(
        test_data_clean[col],
        categories=train_data[col].cat.categories
    )

# ----------------------------
# Step 4: Fit the model
# ----------------------------
# Training rows with missing values are not dropped explicitly here; the
# statsmodels formula interface drops incomplete rows by default.
train_model = smf.ols(
    formula=(
        'removals ~ '
        'value + value_lag1 + value_lag2 + value_lag3 + '
        'C(local_authority) + C(financial_year) + C(size_category) + '
        'value:C(size_category) + '
        'value_lag1:C(size_category) + '
        'value_lag2:C(size_category) + '
        'value_lag3:C(size_category)'
    ),
    data=train_data
).fit()

# ----------------------------
# Step 5: Predict and evaluate
# ----------------------------
y_pred = train_model.predict(test_data_clean)
y_true = test_data_clean['removals']

print("Test R-squared:", r2_score(y_true, y_pred))
print("Test RMSE:", mean_squared_error(y_true, y_pred) ** 0.5)
print("Training R-squared:", train_model.rsquared)



In [None]:
# Rows with the largest 'removals' values, highest first.
# NOTE(review): the name says "top10" but 30 rows are kept - confirm which is intended.
display_columns = ['local_authority', 'financial_year', 'removals', 'value', 'size_category']
ranked_by_removals = filtered_panel.sort_values('removals', ascending=False)
top10_removals = ranked_by_removals.head(30)
print(top10_removals[display_columns])

In [None]:
# Plotting copy of the panel with 'value' rescaled by 1000.
# NOTE(review): presumably 'value' is in GBP millions, so x1000 gives the
# GBP-thousands unit shown on the axis label - confirm.
filtered_panel_1 = filtered_panel.assign(
    value_thousands=filtered_panel['value'] * 1000
)

# One LOWESS-smoothed trend per size category over a translucent scatter.
lowess_plot_options = dict(
    x='value_thousands',
    y='removals',
    hue='size_category',  # group by charity size
    lowess=True,
    scatter_kws={'alpha':0.3},
    line_kws={'linewidth':2},
    height=5,
    aspect=1.2,
)
sns.lmplot(data=filtered_panel_1, **lowess_plot_options)

# Label the axes that lmplot just created.
ax = plt.gca()
ax.set_xlabel('Capital Receipts (£1,000)')
ax.set_ylabel('Charity Removals')
ax.set_title('Removals vs Capital Receipts by Charity Size')
plt.show()

In [None]:
# Same LOWESS plot as before, with the x-axis limited to 0-100000.
# Uses `filtered_panel_1`, which must already exist in the kernel.
zoomed_plot_options = dict(
    x='value_thousands',
    y='removals',
    hue='size_category',
    lowess=True,
    scatter_kws={'alpha':0.3},
    line_kws={'linewidth':2},
    height=5,
    aspect=1.2,
)
sns.lmplot(data=filtered_panel_1, **zoomed_plot_options)

ax = plt.gca()
ax.set_xlabel('Capital Receipts (£1,000)')
ax.set_ylabel('Charity Removals')
ax.set_title('Removals vs Capital Receipts by Charity Size')
ax.set_xlim(0, 100000)  # 100000 x GBP 1,000 = GBP 100 million
plt.show()


In [None]:
# LOWESS plot with an explicit hue order and fuller axis labels; x-axis limited
# to 0-100000. Uses `filtered_panel_1`, which must already exist in the kernel.
size_levels = ['Small', 'Medium', 'Large']  # fixes the order in which hue levels are handled

ordered_plot_options = dict(
    x='value_thousands',
    y='removals',
    hue='size_category',
    hue_order=size_levels,
    lowess=True,
    scatter_kws={'alpha': 0.3},
    line_kws={'linewidth': 2},
    height=5,
    aspect=1.2,
)
sns.lmplot(data=filtered_panel_1, **ordered_plot_options)

ax = plt.gca()
ax.set_xlabel('Capital Receipts per Council (£1,000)')
ax.set_ylabel('Number of Charity Removals')
ax.set_title('Removals vs Capital Receipts by Charity Size')
ax.set_xlim(0, 100000)
plt.show()