In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import statsmodels.api as sm

In [2]:
#Load Data
dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')
la_2025_asset_summary = pd.read_csv(r'../../data/processed/la_2025_asset_summary.csv')
population_summary = pd.read_csv(r'../../data/processed/population_summary_by_la.csv')

  dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')


In [3]:
#Charity Summary
charity_summary = dataset.groupby('local_authority').agg(
    total_charities=('registered_charity_number', 'count'),
    percent_land_holding=('charity_has_land', 'mean'),
    median_income=('latestIncome', 'median'),
    removal_rate=('charity_status', lambda x: (x == 'inactive').mean()),
).reset_index()

In [4]:
# Merge charity summary with asset summary
matched = pd.merge(
    la_2025_asset_summary,
    charity_summary,
    left_on='local_authority',
    right_on='local_authority',
    how='left'
)

In [5]:
dataset['date_of_removal'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce')
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')

# Optional: focus on a time window (e.g. removals from 2015 onward)
start_year = 2015
charity_filtered = dataset[
    (dataset['date_of_registration'] <= f'{start_year}-01-01')  # must have been eligible before 2015
]

# Remove charities with missing LA info
charity_filtered = charity_filtered[charity_filtered['local_authority'].notna()]

# Total charities per LA
total_by_la = charity_filtered.groupby('local_authority')['registered_charity_number'].count().rename('total_charities')

# Removed charities per LA (non-null date_of_removal)
removed_by_la = charity_filtered[charity_filtered['date_of_removal'].dt.year >= start_year] \
    .groupby('local_authority')['registered_charity_number'].count().rename('total_removed_charities')

removal_stats = pd.concat([total_by_la, removed_by_la], axis=1).fillna(0)
removal_stats['removal_rate'] = removal_stats['total_removed_charities'] / removal_stats['total_charities']
removal_stats = removal_stats.reset_index()


In [7]:
merged_df = pd.merge(
    matched,
    removal_stats,
    on='local_authority',
    how='left'  # or 'inner' if you want to keep only matching rows
)


In [8]:
merged_df = pd.merge(
    merged_df,
    population_summary,
    on='local_authority',
    how='left'
)

In [9]:
# Ensure all relevant columns are numeric (convert errors to NaN)
columns_needed = ['removal_rate_y', 'total_assets', 'total_land_area', 'percent_freehold', 'population_count']
merged_df[columns_needed] = merged_df[columns_needed].apply(pd.to_numeric, errors='coerce')

# Drop rows with any missing values in the relevant columns
merged_df = merged_df.dropna(subset=columns_needed)

# Prepare Y and X
Y = merged_df['removal_rate_y']
X = merged_df[['total_assets', 'total_land_area', 'percent_freehold', 'population_count']]

# Add constant term to X
X = sm.add_constant(X)

# Fit the OLS regression model
model = sm.OLS(Y, X).fit()

# Output model summary
model_summary = model.summary()
model_summary


0,1,2,3
Dep. Variable:,removal_rate_y,R-squared:,0.037
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,2.82
Date:,"Tue, 17 Jun 2025",Prob (F-statistic):,0.0254
Time:,15:03:15,Log-Likelihood:,598.49
No. Observations:,298,AIC:,-1187.0
Df Residuals:,293,BIC:,-1168.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2436,0.013,19.301,0.000,0.219,0.268
total_assets,4.928e-06,2.34e-06,2.110,0.036,3.32e-07,9.52e-06
total_land_area,-1.752e-08,4.87e-08,-0.359,0.720,-1.13e-07,7.84e-08
percent_freehold,0.0101,0.014,0.712,0.477,-0.018,0.038
population_count,1.814e-08,1.38e-08,1.318,0.188,-8.94e-09,4.52e-08

0,1,2,3
Omnibus:,8.373,Durbin-Watson:,2.0
Prob(Omnibus):,0.015,Jarque-Bera (JB):,8.271
Skew:,0.373,Prob(JB):,0.016
Kurtosis:,3.332,Cond. No.,2510000.0
