In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import statsmodels.api as sm

In [2]:
#Load Data
dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')
la_2025_asset_summary = pd.read_csv(r'../../data/processed/la_2025_asset_summary.csv')
population_summary = pd.read_csv(r'../../data/processed/population_summary_by_la.csv')

  dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')


In [3]:
#Charity Summary
charity_summary = dataset.groupby('local_authority').agg(
    total_charities=('registered_charity_number', 'count'),
    percent_land_holding=('charity_has_land', 'mean'),
    median_income=('latestIncome', 'median'),
    removal_rate=('charity_status', lambda x: (x == 'inactive').mean()),
).reset_index()

In [4]:
# Merge charity summary with asset summary
matched = pd.merge(
    la_2025_asset_summary,
    charity_summary,
    left_on='local_authority',
    right_on='local_authority',
    how='left'
)

In [5]:
dataset['date_of_removal'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce')
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')

# Optional: focus on a time window (e.g. removals from 2015 onward)
start_year = 2015
charity_filtered = dataset[
    (dataset['date_of_registration'] <= f'{start_year}-01-01')  # must have been eligible before 2015
]

# Remove charities with missing LA info
charity_filtered = charity_filtered[charity_filtered['local_authority'].notna()]

# Total charities per LA
total_by_la = charity_filtered.groupby('local_authority')['registered_charity_number'].count().rename('total_charities')

# Removed charities per LA (non-null date_of_removal)
removed_by_la = charity_filtered[charity_filtered['date_of_removal'].dt.year >= start_year] \
    .groupby('local_authority')['registered_charity_number'].count().rename('total_removed_charities')

removal_stats = pd.concat([total_by_la, removed_by_la], axis=1).fillna(0)
removal_stats['removal_rate'] = removal_stats['total_removed_charities'] / removal_stats['total_charities']
removal_stats = removal_stats.reset_index()


In [6]:
merged_df = pd.merge(
    matched,
    removal_stats,
    on='local_authority',
    how='left'  # or 'inner' if you want to keep only matching rows
)


In [7]:
merged_df = pd.merge(
    merged_df,
    population_summary,
    on='local_authority',
    how='left'
)

In [8]:
merged_df

Unnamed: 0,local_authority,total_assets,total_land_area,avg_land_area,land_only_assets,buildings_only_assets,land_building_assets,percent_freehold,registered_titles,total_charities_x,percent_land_holding,median_income,removal_rate_x,total_charities_y,total_removed_charities,removal_rate_y,population_count
0,Adur,329,167.1919,0.597114,150,0,0,0.924012,4,166.0,0.296053,23479.0,0.198795,120.0,29.0,0.241667,64687
1,Cumberland,212,456.5636,2.153602,177,0,0,0.995283,0,1177.0,0.38819,10013.0,0.257434,938.0,278.0,0.296375,276876
2,Amber Valley,518,6514.6044,12.600782,385,0,0,0.936293,317,393.0,0.314286,10288.5,0.211196,312.0,77.0,0.246795,127709
3,Arun,301,369.2622,1.903413,67,0,0,0.897010,0,472.0,0.335714,15445.5,0.243644,362.0,103.0,0.284530,168008
4,Ashfield,655,963.6750,1.616904,271,0,0,0.813740,3,202.0,0.316092,13360.5,0.272277,158.0,51.0,0.322785,128360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,Worcester,425,5577.7789,13.280426,237,0,0,0.891765,221,362.0,0.373457,23277.0,0.207182,266.0,68.0,0.255639,105143
333,Worthing,715,327.2456,0.510524,249,0,0,0.956643,0,292.0,0.361217,21740.0,0.256849,225.0,66.0,0.293333,112240
334,Wychavon,465,39129.7642,84.880183,214,0,0,0.840860,1,593.0,0.322222,9136.0,0.212479,477.0,118.0,0.247379,136229
335,Wyre,454,445.4685,0.981208,187,0,0,0.962555,4,303.0,0.368217,8191.0,0.267327,248.0,73.0,0.294355,116994


In [9]:
# Ensure all relevant columns are numeric (convert errors to NaN)
columns_needed = ['removal_rate_y', 'total_assets', 'total_land_area', 'percent_freehold', 'population_count']
merged_df[columns_needed] = merged_df[columns_needed].apply(pd.to_numeric, errors='coerce')

# Drop rows with any missing values in the relevant columns
merged_df = merged_df.dropna(subset=columns_needed)

# Prepare Y and X
Y = merged_df['removal_rate_y']
X = merged_df[['total_assets', 'total_land_area', 'percent_freehold', 'population_count']]

# Add constant term to X
X = sm.add_constant(X)

# Fit the OLS regression model
model = sm.OLS(Y, X).fit()

# Output model summary
model_summary = model.summary()
model_summary


0,1,2,3
Dep. Variable:,removal_rate_y,R-squared:,0.037
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,2.82
Date:,"Tue, 17 Jun 2025",Prob (F-statistic):,0.0254
Time:,18:25:45,Log-Likelihood:,598.49
No. Observations:,298,AIC:,-1187.0
Df Residuals:,293,BIC:,-1168.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2436,0.013,19.301,0.000,0.219,0.268
total_assets,4.928e-06,2.34e-06,2.110,0.036,3.32e-07,9.52e-06
total_land_area,-1.752e-08,4.87e-08,-0.359,0.720,-1.13e-07,7.84e-08
percent_freehold,0.0101,0.014,0.712,0.477,-0.018,0.038
population_count,1.814e-08,1.38e-08,1.318,0.188,-8.94e-09,4.52e-08

0,1,2,3
Omnibus:,8.373,Durbin-Watson:,2.0
Prob(Omnibus):,0.015,Jarque-Bera (JB):,8.271
Skew:,0.373,Prob(JB):,0.016
Kurtosis:,3.332,Cond. No.,2510000.0
