In [1]:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [2]:
# Load data: three processed tables plus the raw charity classification lookup.
# All paths are relative to the notebook's working directory.
# NOTE(review): the stored cell output echoes the `dataset` read line, which looks like
# a pandas warning raised by that call — confirm, and pin column dtypes if so.
dataset = pd.read_csv('../../data/processed/charity_main_cleaned.csv')
la_2025_asset_summary = pd.read_csv('../../data/processed/la_2025_asset_summary.csv')
population_summary = pd.read_csv('../../data/processed/population_summary_by_la.csv')
df_category = pd.read_csv('../../data/raw/charityClassification_RegisteredCharitiesInEnglandAndWales2025.csv')

  dataset = pd.read_csv(r'../../data/processed/charity_main_cleaned.csv')


In [5]:
# Define function to get financial year
def get_financial_year(date):
    """Return the financial year (April to March) that `date` falls in.

    The financial year is labelled by the calendar year in which it starts:
    dates in April-December map to their own calendar year, dates in
    January-March map to the previous calendar year.

    Parameters
    ----------
    date : object exposing `.year` and `.month`, or a missing value
        Anything `pd.isna` reports as missing (e.g. `NaT`, `None`) is treated as absent.

    Returns
    -------
    int or float
        The starting year of the financial year, or `np.nan` for a missing date.
    """
    if pd.isna(date):
        return np.nan
    falls_before_april = date.month < 4
    if falls_before_april:
        return date.year - 1
    return date.year

# --- ensure dates are in datetime ---
# errors='coerce' turns anything unparseable into NaT instead of raising.
for date_col in ('date_of_removal', 'date_of_registration'):
    dataset[date_col] = pd.to_datetime(dataset[date_col], errors='coerce')

In [None]:
# Missing-income charities (2015-2024 window)
# A charity is inside the window when its registration OR its removal falls in
# calendar years 2015-2024 (both bounds inclusive).
registration_in_window = dataset['date_of_registration'].dt.year.between(2015, 2024)
removal_in_window = dataset['date_of_removal'].dt.year.between(2015, 2024)
year_mask = registration_in_window | removal_in_window

# --- rows where income is missing AND the year mask is true ---
# Copied straight away so the column added below is written to an independent frame.
missing_income_rows = dataset[dataset['latest_income'].isna() & year_mask].copy()

# how many
count_missing = missing_income_rows.shape[0]
print(f"Missing-income charities (2015-2024 window): {count_missing:,}")

# Financial year per charity: the registration FY, with the removal FY used only
# where the registration FY is missing.
# NOTE(review): a charity registered before FY 2015 but removed inside the window keeps
# its (earlier) registration FY and is therefore excluded by the 2015-2024 filter below,
# so the per-year counts need not add up to `count_missing` — confirm this is intended.
fy_from_registration = missing_income_rows['date_of_registration'].apply(get_financial_year)
fy_from_removal = missing_income_rows['date_of_removal'].apply(get_financial_year)
missing_income_rows['financial_year'] = fy_from_registration.fillna(fy_from_removal)

# Drop NaNs and count per year
known_fy = missing_income_rows['financial_year'].dropna().astype(int)
fy_counts = known_fy.value_counts().sort_index()

# Keep only financial years from 2015 to 2024
fy_in_window = (fy_counts.index >= 2015) & (fy_counts.index <= 2024)
fy_counts = fy_counts[fy_in_window]

print("Count of missing-income charities per financial year (2015–2024):")
print(fy_counts)

Missing-income charities (2015-2024 window): 9,637
Count of missing-income charities per financial year (2015–2024):
financial_year
2015     162
2016     197
2017     184
2018     157
2019     186
2020     200
2021     195
2022     277
2023    2871
2024    3752
Name: count, dtype: int64


In [None]:
# Registrations vs removals per financial year (April–March), FY 2015–2024.

# Ensure datetime (unparseable values become NaT)
dataset['date_of_removal'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce')
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')

# Financial year of each event; NaN where the date is missing
dataset['removal_fy'] = dataset['date_of_removal'].apply(get_financial_year)
dataset['registration_fy'] = dataset['date_of_registration'].apply(get_financial_year)

# Count per financial year (value_counts ignores the NaN rows)
removed_by_fy = dataset['removal_fy'].value_counts().sort_index()
registered_by_fy = dataset['registration_fy'].value_counts().sort_index()

# Filter to 2015–2024 financial years
removed_by_fy = removed_by_fy[(removed_by_fy.index >= 2015) & (removed_by_fy.index <= 2024)]
registered_by_fy = registered_by_fy[(registered_by_fy.index >= 2015) & (registered_by_fy.index <= 2024)]

# Print summary (the removal total covers the filtered FY 2015–2024 window only)
print(f'Total number of charity data: {dataset["charity_status"].count():,}')
print(f"Total number of charities removed: {removed_by_fy.sum():,}")
print("Number of charities removed per FY (April–March):")
for year, count in removed_by_fy.items():
    print(f"FY {year}: {count:,}")

# Plot both lines
plt.figure(figsize=(10, 5))
plt.plot(removed_by_fy.index, removed_by_fy.values, marker='o', linestyle='-', color='#1f77b4', label='Removed')
plt.plot(registered_by_fy.index, registered_by_fy.values, marker='s', linestyle='--', color='#ff7f0e', label='Registered')

plt.title('Charities Registered vs Removed Per Financial Year (2015–2024)')
plt.xlabel('Financial Year')
plt.ylabel('Number of Charities')
plt.grid(True)
plt.legend()
# One tick per financial year, labelled as a whole number
plt.xticks(removed_by_fy.index.astype(int), [str(int(y)) for y in removed_by_fy.index], rotation=45)
plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
# tight_layout goes last so the rotated tick labels are accounted for when margins are computed
plt.tight_layout()
plt.show()


In [None]:
# Registration and removal rates per financial year, relative to the number of
# charities on the register when the year starts.

# Ensure datetime
for date_col in ('date_of_registration', 'date_of_removal'):
    dataset[date_col] = pd.to_datetime(dataset[date_col], errors='coerce')

dataset['registration_fy'] = dataset['date_of_registration'].apply(get_financial_year)
dataset['removal_fy'] = dataset['date_of_removal'].apply(get_financial_year)

# Financial years 2015..2024 inclusive
financial_years = list(range(2015, 2025))

# One summary record per financial year
data_summary = []

for fy in financial_years:
    previous_fy = fy - 1

    # Denominator: registered in an earlier FY and either never removed or
    # removed in a FY later than the previous one
    registered_earlier = dataset['registration_fy'] <= previous_fy
    not_yet_removed = dataset['removal_fy'].isna() | (dataset['removal_fy'] > previous_fy)
    active_start_fy = len(dataset[registered_earlier & not_yet_removed])

    # Events that happened during this FY
    registered_in_fy = len(dataset[dataset['registration_fy'] == fy])
    removed_in_fy = len(dataset[dataset['removal_fy'] == fy])

    # Rates are left as None when there is nothing to divide by
    if active_start_fy > 0:
        registration_rate = registered_in_fy / active_start_fy
        removal_rate = removed_in_fy / active_start_fy
    else:
        registration_rate = None
        removal_rate = None

    data_summary.append({
        'Financial Year': fy,
        'Active at Start of FY': active_start_fy,
        'Registered': registered_in_fy,
        'Removed': removed_in_fy,
        'Registration Rate': registration_rate,
        'Removal Rate': removal_rate,
    })

# Convert to DataFrame
fy_summary_df = pd.DataFrame(data_summary)



In [None]:
# Line chart of the two rates held in fy_summary_df, one point per financial year.
plt.figure(figsize=(12, 6))

# (column to plot, line style, colour); the column name doubles as the legend label
rate_lines = [
    ('Registration Rate', '--', 'green'),
    ('Removal Rate', '-', 'red'),
]
for rate_col, line_style, line_colour in rate_lines:
    plt.plot(
        fy_summary_df['Financial Year'],
        fy_summary_df[rate_col],
        label=rate_col,
        marker='o',
        linestyle=line_style,
        color=line_colour,
    )

# Formatting
plt.title('Charity Registration vs Removal Rates Per Financial Year (2015–2024)')
plt.xlabel('Financial Year')
plt.ylabel('Rate (% of existing charities at start of FY)')
plt.grid(True)
plt.legend()
plt.xticks(fy_summary_df['Financial Year'])
# Rates are stored as fractions, so 1.0 corresponds to 100%
plt.gca().yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1.0))

plt.tight_layout()
plt.show()


In [None]:
# Monthly registrations and removals, 2015-01 to 2024-12: exponential-smoothing (ETS)
# fit plus month-of-year box plots.

# Ensure datetime format
dataset['date_of_removal'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce')
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')

# Extract year-month as Period
dataset['removal_month'] = dataset['date_of_removal'].dt.to_period('M')
dataset['registration_month'] = dataset['date_of_registration'].dt.to_period('M')

# Every calendar month of the analysis window. value_counts() only returns months that
# have at least one event, so the counts are reindexed onto the full range: months
# without events become explicit zeros and the series stays evenly spaced, which the
# 12-period seasonal model relies on.
# NOTE(review): assumes the source data covers the whole window up to 2024-12; if it
# ends earlier, the trailing zeros are artefacts — confirm.
analysis_months = pd.period_range('2015-01', '2024-12', freq='M')

# Count per month, restricted to (and completed over) the analysis window
removed_by_month = dataset['removal_month'].value_counts().reindex(analysis_months, fill_value=0)
registered_by_month = dataset['registration_month'].value_counts().reindex(analysis_months, fill_value=0)

# Convert PeriodIndex to Timestamp for modeling and plotting
removed_by_month.index = removed_by_month.index.to_timestamp()
registered_by_month.index = registered_by_month.index.to_timestamp()

# Apply ETS model (additive trend and seasonality)
ets_model_removed = ExponentialSmoothing(removed_by_month, trend='add', seasonal='add', seasonal_periods=12).fit()
ets_model_registered = ExponentialSmoothing(registered_by_month, trend='add', seasonal='add', seasonal_periods=12).fit()

# Prepare for boxplots: one row per month with the month name as grouping key
removed_df = removed_by_month.reset_index()
removed_df.columns = ['Date', 'Removed']
removed_df['Month'] = removed_df['Date'].dt.month_name()

registered_df = registered_by_month.reset_index()
registered_df.columns = ['Date', 'Registered']
registered_df['Month'] = registered_df['Date'].dt.month_name()

# Ensure month order for boxplot
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# Plot
fig, axes = plt.subplots(3, 1, figsize=(12, 14), sharex=False)

# ETS Model Trends
axes[0].plot(removed_by_month.index, removed_by_month.values, linestyle='--', label='Original (Removed)')
axes[0].plot(removed_by_month.index, ets_model_removed.fittedvalues, color='blue', label='ETS Fitted (Removed)')
axes[0].plot(registered_by_month.index, registered_by_month.values, linestyle='--', label='Original (Registered)')
axes[0].plot(registered_by_month.index, ets_model_registered.fittedvalues, color='orange', label='ETS Fitted (Registered)')
axes[0].set_title('ETS Model Fit for Monthly Removals and Registrations')
axes[0].legend()
axes[0].grid(True)

# Boxplot for Removals
sns.boxplot(data=removed_df, x='Month', y='Removed', order=month_order, ax=axes[1])
axes[1].set_title('Monthly Distribution of Charity Removals')
axes[1].tick_params(axis='x', rotation=45)

# Boxplot for Registrations
sns.boxplot(data=registered_df, x='Month', y='Registered', order=month_order, ax=axes[2])
axes[2].set_title('Monthly Distribution of Charity Registrations')
axes[2].tick_params(axis='x', rotation=45)

# Thousands separators on every y-axis; each axis gets its own formatter instance
for ax in axes:
    ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'{int(x):,}'))

plt.tight_layout()
plt.show()


In [None]:
# Registrations and removals per calendar year (2015–2024), split by size category.

# Ensure datetime
dataset['date_of_removal'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce')
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')

# Calendar year of each event (NaN where the date is missing)
dataset['removal_year'] = dataset['date_of_removal'].dt.year
dataset['registration_year'] = dataset['date_of_registration'].dt.year

# Group by year and size: rows are years, columns are size categories
removed = (
    dataset[dataset['removal_year'].between(2015, 2024)]
    .groupby(['removal_year', 'size_category'])
    .size()
    .unstack(fill_value=0)
)

registered = (
    dataset[dataset['registration_year'].between(2015, 2024)]
    .groupby(['registration_year', 'size_category'])
    .size()
    .unstack(fill_value=0)
)

# Plot
plt.figure(figsize=(12, 6))

# Registered lines (dashed). A size category with no rows in the window has no
# column in the table, so it is skipped instead of raising a KeyError.
for size in ['Small', 'Medium', 'Large']:
    if size in registered.columns:
        plt.plot(registered.index, registered[size], linestyle='--', marker='o', label=f'Registered ({size})')

# Removed lines (solid), with the same guard
for size in ['Small', 'Medium', 'Large']:
    if size in removed.columns:
        plt.plot(removed.index, removed[size], linestyle='-', marker='s', label=f'Removed ({size})')

plt.title('Charity Registrations and Removals by Size (2015–2024)')
plt.xlabel('Year')
plt.ylabel('Number of Charities')
plt.grid(True)
plt.legend()
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
# tight_layout goes last so the rotated tick labels are accounted for when margins are computed
plt.tight_layout()
plt.show()

In [None]:
# Registrations and removals per financial year (FY 2015–2022), split by size category.

# Apply financial year classification
dataset['removal_fy'] = dataset['date_of_removal'].apply(get_financial_year)
dataset['registration_fy'] = dataset['date_of_registration'].apply(get_financial_year)

# Filter by financial year, then count per (FY, size); rows are FYs, columns are sizes
removed_fy = (
    dataset[dataset['removal_fy'].between(2015, 2022)]
    .groupby(['removal_fy', 'size_category'])
    .size()
    .unstack(fill_value=0)
)

registered_fy = (
    dataset[dataset['registration_fy'].between(2015, 2022)]
    .groupby(['registration_fy', 'size_category'])
    .size()
    .unstack(fill_value=0)
)

# Plotting
plt.figure(figsize=(12, 6))

# Registered lines (dashed); sizes missing from the table are skipped
for size in ['Small', 'Medium', 'Large']:
    if size in registered_fy.columns:
        plt.plot(registered_fy.index, registered_fy[size], linestyle='--', marker='o', label=f'Registered ({size})')

# Removed lines (solid); sizes missing from the table are skipped
for size in ['Small', 'Medium', 'Large']:
    if size in removed_fy.columns:
        plt.plot(removed_fy.index, removed_fy[size], linestyle='-', marker='s', label=f'Removed ({size})')

plt.title('Charity Registrations and Removals by Size per Financial Year (2015–2022)')
plt.xlabel('Financial Year')
plt.ylabel('Number of Charities')
plt.grid(True)
plt.legend()
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
# tight_layout goes last so the rotated tick labels are accounted for when margins are computed
plt.tight_layout()
plt.show()


In [None]:
# Removals per financial year (FY 2015–2024), split by the charity's age when removed.

# --- Ensure datetime format ---
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')
dataset['date_of_removal'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce')

# --- Compute financial years ---
dataset['registration_fy'] = dataset['date_of_registration'].apply(get_financial_year)
dataset['removal_fy'] = dataset['date_of_removal'].apply(get_financial_year)

# --- Calculate charity age at removal ---
# Whole years, approximated as 365-day blocks; NaN when either date is missing.
dataset['age_at_removal'] = (
    dataset['date_of_removal'] - dataset['date_of_registration']
).dt.days // 365

# --- Define age bins and labels ---
# right=False makes every bin [lower, upper), i.e. whole-year ages 0-9, 10-19, ...,
# and 50 or more; the labels spell out exactly those ranges.
bins = [0, 10, 20, 30, 40, 50, float('inf')]
labels = ['0–9 yrs', '10–19 yrs', '20–29 yrs', '30–39 yrs', '40–49 yrs', '50+ yrs']
dataset['age_group'] = pd.cut(dataset['age_at_removal'], bins=bins, labels=labels, right=False)

# --- Filter data to financial years of interest ---
filtered = dataset[dataset['removal_fy'].between(2015, 2024)]

# --- Group and count removals by FY and age group ---
# observed=False keeps every age group as a column (zero-filled) even when it has no
# rows, and states the categorical-grouping behaviour explicitly.
removals_by_age_fy = (
    filtered.groupby(['removal_fy', 'age_group'], observed=False)
    .size()
    .unstack(fill_value=0)
    .sort_index()
)

# --- Plot: one line per age group ---
plt.figure(figsize=(14, 6))
for col in removals_by_age_fy.columns:
    plt.plot(removals_by_age_fy.index, removals_by_age_fy[col], marker='o', label=col)

plt.title('Removed Charities in the UK by Age at Removal (Financial Year)')
plt.xlabel('Financial Year')
plt.ylabel('Number of Removals')
plt.legend(title='Charity Age at Removal')
plt.grid(True)
plt.xticks(removals_by_age_fy.index)
# tight_layout goes last so the final tick labels are accounted for
plt.tight_layout()
plt.show()


In [None]:
# Removals of small charities per financial year (FY 2015–2024), split by age when removed.

# --- Ensure datetime format ---
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')
dataset['date_of_removal'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce')

# --- Compute financial years ---
dataset['registration_fy'] = dataset['date_of_registration'].apply(get_financial_year)
dataset['removal_fy'] = dataset['date_of_removal'].apply(get_financial_year)

# --- Calculate charity age at removal ---
# Whole years, approximated as 365-day blocks; NaN when either date is missing.
dataset['age_at_removal'] = (
    dataset['date_of_removal'] - dataset['date_of_registration']
).dt.days // 365

# --- Define age bins and labels ---
# right=False makes every bin [lower, upper), i.e. whole-year ages 0-9, 10-19, ...,
# and 50 or more; the labels spell out exactly those ranges.
bins = [0, 10, 20, 30, 40, 50, float('inf')]
labels = ['0–9 yrs', '10–19 yrs', '20–29 yrs', '30–39 yrs', '40–49 yrs', '50+ yrs']
dataset['age_group'] = pd.cut(dataset['age_at_removal'], bins=bins, labels=labels, right=False)

# --- Filter to small charities and FY 2015–2024 ---
filtered = dataset[
    (dataset['size_category'] == 'Small') &
    (dataset['removal_fy'].between(2015, 2024))
]

# --- Group and count removals by FY and age group ---
# observed=False keeps every age group as a column (zero-filled) even when it has no
# rows, and states the categorical-grouping behaviour explicitly.
removals_by_age_fy = (
    filtered.groupby(['removal_fy', 'age_group'], observed=False)
    .size()
    .unstack(fill_value=0)
    .sort_index()
)

# --- Plot: one line per age group ---
plt.figure(figsize=(14, 6))
for col in removals_by_age_fy.columns:
    plt.plot(removals_by_age_fy.index, removals_by_age_fy[col], marker='o', label=col)

plt.title('Removed Small Charities in the UK by Age at Removal (FY 2015–2024)')
plt.xlabel('Financial Year')
plt.ylabel('Number of Removals')
plt.legend(title='Charity Age at Removal')
plt.grid(True)
plt.xticks(removals_by_age_fy.index)
# tight_layout goes last so the final tick labels are accounted for
plt.tight_layout()
plt.show()


In [None]:
# Removals per local authority and calendar year (2015–2024), with a row total.
dataset['date_of_removal'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce')
dataset['removal_year'] = dataset['date_of_removal'].dt.year

# Only rows that have both a removal year and a local authority can be counted
has_year_and_la = dataset['removal_year'].notna() & dataset['local_authority'].notna()
removed = dataset[has_year_and_la]

# Rows: local authority; columns: removal year in ascending order; cells: counts
removal_counts = removed.groupby(['local_authority', 'removal_year']).size()
removed_by_year_la = removal_counts.unstack(fill_value=0).sort_index(axis=1)

# Keep only years 2015–2024 (those actually present as columns)
years = list(range(2015, 2025))
years_present = removed_by_year_la.columns.intersection(years)
removed_by_year_la = removed_by_year_la[years_present]

# Row total over the kept years, then largest total first
removed_by_year_la['Total'] = removed_by_year_la.sum(axis=1)
removed_by_year_la = removed_by_year_la.sort_values(by='Total', ascending=False)

print("Removed Charities per Year per Local Authority (2015–2024):")
print(removed_by_year_la.head(10))


In [None]:
# Year-on-year percentage change in removals for every local authority, computed
# across the 2015–2024 columns and rounded to two decimals.
years = list(range(2015, 2025))
removed_pct_change = removed_by_year_la[years].pct_change(axis=1).mul(100).round(2)
removed_pct_change

In [None]:
# removed_by_year_la is already sorted by 'Total', so its first 15 rows are the
# authorities with the most removals; only the year columns are kept.
# (The variable keeps its "top_50" name although it holds 15 rows.)
top_50_removed = removed_by_year_la.iloc[:15].drop(columns='Total')

# Year-on-year percentage change per authority, rounded to two decimals
removed_pct_change = top_50_removed.pct_change(axis=1).mul(100).round(2)

# Long form of the same table: one row per (local authority, year)
removed_pct_change_long = removed_pct_change.reset_index().melt(
    id_vars='local_authority',
    var_name='year',
    value_name='pct_change',
)
removed_pct_change

In [None]:
def highlight_top2_bottom2(row):
    """Build per-cell CSS for one row: two largest values red, two smallest blue.

    Intended for `Styler.apply(..., axis=1)`.

    Parameters
    ----------
    row : pd.Series
        The values of one table row; NaN entries are ignored when ranking.

    Returns
    -------
    list of str
        One CSS string per element of `row`, in order. Values equal to one of the
        two largest get bold red, values equal to one of the two smallest get bold
        blue (red wins when a value qualifies for both), everything else gets ''.
    """
    # Ignore NaNs when ranking
    ranked = row.dropna().sort_values(ascending=False)
    # .iloc slices by position. Plain `ranked[:2]` is label-based on a float-dtype
    # index in pandas < 3, which mis-slices rows keyed by float year labels.
    largest_two = ranked.iloc[:2].to_numpy()
    smallest_two = ranked.iloc[-2:].to_numpy()

    styles = []
    for val in row:
        if val in largest_two:
            styles.append('color: red; font-weight: bold')
        elif val in smallest_two:
            styles.append('color: blue; font-weight: bold')
        else:
            styles.append('')
    return styles
# Cast the year column labels to int so 2015 can be addressed as an integer label
removed_pct_change.columns = removed_pct_change.columns.astype(int)

# 2015 is the first year of the table, so it has no preceding year to compare with
removed_pct_change_clean = removed_pct_change.drop(columns=2015)

# Row-wise styling: each local authority's own extremes are highlighted
styled_wide = removed_pct_change_clean.style.apply(highlight_top2_bottom2, axis=1)


In [None]:
# Render the styled percentage-change table as the cell output.
styled_wide

In [None]:
# How far do the 15 authorities with the most removals deviate from the overall
# year-on-year trend?

# Yearly counts per local authority, without the 'Total' helper column
yearly_data = removed_by_year_la.drop(columns='Total')

# General trend: removals per year summed over all local authorities, then the
# year-on-year percentage change of that total (two decimals)
general_trend = yearly_data.sum(axis=0)
general_pct_change = (general_trend.pct_change() * 100).round(2)
print("📈 General % Change in Charity Removals per Year:")
print(general_pct_change)

# Same percentage change for the first 15 rows (the table is sorted by 'Total')
top15 = removed_by_year_la.head(15).drop(columns='Total')
top15_pct_change = (top15.pct_change(axis=1) * 100).round(2)

# Deviation from the general trend, aligned on the year columns; cells within
# ±40 percentage points are blanked out (NaN)
deviation_from_general = top15_pct_change.subtract(general_pct_change, axis=1)
strong_deviations = deviation_from_general.where(deviation_from_general.abs() > 40)

print("📌 Strong Deviations (>±40% from general trend):")
print(strong_deviations.dropna(how='all'))


In [None]:
# Removals per financial year and classification, FY 2015–2024.

# Normalise the join key in both tables: string, trimmed, left-padded with zeros to 6 characters
for frame in (dataset, df_category):
    frame['registered_charity_number'] = (
        frame['registered_charity_number'].astype(str).str.strip().str.zfill(6)
    )

# Inner join: only charity numbers present in both tables survive
merged = dataset.merge(df_category, on='registered_charity_number', how='inner')

# Keep removals in financial years 2015–2024 (reads the removal_fy column already on dataset)
in_fy_window = merged['removal_fy'].between(2015, 2024)
filtered = merged[in_fy_window]

# One row per (financial year, classification) with the number of merged rows
category_trend = (
    filtered
    .groupby(['removal_fy', 'classification_description'])
    .size()
    .reset_index(name='count')
)

In [None]:
# Ten classifications with the highest removal count where removal_fy equals 2022
removals_fy2022 = category_trend.loc[category_trend['removal_fy'] == 2022]
top10_2022 = removals_fy2022.sort_values(by='count', ascending=False).iloc[:10]

print(top10_2022)


In [None]:
# Wide table: one row per removal_fy, one column per classification;
# combinations that never occur are filled with 0.
category_timeline = pd.pivot(
    category_trend,
    index='removal_fy',
    columns='classification_description',
    values='count',
).fillna(0)

In [None]:
# Five most-removed classifications within each year, shown as grouped bars.

# Sort by year, then by count (largest first), so head(5) per group picks each year's top five
ranked_by_year = category_trend.sort_values(['removal_fy', 'count'], ascending=[True, False])
top5_per_year = ranked_by_year.groupby('removal_fy').head(5)

plt.figure(figsize=(12, 6))
sns.barplot(
    data=top5_per_year,
    x='removal_fy',
    y='count',
    hue='classification_description',
)

plt.title("Top 5 Removed Charity Categories by Year")
plt.xlabel("Year")
plt.ylabel("Number of Removals")
plt.xticks(rotation=45)
# Legend sits outside the axes, to the right
plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# Timeline of removals for every classification, one coloured line per category.

# Number of categories
num_lines = len(category_timeline.columns)

# Colormap resampled to one entry per category (e.g. 'tab20', 'nipy_spectral', 'gist_ncar').
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
colors = plt.get_cmap('tab20', num_lines)

# Plot with colours: colors(i) is the i-th entry of the resampled colormap
plt.figure(figsize=(15, 8))
for i, col in enumerate(category_timeline.columns):
    plt.plot(
        category_timeline.index,
        category_timeline[col],
        marker='o',
        label=col,
        color=colors(i)
    )

plt.title("Timeline of Charity Removals by Category (2015–2024)")
plt.xlabel("Year")
plt.ylabel("Number of Removals")
# Two-column legend placed outside the axes, to the right
plt.legend(title="Category", bbox_to_anchor=(1.05, 1), loc='upper left', ncol=2)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Trend over time for the ten classifications with the most removals in 2022.

# Names of the ten classifications with the highest count where removal_fy equals 2022
removals_fy2022 = category_trend.loc[category_trend['removal_fy'] == 2022]
top10_2022_categories = (
    removals_fy2022
    .sort_values(by='count', ascending=False)['classification_description']
    .head(10)
    .tolist()
)

# Wide table: rows are years, columns are classifications, missing combinations are 0
category_timeline = pd.pivot(
    category_trend,
    index='removal_fy',
    columns='classification_description',
    values='count',
).fillna(0)

# Restrict the columns to those ten classifications, in ranking order
top10_timeline = category_timeline.loc[:, top10_2022_categories]

# One line per classification
plt.figure(figsize=(14, 7))
for category, counts in top10_timeline.items():
    plt.plot(top10_timeline.index, counts, marker='o', label=category)

plt.title("Top 10 Charity Removals (2022) – Trends Over Time")
plt.xlabel("Year")
plt.ylabel("Number of Removals")
plt.legend(title="Category", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()



In [None]:
# For each size category: yearly removals of its 15 most-removed classifications.

# Step 1: Prepare data
# Normalise the join key in both tables: string, trimmed, zero-padded to 6 characters
dataset['registered_charity_number'] = dataset['registered_charity_number'].astype(str).str.strip().str.zfill(6)
df_category['registered_charity_number'] = df_category['registered_charity_number'].astype(str).str.strip().str.zfill(6)

# The calendar year of removal is stored in `removal_year`. It must not be written to
# `removal_fy`: that column holds financial years, and overwriting it with calendar
# years would silently change every other cell that reads it.
dataset['removal_year'] = pd.to_datetime(dataset['date_of_removal'], errors='coerce').dt.year

merged = pd.merge(dataset, df_category, on='registered_charity_number', how='inner')
filtered = merged[merged['removal_year'].between(2015, 2024)]

# Step 2: Plot the top 15 classifications per size category
for size in ['Small', 'Medium', 'Large']:
    size_df = filtered[filtered['size_category'] == size]

    # 15 classifications with the most removals across all years in the window
    top15_categories = (
        size_df.groupby('classification_description')
        .size()
        .sort_values(ascending=False)
        .head(15)
        .index
    )

    # Trend table: rows are removal years, columns are those classifications, gaps are 0
    timeline = (
        size_df[size_df['classification_description'].isin(top15_categories)]
        .groupby(['removal_year', 'classification_description'])
        .size()
        .reset_index(name='count')
        .pivot(index='removal_year', columns='classification_description', values='count')
        .fillna(0)
    )

    # One figure per size category, one line per classification
    plt.figure(figsize=(14, 7))
    for col in timeline.columns:
        plt.plot(timeline.index, timeline[col], marker='o', label=col)

    plt.title(f"Top 15 Removed Categories Over Time – {size} Charities (2015–2024)")
    plt.xlabel("Year")
    plt.ylabel("Number of Removals")
    plt.legend(title="Category", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()
    plt.show()



In [None]:
# Yearly registrations (2015–2024) for the ten classifications registered most often.

# Prep: zero-pad the join key to 6 characters in both tables, derive the registration year
for frame in (dataset, df_category):
    frame['registered_charity_number'] = frame['registered_charity_number'].astype(str).str.zfill(6)
dataset['registration_year'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce').dt.year

# Merge (inner join on charity number), then keep registrations from 2015–2024
merged_reg = dataset.merge(df_category, on='registered_charity_number', how='inner')
filtered_reg = merged_reg[merged_reg['registration_year'].between(2015, 2024)]

# Top 10 categories overall, by number of merged rows in the window
top10_reg = (
    filtered_reg.groupby('classification_description')
    .size()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# Timeline of top 10: rows are years, columns are classifications, gaps are 0
top10_rows = filtered_reg[filtered_reg['classification_description'].isin(top10_reg)]
reg_timeline = (
    top10_rows
    .groupby(['registration_year', 'classification_description'])
    .size()
    .reset_index(name='count')
    .pivot(index='registration_year', columns='classification_description', values='count')
    .fillna(0)
)

# Plot: one line per classification
plt.figure(figsize=(14, 7))
for category, counts in reg_timeline.items():
    plt.plot(reg_timeline.index, counts, marker='o', label=category)

plt.title("Top 10 Registered Charity Categories Over Time (2015–2024)")
plt.xlabel("Year")
plt.ylabel("Number of Registrations")
plt.legend(title="Category", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Per size category: yearly registrations of its ten most-registered classifications.
# Reads filtered_reg, the merged table already restricted to registration years 2015–2024.
for size in ['Small', 'Medium', 'Large']:
    size_df = filtered_reg[filtered_reg['size_category'] == size]

    # Ten classifications with the most rows for this size
    category_totals = size_df.groupby('classification_description').size()
    top10_size = category_totals.sort_values(ascending=False).head(10).index

    # Rows are years, columns are those classifications, gaps are 0
    in_top10 = size_df['classification_description'].isin(top10_size)
    size_timeline = (
        size_df[in_top10]
        .groupby(['registration_year', 'classification_description'])
        .size()
        .reset_index(name='count')
        .pivot(index='registration_year', columns='classification_description', values='count')
        .fillna(0)
    )

    # One figure per size category, one line per classification
    plt.figure(figsize=(14, 7))
    for col in size_timeline.columns:
        plt.plot(size_timeline.index, size_timeline[col], marker='o', label=col)

    plt.title(f"Top 10 Registered Categories – {size} Charities (2015–2024)")
    plt.xlabel("Year")
    plt.ylabel("Number of Registrations")
    plt.legend(title="Category", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# Registrations per local authority and calendar year (2015–2024), with a row total.

# Ensure date format and derive the calendar year of registration
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')
dataset['registration_year'] = dataset['date_of_registration'].dt.year

# Only rows that have both a registration year and a local authority can be counted
has_year_and_la = dataset['registration_year'].notna() & dataset['local_authority'].notna()
registered = dataset[has_year_and_la]

# Rows: local authority; columns: registration year in ascending order; cells: counts
registration_counts = registered.groupby(['local_authority', 'registration_year']).size()
registered_by_year_la = registration_counts.unstack(fill_value=0).sort_index(axis=1)

# Keep only years 2015–2024 (those actually present as columns)
years = list(range(2015, 2025))
years_present = registered_by_year_la.columns.intersection(years)
registered_by_year_la = registered_by_year_la[years_present]

# Row total over the kept years, then largest total first
registered_by_year_la['Total'] = registered_by_year_la.sum(axis=1)
registered_by_year_la = registered_by_year_la.sort_values(by='Total', ascending=False)

# Preview
print("Registered Charities per Year per Local Authority (2015–2024):")
print(registered_by_year_la.head(10))


In [None]:
# Year-on-year percentage change in registrations for every local authority,
# computed across the 2015–2024 columns and rounded to two decimals.
years = list(range(2015, 2025))
registered_pct_change = registered_by_year_la[years].pct_change(axis=1).mul(100).round(2)

# View result
registered_pct_change


In [None]:
# Styled percentage-change table for the 15 local authorities with the most registrations.

# The 15 authorities with the largest 'Total'
top15_las = registered_by_year_la.sort_values(by='Total', ascending=False).index[:15]

# Their percentage changes, without 2015 (the first year has no predecessor, so it is all NaN)
top15_registered_pct_change = registered_pct_change.loc[top15_las].drop(columns=[2015])

# Row-wise highlighting of each authority's two largest and two smallest changes
styled_top15 = top15_registered_pct_change.style.apply(highlight_top2_bottom2, axis=1)

# Show
styled_top15


In [None]:
# How far do the 15 authorities with the most registrations deviate from the overall
# year-on-year trend?

# Yearly counts per local authority, without the 'Total' helper column
yearly_registered = registered_by_year_la.drop(columns='Total')

# General trend: registrations per year summed over all local authorities, then the
# year-on-year percentage change of that total (two decimals)
general_reg_trend = yearly_registered.sum(axis=0)
general_reg_pct_change = (general_reg_trend.pct_change() * 100).round(2)
print("📈 General % Change in Charity Registrations per Year:")
print(general_reg_pct_change)

# Same percentage change for the first 15 rows (the table is sorted by 'Total')
top15_reg = registered_by_year_la.head(15).drop(columns='Total')
top15_reg_pct_change = (top15_reg.pct_change(axis=1) * 100).round(2)

# Deviation from the general trend, aligned on the year columns
deviation_from_general_reg = top15_reg_pct_change.subtract(general_reg_pct_change, axis=1)

# Cells within ±40 percentage points are blanked out (NaN)
strong_reg_deviations = deviation_from_general_reg.where(deviation_from_general_reg.abs() > 40)

# Output: authorities with at least one strong deviation
print("📌 Strong Deviations in Registrations (>±40% from general trend):")
print(strong_reg_deviations.dropna(how='all'))


In [None]:
# Five most-registered classifications within each registration year (2015–2024).

# Registration year from the parsed registration date
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'], errors='coerce')
dataset['registration_year'] = dataset['date_of_registration'].dt.year

# Normalise the join key in both tables: string, trimmed, left-padded with zeros to 6 characters
for frame in (dataset, df_category):
    frame['registered_charity_number'] = (
        frame['registered_charity_number'].astype(str).str.strip().str.zfill(6)
    )

# Inner join on charity number, then keep the registration years of interest
merged_reg = dataset.merge(df_category, on='registered_charity_number', how='inner')
in_year_window = merged_reg['registration_year'].between(2015, 2024)
filtered_reg = merged_reg[in_year_window]

# One row per (year, classification) with the number of merged rows
category_reg_trend = (
    filtered_reg
    .groupby(['registration_year', 'classification_description'])
    .size()
    .reset_index(name='count')
)

# Sort by year, then by count (largest first), so head(5) per group picks each year's top five
ranked_reg_by_year = category_reg_trend.sort_values(
    ['registration_year', 'count'], ascending=[True, False]
)
top5_registered_per_year = ranked_reg_by_year.groupby('registration_year').head(5)

# Optional preview
print(top5_registered_per_year)


In [None]:
# Grouped bars: each year's five most-registered classifications, coloured by classification.
plt.figure(figsize=(12, 6))
barplot_columns = dict(x='registration_year', y='count', hue='classification_description')
sns.barplot(data=top5_registered_per_year, **barplot_columns)

plt.title("Top 5 Registered Charity Categories by Year")
plt.xlabel("Year")
plt.ylabel("Number of Registrations")
plt.xticks(rotation=45)
# Legend sits outside the axes, to the right
plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
