In [1]:
import pandas as pd
import glob
import math
import xlrd
import openpyxl
import re

# Define date interval quartals

def get_quartal(date):
    if date.month in [1, 2, 3]:
        return 'Q1'
    elif date.month in [4, 5, 6]:
        return 'Q2'
    elif date.month in [7, 8, 9]:
        return 'Q3'
    else:
        return 'Q4'

In [2]:
filepath = "/Users/graciaandriamiadana/Documents/Research Culture PGR project/data/Combined PGR studenship payments 2022-23.xlsx"
df = pd.read_excel(filepath, engine= 'openpyxl')
df.head(5) 

Unnamed: 0,Student ID,Budget code,A/c Code,2022-10-01 00:00:00,2022-11-01 00:00:00,2022-12-01 00:00:00,2023-01-01 00:00:00,2023-02-01 00:00:00,2023-03-01 00:00:00,2023-04-01 00:00:00,...,2023-06-01 00:00:00,2023-07-01 00:00:00,2023-08-01 00:00:00,2023-09-01 00:00:00,School,Start date,End date,Duration,Source,uplift to 1/10/2022 payment for cost of living increase
0,200343938,ECSY1A3R,3620,4515.5,,,4917.0,,,4917,...,,4917,,,AIDD_EECS,2022-10-01 00:00:00,2026-09-30 00:00:00,48,BBSRC,401.5
1,220679211,ECSY1A2R,3620,,,6556.0,,,,4917,...,,4917,,,AIDD_EECS,2022-12-01 00:00:00,2026-11-30 00:00:00,48,BBSRC,
2,200346331,BCCG1D3R,3620,5250.0,,,5750.0,,,5750,...,,5750,,,BCI,2021-10-01 00:00:00,2024-09-30 00:00:00,36,CRUK + CoL Centre,500.0
3,200826925,IRM9045B,3620,4917.0,4917.0,,4917.0,,,4917,...,,4917,,,BCI,2022-10-01 00:00:00,2025-09-30 00:00:00,36,BCSC,0.0
4,200974709,PRN1040R,3620,4515.5,,,4917.0,,,4917,...,,4917,,,BCI,2020-10-01 00:00:00,2024-09-30 00:00:00,48,BBSRC LIDo,401.5


In [3]:
# Reshape the DataFrame using pd.melt() to unpivot date columns
df_melted = pd.melt(df, id_vars=['Student ID', 'Budget code', 'A/c Code', 'School', 'Start date', 'End date', 'Duration', 'Source'],
                    var_name='Date', value_name='Amount')

# Convert 'Date' column to datetime format
df_melted['Date'] = pd.to_datetime(df_melted['Date'], errors='coerce')  # 'coerce' to handle any parsing errors

# # Convert non-numeric strings to NaN in 'Amount' columns
df_melted['Amount'] = pd.to_numeric(df_melted['Amount'], errors='coerce')

# Convert the "Source" column to strings 
df_melted["Source"] = df_melted["Source"].astype(str).fillna('not available')

In [4]:
# Uncomment this block for dataset 2022-23
# but do not run for all other datasets

# Handle additional column ('uplift to 1/10/2022 payment for cost of living increase')
df_melted['uplift_amount'] = pd.to_numeric(df['uplift to 1/10/2022 payment for cost of living increase '], errors='coerce')

# Combine 'Amount' and 'uplift_amount' columns
df_melted['Total_Amount'] = df_melted['Amount'].fillna(0) + df_melted['uplift_amount'].fillna(0)

In [5]:
df_melted['School'] = df_melted['School'].fillna('')

# Perform case-insensitive replacements and transformations in the 'School' column of df_melted
# replacements = {
#     r'.*EECS.*': 'School of Electronic Engineering and Computer Science',        
#     r'^.*Geog.*$': 'School of Geography',   
#     r'.*BCI.*': 'Barts Cancer Institute', 
#     r'.*Blizard.*': 'Blizard Institute', 
#     r'.*BLIZARD.*': 'Blizard Institute',  
#     r'^.*SED.*$': 'School of English and Drama' ,          
#     r'^.*IPHS.*$': 'Institute of Population Health Sciences' ,         
#     r'^.*SLLF.*$': 'School of Languages, Linguistics and Film' ,          
#     r'^.*History.*$': 'School of History',           
#     r'^.*Law.*$': 'School of Law',
#     r'^.*WHRI.*$': 'William Harvey Research Institute',
#     r'^.*WIPH.*$': 'Wolfson Institute of Population Health',
#     r'^.*DCE_Maths.*$': 'Data-Centric Engineering/Mathematical Sciences',
#     r'^.*SMS.*$': 'School of Mathematical Sciences',
#     r'^.*IoD.*$': 'Institute of Dentistry',
#     r'^.*SBBS.*$': 'School of Biological and Behavioural Sciences',
#     r'^.*SBM.*$': 'School of Business and Management',
#     r'^.*SEF.*$': 'School of Economics and Finance',
#     r'^.*SEMS.*$': 'School of Engineering and Materials Science',
#     r'^.*SPCS.*$': 'School of Physical and Chemical Sciences',
#     r'^.*SPIR.*$': 'School of Politics and International Relations',
# }         

replacements = {
    r'.*EECS.*': 'EECS',        
    # r'^.*Geog.*$': 'School of Geography',   
    r'.*BCI.*': 'BCI', 
    r'.*Blizard.*': 'Blizard', 
    r'.*BLIZARD.*': 'Blizard',  
    r'^School$': 'not available',
    r'^.*SBBS.*$': 'SBBS',
    r'^.*SED.*$': 'SED',
    # r'^.*SED.*$': 'School of English and Drama' ,          
    # r'^.*IPHS.*$': 'Institute of Population Health Sciences' ,         
    # r'^.*SLLF.*$': 'School of Languages, Linguistics and Film' ,          
    # r'^.*History.*$': 'School of History',           
    # r'^.*Law.*$': 'School of Law',
    # r'^.*WHRI.*$': 'William Harvey Research Institute',
    # r'^.*WIPH.*$': 'Wolfson Institute of Population Health',
    r'^.*DCE_Maths.*$': 'Maths',
    r'^.*SMS.*$': 'Maths',
    r'^.*IoD.*$': 'Dentistry',
} 

# Iterate over the dictionary and perform replacements
for pattern, replacement in replacements.items():
    mask = df_melted['School'].str.contains(pattern, case=True, regex=True, na=False)
    df_melted.loc[mask, 'School'] = replacement

# Remove rows where 'School' is an empty string
df_melted = df_melted[df_melted['School'] != '']

print(sorted(df_melted["School"].unique().tolist()))

['BCI', 'Blizard', 'Dentistry', 'EECS', 'Geog', 'History', 'Law', 'Maths', 'SBBS', 'SBM', 'SED', 'SEF', 'SEMS', 'SLLF', 'SPCS', 'SPIR', 'WHRI', 'WIPH', 'not available']


In [6]:
# Read the Excel file containing the category mappings
mapping_file = '/Users/graciaandriamiadana/Documents/Research Culture PGR project/data/Sources classification.xlsx'
category_mapping = pd.read_excel(mapping_file)

# Initialize an empty dictionary for replacements
replacements = {}

# Iterate over each category and its sub-names to generate regex patterns
for category in category_mapping.columns:
    for name in category_mapping[category].dropna():
        # Create a regex pattern that matches the name within any text
        pattern = rf'.*{re.escape(name)}.*'
        replacements[pattern] = category



In [7]:
# Sources_classification_21-22_23-24
# Read the Excel file containing the category mappings
mapping_file2 = '/Users/graciaandriamiadana/Documents/Research Culture PGR project/data/Sources_classification_21-22_23-24.csv'
# category_mapping2 = pd.read_csv(mapping_file)
try:
    category_mapping2 = pd.read_csv(mapping_file2, encoding='latin1')
except UnicodeDecodeError as e:
    print(f"Error reading the file with 'latin1' encoding: {e}")
    # Try another encoding if needed
    category_mapping2 = pd.read_csv(mapping_file2, encoding='ISO-8859-1')

# Remove the unwanted characters "', " from the 'Source' column
category_mapping2['Source'] = category_mapping2['Source'].str.replace(r"[\"',]", "", regex=True)

print(category_mapping2['Source'].tolist())

['50% LMK Thermosafe', 'ASDF', 'Aston Martin Formula One Team', 'Bela - 25%', 'Belgian Rsch Inst VITO', 'British Council', 'Byte Dance', 'Charities', 'Conacyt', 'Deepmind', 'DiscNet', 'EDA', 'EU', 'FAST', 'Industry', 'Internal', 'Music Tribe - 75%', 'NPIF Flex Fund extension', 'Other', 'Qinetiq', 'RadNet', 'Reckitt Benckiser', 'S&E and Bit Bio Ltd', 'Sumitomo Corp Europe', 'Supervisors EDA/Discretionary account', 'UKRI', 'UMG - 50%', 'Value 19-20', 'not available]', '50% LMK Thermosafe', 'AIDD CTP', 'AIDD CTP match ', 'Acutus Medical', 'Altos Labs', 'Aston Martin Formula One Team', 'BBKA-British Beekeepers Assoc', 'Bela - 25%', 'Belgian Rsch Inst VITO', 'CASE Awards', 'Carbon Numbers Ltd and match funded', 'Carl Zeiss', 'Charities', 'DAACI', 'DEFRA', 'Deepmind', 'Delphia', 'EU', 'Environmental Agency top up only', 'Evonik Operations GmbH', 'Flamin-GO', 'Flexible Resarch Fund', 'GambleAware', 'ISIS', 'Industry', 'Internal', 'Music Tribe - 25%', 'Myerscough', 'NDA', 'National Nuclear Lab

In [8]:
# Initialize an empty dictionary for replacements
replacements2 = {}

# Iterate over each row in the category_mapping DataFrame to generate regex patterns
for index, row in category_mapping2.iterrows():
    source = row['Source']
    classification = row['Source classification']
    # Create a regex pattern that matches the source within any text
    pattern = rf'.*{re.escape(source)}.*'
    replacements2[pattern] = classification

replacements.update(replacements2)

In [9]:
(sorted(df_melted["Source"].unique().tolist()))
(sorted(df_melted["School"].unique().tolist()))


['BCI',
 'Blizard',
 'Dentistry',
 'EECS',
 'Geog',
 'History',
 'Law',
 'Maths',
 'SBBS',
 'SBM',
 'SED',
 'SEF',
 'SEMS',
 'SLLF',
 'SPCS',
 'SPIR',
 'WHRI',
 'WIPH',
 'not available']

In [10]:
# replacements[r'^CDA supplemen.*'] = 'UKRI'
replacements[r'^Belgian Rsch Inst\, VITO.*'] = 'Other'
replacements[r'^Other .*'] = 'Other'
replacements[r'^\?.*'] = 'not available'
replacements[r'^nan$'] = 'not available'
replacements[r'^ '] = 'not available'
replacements[r'^\?.*'] = 'not available'
replacements[r'^nan$'] = 'not available'
replacements[r'^Source .*'] = 'Other'


# Additional entries grouped under "Charities"
charities_entries = [
    'HS Barlow Charitable Trust/ Paragraf',
    'Heart Research UK',
    'Horne Family Charitable Fdn',
    'Horne Family foundation',
    'Horne Foundation',
    'Bowel Research UK',
    'Welcome',
    'Wellcome',
    'PWSA UK',
    'Prostate Cancer UK',
    'Stuart Hall Foundation (SHF).',
    'Versus arthritis',
    'ANTRUK (Antibiotic Research UK)',
    'Animal Free Research UK',
    'Barry Reed Foundation',
    "Tommy's Charity",
    'Charity',

]

# Add these entries to the replacements dictionary under "Charities"
for entry in charities_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'Charities'

# Additional entries grouped under "Industry"
industry_entries = [
    'AstraZeneca',
    'Artios Pharma',
    'Huawei',
    'Industrial top-up',
    'Industry',
    'Industry (Creative Assembly)',
    'LTA Cola',
    'META',
    'Meta Platforms, Inc.',
    'Microsoft',
    'COLA',
    'ICase industry partner',
    'Matching industry contribution',
    'industry'
]

# Add these entries to the replacements dictionary under "Industry"
for entry in industry_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'Industry'

# Additional entries grouped under "UKRI"
ukri_entries = [
    'CDA supplement',
    'AIM CDT',
    'LiDo',
    'LISS',
    'CDT',
    'EPRSC DTP CASE Conversion 2021',

]

# Add these entries to the replacements dictionary under "UKRI"
for entry in ukri_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'UKRI'

# Additional entries grouped under "Internal"
internal_entries = [
    'SBBS',
    'SPIR',
    'SPCS',
    'WIPH',
    'Wolfson',
    'Supervisor Project',
    'Supervisor project',
    'Faculty',
    'Faculty Match fund',
    'Faculty Match fund.  P/T rates from 01-Oct-22.',
    'Faculty match fund (50%)',
    'BAME studentship',
    'S&E Flexible Match funding',
    'S&E Match Funded',
    '50% S&E matched funding',
    'BCSC',
    'BCSC Alexandra Carrell',
    'Match Funding',
    'PHURI', # Institute at Whitechapel

]

# Add these entries to the replacements dictionary under "Internal"
for entry in internal_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'Internal'

# Additional entries grouped under "Other"
other_entries = [
    'DSTL',
    'Defence Science & Tech Lab',
    'Defence Science and Tech Lab, Gov UK',
    'One off payment for CDA top up back pay. Paid out on 25-Aug-23',
    'One off payment for historial 3m cost of living uplift',
    'One-off top-up => paid in wkly list 25-apr-2023 => ideally, it wasgoint to be an extension of funding for Oct-Dec 2023 => however, the supervisor, Dr Pearce, had to put this payment through now as his grant closes in June 2023',
    'S/s via private donation',
    'Government',
    'Source'
]

# Add these entries to the replacements dictionary under "Other"
for entry in other_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'Other'

# print(replacements)
df_melted['Source'] = df_melted['Source'].fillna('not available').astype(str)


for pattern, replacement in replacements.items():
    mask = df_melted['Source'].str.contains(pattern, case=True, regex=True, na=False)
    df_melted.loc[mask, 'Source'] = replacement

df_melted['Source'] = df_melted['Source'].fillna('not available').astype(str)

print(len(df_melted["Source"].unique().tolist()))
(sorted(df_melted["Source"].unique().tolist()))


7


['Charities', 'EU', 'Industry', 'Internal', 'Other', 'UKRI', 'not available']

In [11]:
# Group by 'Budget code' and sum the 'Amount' for each budget code
budget_totals = df_melted.groupby('Budget code')['Total_Amount'].sum().reset_index()
budget_totals['Formatted Amount'] = budget_totals['Total_Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol
budget_totals.rename(columns={'Budget code': 'Budget code', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code
# print(budget_totals)

#### --- if requested by quartal: --- ####
# Apply custom date intervals to 'Date' column
df_melted['Quartal'] = df_melted['Date'].apply(get_quartal)

# Group by 'Budget code' and 'Quartal' and sum the 'Amount' for each combination
budget_quartal_totals = df_melted.groupby(['Budget code', 'Quartal'])['Total_Amount'].sum().reset_index()

# Display the total amounts per budget code and quartal
# print(budget_quartal_totals)

# Group by 'School' and sum the 'Amount' for each School
studentship_payments_by_school = df_melted.groupby('School')['Total_Amount'].sum().reset_index()

# Format the 'Amount' column
studentship_payments_by_school['Formatted Amount'] = studentship_payments_by_school['Total_Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol

# Rename columns for better display
studentship_payments_by_school.rename(columns={'School': 'School Name', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code
# print(studentship_payments_by_school)


# Group by 'Budget code' and 'Quartal' and sum the 'Amount' for each combination
studentship_payments_by_school_quartal_totals = df_melted.groupby(['School', 'Quartal'])['Total_Amount'].sum().reset_index()

# Format the 'Amount' column
studentship_payments_by_school_quartal_totals['Formatted Amount'] = studentship_payments_by_school_quartal_totals['Total_Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol

# Rename columns for better display
studentship_payments_by_school.rename(columns={'School': 'School Name', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code and quartal
# print(studentship_payments_by_school_quartal_totals)

# Group by 'Source' and sum the 'Amount' for each School
studentship_payments_by_source = df_melted.groupby('Source')['Total_Amount'].sum().reset_index()

# Format the 'Amount' column
studentship_payments_by_source['Formatted Amount'] = studentship_payments_by_source['Total_Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol

# Rename columns for better display
studentship_payments_by_source.rename(columns={'Source': 'Source Name', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code
studentship_payments_by_source

Unnamed: 0,Source Name,Total_Amount,Total Amount (£)
0,Charities,2980334.0,"£2,980,333.89"
1,EU,107092.9,"£107,092.92"
2,Industry,636621.5,"£636,621.46"
3,Internal,6691485.0,"£6,691,485.47"
4,Other,417037.7,"£417,037.70"
5,UKRI,6150131.0,"£6,150,131.31"
6,not available,166086.5,"£166,086.51"


In [12]:
# studentship_payments_by_source[studentship_payments_by_source['Source Name'] == "LIDo (BBSRC)"]
studentship_payments_by_source

Unnamed: 0,Source Name,Total_Amount,Total Amount (£)
0,Charities,2980334.0,"£2,980,333.89"
1,EU,107092.9,"£107,092.92"
2,Industry,636621.5,"£636,621.46"
3,Internal,6691485.0,"£6,691,485.47"
4,Other,417037.7,"£417,037.70"
5,UKRI,6150131.0,"£6,150,131.31"
6,not available,166086.5,"£166,086.51"


In [13]:
studentship_payments_by_school

Unnamed: 0,School Name,Total_Amount,Total Amount (£)
0,BCI,1457713.0,"£1,457,713.12"
1,Blizard,1144177.0,"£1,144,176.92"
2,Dentistry,64757.8,"£64,757.80"
3,EECS,2928362.0,"£2,928,361.71"
4,Geog,608190.5,"£608,190.46"
5,History,256467.1,"£256,467.08"
6,Law,430937.9,"£430,937.92"
7,Maths,813077.8,"£813,077.75"
8,SBBS,1320697.0,"£1,320,697.31"
9,SBM,682162.2,"£682,162.25"


In [14]:
studentship_payments_by_school.to_csv("studentship_payments_by_school_2022-23.csv")
studentship_payments_by_source.to_csv("studentship_payments_by_source_2022-23.csv")

In [15]:
# Calculate mean amount spent per source and per school
mean_amount_per_source = df_melted.groupby('Source')['Amount'].mean().to_frame(name='Mean Amount Source').sort_index()
mean_amount_per_school = df_melted.groupby('School')['Amount'].mean().to_frame(name='Mean Amount School').sort_index()

# Calculate frequency by source and by school
frequency_by_source = df_melted['Source'].value_counts().to_frame(name='Frequency Source').sort_index()
frequency_by_school = df_melted['School'].value_counts().to_frame(name='Frequency School').sort_index()

# Create the studentship payments by school and source tables
studentship_payments_by_school = df_melted.groupby('School').agg({'Amount': 'sum'}).sort_index()
studentship_payments_by_source = df_melted.groupby('Source').agg({'Amount': 'sum'}).sort_index()

# Save to Excel file
with pd.ExcelWriter('output2022-23.xlsx') as writer:
    mean_amount_per_source.to_excel(writer, sheet_name='Mean Amount per Source')
    frequency_by_source.to_excel(writer, sheet_name='Frequency by Source')
    studentship_payments_by_source.to_excel(writer, sheet_name='Payments by Source')
    mean_amount_per_school.to_excel(writer, sheet_name='Mean Amount per School')
    frequency_by_school.to_excel(writer, sheet_name='Frequency by School')
    studentship_payments_by_school.to_excel(writer, sheet_name='Payments by School')


In [16]:
mean_amount_per_source
frequency_by_school

Unnamed: 0_level_0,Frequency School
School,Unnamed: 1_level_1
BCI,1859
Blizard,1144
Dentistry,104
EECS,3679
Geog,832
History,403
Law,429
Maths,1053
SBBS,1300
SBM,546
