In [1]:
import pandas as pd
import glob
import math
import xlrd
import openpyxl
import re

# Define date interval quartals

def get_quartal(date):
    if date.month in [1, 2, 3]:
        return 'Q1'
    elif date.month in [4, 5, 6]:
        return 'Q2'
    elif date.month in [7, 8, 9]:
        return 'Q3'
    else:
        return 'Q4'

In [2]:
filepath = "/Users/graciaandriamiadana/Documents/Research Culture PGR project/data/Combined PGR studenship payments 2023-24.xlsx"
df = pd.read_excel(filepath, engine= 'openpyxl')
df.head(5)

Unnamed: 0,Student ID,Budget code,A/c Code,2023-10-01 00:00:00,2023-11-01 00:00:00,2023-12-01 00:00:00,2024-01-01 00:00:00,2024-02-01 00:00:00,2024-03-01 00:00:00,2024-04-01 00:00:00,2024-05-01 00:00:00,2024-06-01 00:00:00,2024-07-01 00:00:00,2024-08-01 00:00:00,2024-09-01 00:00:00,School,Start date,End date,Duration,Source
0,230872255,SMD1011B,3620.0,5155.5,,,5155.5,,,5155.5,,,5155.5,,,AIDD_BCI,2023-10-01 00:00:00,2027-09-30 00:00:00,48,QMUL Match Funding
1,200343938,ECSY1A3R,3620.0,5155.5,,,5155.5,,,5155.5,,,5155.5,,,AIDD_EECS,2022-10-01 00:00:00,2026-09-30 00:00:00,48,BBSRC
2,220679211,ECSY1A2R,3620.0,5155.5,,,5155.5,,,5155.5,,,5155.5,,,AIDD_EECS,2022-12-01 00:00:00,2026-11-30 00:00:00,48,BBSRC
3,190861115,DRIY1A1R,3620.0,5155.5,,,5155.5,,,5155.5,,,5155.5,,,AIDD_EECS,2023-10-01 00:00:00,2027-09-30 00:00:00,48,BBSRC
4,230934704,ECSY1A4R,3620.0,5155.5,,,5155.5,,,5155.5,,,5155.5,,,AIDD_EECS,2023-10-01 00:00:00,2027-09-30 00:00:00,48,BBSRC


In [3]:
# Reshape the DataFrame using pd.melt() to unpivot date columns
df_melted = pd.melt(df, id_vars=['Student ID', 'Budget code', 'A/c Code', 'School', 'Start date', 'End date', 'Duration', 'Source'],
                    var_name='Date', value_name='Amount')

# Convert 'Date' column to datetime format
df_melted['Date'] = pd.to_datetime(df_melted['Date'], errors='coerce')  # 'coerce' to handle any parsing errors

# # Convert non-numeric strings to NaN in 'Amount' columns
df_melted['Amount'] = pd.to_numeric(df_melted['Amount'], errors='coerce')

# Convert the "Source" column to strings 
df_melted["Source"] = df_melted["Source"].astype(str)

In [6]:
df_melted['School'] = df_melted['School'].fillna('')

# Perform case-insensitive replacements and transformations in the 'School' column of df_melted
replacements = {
    r'.*EECS.*': 'School of Electronic Engineering and Computer Science',        
    r'^.*Geog.*$': 'School of Geography',   
    r'.*BCI.*': 'Barts Cancer Institute', 
    r'.*Blizard.*': 'Blizard Institute', 
    r'.*BLIZARD.*': 'Blizard Institute',  
    r'^.*SED.*$': 'School of English and Drama' ,          
    r'^.*IPHS.*$': 'Institute of Population Health Sciences' ,         
    r'^.*SLLF.*$': 'School of Languages, Linguistics and Film' ,          
    r'^.*History.*$': 'School of History',           
    r'^.*Law.*$': 'School of Law',
    r'^.*WHRI.*$': 'William Harvey Research Institute',
    r'^.*WIPH.*$': 'Wolfson Institute of Population Health',
    r'^.*DCE_Maths.*$': 'Data-Centric Engineering/Mathematical Sciences',
    r'^.*SMS.*$': 'School of Mathematical Sciences',
    r'^.*IoD.*$': 'Institute of Dentistry',
    r'^.*SBBS.*$': 'School of Biological and Behavioural Sciences',
    r'^.*SBM.*$': 'School of Business and Management',
    r'^.*SEF.*$': 'School of Economics and Finance',
    r'^.*SEMS.*$': 'School of Engineering and Materials Science',
    r'^.*SPCS.*$': 'School of Physical and Chemical Sciences',
    r'^.*SPIR.*$': 'School of Politics and International Relations',
}          


# Iterate over the dictionary and perform replacements
for pattern, replacement in replacements.items():
    mask = df_melted['School'].str.contains(pattern, case=True, regex=True, na=False)
    df_melted.loc[mask, 'School'] = replacement

# Remove rows where 'School' is an empty string
df_melted = df_melted[df_melted['School'] != '']


print(sorted(df_melted["School"].unique().tolist()))

['Barts Cancer Institute', 'Blizard Institute', 'Data-Centric Engineering/Mathematical Sciences', 'Institute of Dentistry', 'Institute of Population Health Sciences', 'School of Biological and Behavioural Sciences', 'School of Business and Management', 'School of Economics and Finance', 'School of Electronic Engineering and Computer Science', 'School of Engineering and Materials Science', 'School of English and Drama', 'School of Geography', 'School of History', 'School of Languages, Linguistics and Film', 'School of Law', 'School of Mathematical Sciences', 'School of Physical and Chemical Sciences', 'School of Politics and International Relations', 'William Harvey Research Institute', 'Wolfson Institute of Population Health']


In [None]:
# Read the Excel file containing the category mappings
mapping_file = '/Users/graciaandriamiadana/Documents/Research Culture PGR project/data/Sources classification.xlsx'
category_mapping = pd.read_excel(mapping_file)

# Initialize an empty dictionary for replacements
replacements = {}

# Iterate over each category and its sub-names to generate regex patterns
for category in category_mapping.columns:
    for name in category_mapping[category].dropna():
        # Create a regex pattern that matches the name within any text
        pattern = rf'.*{re.escape(name)}.*'
        replacements[pattern] = category

In [9]:
# replacements[r'^CDA supplemen.*'] = 'UKRI'
# replacements[r'^AIM CDT.*'] = 'UKRI'

replacements[r'^\?.*'] = 'not available'
replacements[r'^nan$'] = 'not available'

# Additional entries grouped under "Charities"
charities_entries = [
    'HS Barlow Charitable Trust/ Paragraf',
    'Heart Research UK',
    'Horne Family Charitable Fdn',
    'Horne Family foundation',
    'Horne Foundation',
    'Bowel Research UK',
    'Welcome',
    'Wellcome',
    'PWSA UK',
    'Prostate Cancer UK',
    'Stuart Hall Foundation (SHF).',
    'Versus arthritis',
    'ANTRUK (Antibiotic Research UK)',
    'Animal Free Research UK',
    'Barry Reed Foundation',
    "Tommy's Charity",
    'Charity',

]

# Add these entries to the replacements dictionary under "Charities"
for entry in charities_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'Charities'

# Additional entries grouped under "Industry"
industry_entries = [
    'AstraZeneca',
    'Artios Pharma',
    'Huawei',
    'Industrial top-up',
    'Industry',
    'Industry (Creative Assembly)',
    'LTA Cola',
    'META',
    'Meta Platforms, Inc.',
    'Microsoft',
    'COLA',
    'ICase industry partner',
    'Matching industry contribution',
    'industry'
]

# Add these entries to the replacements dictionary under "Industry"
for entry in industry_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'Industry'

# Additional entries grouped under "UKRI"
ukri_entries = [
    'CDA supplement',
    'AIM CDT',
    'LiDo',
    'LISS',
    'CDT',
    'EPRSC DTP CASE Conversion 2021',

]

# Add these entries to the replacements dictionary under "UKRI"
for entry in ukri_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'UKRI'

# Additional entries grouped under "Internal"
internal_entries = [
    'SBBS',
    'SPIR',
    'SPCS',
    'WIPH',
    'Wolfson',
    'Supervisor Project',
    'Supervisor project',
    'Faculty',
    'Faculty Match fund',
    'Faculty Match fund.  P/T rates from 01-Oct-22.',
    'Faculty match fund (50%)',
    'BAME studentship',
    'S&E Flexible Match funding',
    'S&E Match Funded',
    '50% S&E matched funding',
    'BCSC',
    'BCSC Alexandra Carrell',
    'Match Funding',
    'PHURI', # Institute at Whitechapel

]

# Add these entries to the replacements dictionary under "Internal"
for entry in internal_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'Internal'

# Additional entries grouped under "Other"
other_entries = [
    'DSTL',
    'Defence Science & Tech Lab',
    'Defence Science and Tech Lab, Gov UK',
    'One off payment for CDA top up back pay. Paid out on 25-Aug-23',
    'One off payment for historial 3m cost of living uplift',
    'One-off top-up => paid in wkly list 25-apr-2023 => ideally, it wasgoint to be an extension of funding for Oct-Dec 2023 => however, the supervisor, Dr Pearce, had to put this payment through now as his grant closes in June 2023',
    'S/s via private donation',
    'Government'
]

# Add these entries to the replacements dictionary under "Other"
for entry in other_entries:
    pattern = rf'.*{re.escape(entry)}.*'
    replacements[pattern] = 'Other'

# print(replacements)

for pattern, replacement in replacements.items():
    mask = df_melted['Source'].str.contains(pattern, case=True, regex=True, na=False)
    df_melted.loc[mask, 'Source'] = replacement


print(len(df_melted["Source"].unique().tolist()))
(sorted(df_melted["Source"].unique().tolist()))


45


['50% LMK Thermosafe',
 'AIDD CTP',
 'AIDD CTP match ',
 'Acutus Medical',
 'Altos Labs',
 'Aston Martin Formula One Team',
 'BBKA-British Beekeepers Assoc',
 'Bela - 25%',
 'Belgian Rsch Inst, VITO',
 'CASE Awards',
 'Carbon Numbers Ltd and match funded',
 'Carl Zeiss',
 'Charities',
 'DAACI',
 'DEFRA',
 'Deepmind',
 'Delphia',
 'EU',
 'Environmental Agency top up only',
 'Evonik Operations GmbH',
 'Flamin-GO',
 'Flexible Resarch Fund',
 'GambleAware',
 'ISIS',
 'Industry',
 'Internal',
 'Music Tribe - 25%',
 'Myerscough',
 'NDA',
 'National Nuclear Lab Ltd',
 'Other',
 'PGRF',
 'Perren Award',
 'Placement funding via NoPla',
 'Qinetiq',
 'Reckitt Benckiser',
 'S&E and Bit Bio Ltd',
 'Servier',
 'Stuart Hall Foundation HSS Research Studenship',
 'Sumitomo Corp Europe',
 'Syngenta',
 'UKRI',
 'UMG - 50%',
 'iCASE',
 'not available']

In [54]:
# (df_melted.head(10))

In [10]:
# Group by 'Budget code' and sum the 'Amount' for each budget code
budget_totals = df_melted.groupby('Budget code')['Amount'].sum().reset_index()
budget_totals['Formatted Amount'] = budget_totals['Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol
budget_totals.rename(columns={'Budget code': 'Budget code', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code
# print(budget_totals)

#### --- if requested by quartal: --- ####
# Apply custom date intervals to 'Date' column
df_melted['Quartal'] = df_melted['Date'].apply(get_quartal)

# Group by 'Budget code' and 'Quartal' and sum the 'Amount' for each combination
budget_quartal_totals = df_melted.groupby(['Budget code', 'Quartal'])['Amount'].sum().reset_index()

# Display the total amounts per budget code and quartal
# print(budget_quartal_totals)

# Group by 'School' and sum the 'Amount' for each School
studentship_payments_by_school = df_melted.groupby('School')['Amount'].sum().reset_index()

# Format the 'Amount' column
studentship_payments_by_school['Formatted Amount'] = studentship_payments_by_school['Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol

# Rename columns for better display
studentship_payments_by_school.rename(columns={'School': 'School Name', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code
# print(studentship_payments_by_school)


# Group by 'Budget code' and 'Quartal' and sum the 'Amount' for each combination
studentship_payments_by_school_quartal_totals = df_melted.groupby(['School', 'Quartal'])['Amount'].sum().reset_index()

# Format the 'Amount' column
studentship_payments_by_school_quartal_totals['Formatted Amount'] = studentship_payments_by_school_quartal_totals['Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol

# Rename columns for better display
studentship_payments_by_school.rename(columns={'School': 'School Name', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code and quartal
# print(studentship_payments_by_school_quartal_totals)

# Group by 'Source' and sum the 'Amount' for each School
studentship_payments_by_source = df_melted.groupby('Source')['Amount'].sum().reset_index()

# Format the 'Amount' column
studentship_payments_by_source['Formatted Amount'] = studentship_payments_by_source['Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol

# Rename columns for better display
studentship_payments_by_source.rename(columns={'Source': 'Source Name', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code
studentship_payments_by_source

Unnamed: 0,Source Name,Amount,Total Amount (£)
0,50% LMK Thermosafe,10311.0,"£10,311.00"
1,AIDD CTP,39525.5,"£39,525.50"
2,AIDD CTP match,20622.0,"£20,622.00"
3,Acutus Medical,9244.73,"£9,244.73"
4,Altos Labs,5000.0,"£5,000.00"
5,Aston Martin Formula One Team,20622.0,"£20,622.00"
6,BBKA-British Beekeepers Assoc,750.0,£750.00
7,Bela - 25%,5155.5,"£5,155.50"
8,"Belgian Rsch Inst, VITO",20622.0,"£20,622.00"
9,CASE Awards,8000.0,"£8,000.00"


In [11]:
budget_totals

Unnamed: 0,Budget code,Amount,Total Amount (£)
0,ART4020B,10611.000000,"£10,611.00"
1,ART4020B,334839.610000,"£334,839.61"
2,ART4020B,15466.480000,"£15,466.48"
3,ART4040B,20622.000000,"£20,622.00"
4,ASTF1B3R,20622.000000,"£20,622.00"
...,...,...,...
441,TMEN1A1R,21000.000000,"£21,000.00"
442,TMTG1K4R,25718.916667,"£25,718.92"
443,TMTL1D5R,18106.500000,"£18,106.50"
444,TMTW1A2R,3200.000000,"£3,200.00"


In [13]:
studentship_payments_by_school

Unnamed: 0,School Name,Amount,Total Amount (£)
0,Barts Cancer Institute,1389498.0,"£1,389,498.25"
1,Blizard Institute,494816.3,"£494,816.26"
2,Data-Centric Engineering/Mathematical Sciences,11902.5,"£11,902.50"
3,Institute of Dentistry,21290.42,"£21,290.42"
4,Institute of Population Health Sciences,19964.0,"£19,964.00"
5,School of Biological and Behavioural Sciences,1235534.0,"£1,235,533.68"
6,School of Business and Management,694274.0,"£694,274.00"
7,School of Economics and Finance,995104.2,"£995,104.25"
8,School of Electronic Engineering and Computer ...,2816169.0,"£2,816,168.75"
9,School of Engineering and Materials Science,1517174.0,"£1,517,173.60"


In [29]:
studentship_payments_by_school_quartal_totals

Unnamed: 0,School,Quartal,Total_Amount,Formatted Amount
0,BCI,Q1,374000.533333,"£374,000.53"
1,BCI,Q2,336123.750000,"£336,123.75"
2,BCI,Q3,330234.000000,"£330,234.00"
3,BCI,Q4,396847.333333,"£396,847.33"
4,Blizard Institute,Q1,291210.881900,"£291,210.88"
...,...,...,...,...
83,WHRI,Q4,304288.947500,"£304,288.95"
84,WIPH,Q1,266884.265000,"£266,884.27"
85,WIPH,Q2,268415.407527,"£268,415.41"
86,WIPH,Q3,256744.450000,"£256,744.45"
