In [6]:
import pandas as pd
import glob
import math
import xlrd
import openpyxl

# Define date interval quartals

def get_quartal(date):
    if date.month in [1, 2, 3]:
        return 'Q1'
    elif date.month in [4, 5, 6]:
        return 'Q2'
    elif date.month in [7, 8, 9]:
        return 'Q3'
    else:
        return 'Q4'

In [7]:
filepath = "/Users/graciaandriamiadana/Documents/Research Culture PGR project/data/Combined PGR studenship payments 2022-23.xlsx"
df = pd.read_excel(filepath, engine= 'openpyxl')
df.head(5)

Unnamed: 0,Student ID,Budget code,A/c Code,2022-10-01 00:00:00,2022-11-01 00:00:00,2022-12-01 00:00:00,2023-01-01 00:00:00,2023-02-01 00:00:00,2023-03-01 00:00:00,2023-04-01 00:00:00,...,2023-06-01 00:00:00,2023-07-01 00:00:00,2023-08-01 00:00:00,2023-09-01 00:00:00,School,Start date,End date,Duration,Source,uplift to 1/10/2022 payment for cost of living increase
0,200343938,ECSY1A3R,3620,4515.5,,,4917.0,,,4917,...,,4917,,,AIDD_EECS,2022-10-01 00:00:00,2026-09-30 00:00:00,48,BBSRC,401.5
1,220679211,ECSY1A2R,3620,,,6556.0,,,,4917,...,,4917,,,AIDD_EECS,2022-12-01 00:00:00,2026-11-30 00:00:00,48,BBSRC,
2,200346331,BCCG1D3R,3620,5250.0,,,5750.0,,,5750,...,,5750,,,BCI,2021-10-01 00:00:00,2024-09-30 00:00:00,36,CRUK + CoL Centre,500.0
3,200826925,IRM9045B,3620,4917.0,4917.0,,4917.0,,,4917,...,,4917,,,BCI,2022-10-01 00:00:00,2025-09-30 00:00:00,36,BCSC,0.0
4,200974709,PRN1040R,3620,4515.5,,,4917.0,,,4917,...,,4917,,,BCI,2020-10-01 00:00:00,2024-09-30 00:00:00,48,BBSRC LIDo,401.5


In [8]:
# Reshape the DataFrame using pd.melt() to unpivot date columns
df_melted = pd.melt(df, id_vars=['Student ID', 'Budget code', 'A/c Code', 'School', 'Start date', 'End date', 'Duration', 'Source'],
                    var_name='Date', value_name='Amount')

# Convert 'Date' column to datetime format
df_melted['Date'] = pd.to_datetime(df_melted['Date'], errors='coerce')  # 'coerce' to handle any parsing errors

# # Convert non-numeric strings to NaN in 'Amount' columns
df_melted['Amount'] = pd.to_numeric(df_melted['Amount'], errors='coerce')

# Convert the "Source" column to strings 
df_melted["Source"] = df_melted["Source"].astype(str)

In [9]:
# Uncomment this block for dataset 2022-23
# but do not run for all other datasets

# Handle additional column ('uplift to 1/10/2022 payment for cost of living increase')
df_melted['uplift_amount'] = pd.to_numeric(df['uplift to 1/10/2022 payment for cost of living increase '], errors='coerce')

# Combine 'Amount' and 'uplift_amount' columns
df_melted['Total_Amount'] = df_melted['Amount'].fillna(0) + df_melted['uplift_amount'].fillna(0)

In [10]:
df_melted['School'] = df_melted['School'].fillna('')

# Perform case-insensitive replacements and transformations in the 'School' column of df_melted
replacements = {
    r'.*EECS.*': 'EECS',        # Replace entries containing "EECS" with "EECS"
    r'^.*Geog.*$': 'School of Geography',   # Replace entries containing "Geog" with "Geography"
    r'.*Blizard.*': 'Blizard Institute',  # Replace entries containing "Blizard" with "Blizard"
    r'.*BLIZARD.*': 'Blizard Institute',  # Replace entries containing "Blizard" with "Blizard"
    r'^.*SED.*$': 'SED' ,          # Replace entries containing "SED" with "SED"
    r'^.*IPHS.*$': 'IPHS' ,          # Replace entries containing "SED" with "SED"
    r'^.*SLLF.*$': 'SLLF' ,          
    r'^.*History.*$': 'School of History',           
    r'^.*Law.*$': 'School of Law',
}          


# Iterate over the dictionary and perform replacements
for pattern, replacement in replacements.items():
    mask = df_melted['School'].str.contains(pattern, case=True, regex=True, na=False)
    df_melted.loc[mask, 'School'] = replacement

# Remove rows where 'School' is an empty string
df_melted = df_melted[df_melted['School'] != '']


print(sorted(df_melted["School"].unique().tolist()))

['BCI', 'Blizard Institute', 'DCE_Maths', 'EECS', 'IPHS', 'IoD', 'IoD_BCI', 'SBBS', 'SBM', 'SED', 'SEF', 'SEMS', 'SLLF', 'SMS', 'SPCS', 'SPIR', 'School', 'School of Geography', 'School of History', 'School of Law', 'WHRI', 'WIPH']


In [11]:
df_melted['Source'] = df_melted['Source'].fillna('')

# Perform case-insensitive replacements and transformations in the 'Source' column of df_melted
replacements = {
    r'^Alan Turing.*': 'Other',     
    r'^AHRC.*': 'UKRI',         
    r'^BBC.*': 'Industry',  
    r'^BCSC.*': 'BCSC',  
    r'^Barts.*': 'Charities',  # Funded via Barts Charity 
    r'.*unded via Barts Charity .*': 'Charities',  # unded via Barts Charity 
    r'^BHF.*': 'Charities',  
    r'^British Heart Found.*': 'Charities',  
    r'^Blizard.*': 'Internal',  
    r'^British Skin.*': 'Charities',  
    r'^CDA supplemen.*': 'UKRI',  

    r'^DSTL.*': 'Other',  
    r'^Defence Science.*': 'Other',  

    r'.*Horizon.*': 'EU',
    r'.*H2020.*': 'EU',  
    r'.*PROLICELL .*': 'EU',  
    r'.*European Commission .*': 'EU',  

    r'.*LISS.*': 'UKRI', 

    r'.*NC3R.*': 'Other',  
    r'.*NC£R.*': 'Other',  

    r'.*NERC.*': 'UKRI',  
    r'.*NIHR.*': 'UKRI',  

    r'.*EPSRC.*': 'UKRI',
    r'.*EPRSC.*': 'UKRI', 
    #  ?  'EPRSC DTP CASE Conversion 2021', --> EPSRC but typo? yes
    
    r'.*Faculty match fund.*': 'Internal',  
    r'^Faculty.*': 'Internal',  
    r'.*match funded by HSS faculty.*': 'Internal',  

    r'^Steinberg.*': 'Other',  
    r'^WHIR.*': 'Internal',  
    r'^WHRI.*': 'Internal',  
    r'^UKRI.*': 'UKRI',  
    r'^Welcome.*': 'Charities',  
    r'.*Wellcome.*': 'Charities',  
    r'^WIPH.*': 'Internal',  
    r'^Wolfson.*': 'Internal',  


    r'.*QMUL Life Sciences Initative.*': 'Internal',  
    r'.*QMUL (IGGI).*': 'Internal',  
    r'.*QM Principal.*': 'Internal',  
    r'.*Principles.*': 'QM Principal\'s Award',
    r'QMUL Principal\'s Research Studentship': 'Internal',
    r'QMUL Principal\'s$': 'Internal',
    r'QMUL Principal\'s \+ DAME mini\-DTC$': 'Internal',

    r'^One off payment.*': 'Other',  
    r'^One-off top-up.*': 'Other',  

    r'^QMUL$': 'Internal',  
    r'^Match funded to ESRC 100% by QM$': 'Internal',  
    r'^QM match.*': 'Internal',  

# 'Versus Arthritis CiTI','Versus arthritis',

    r'.*BBSRC.*': 'UKRI',  
    # r'.*BBSRC Lido.*': 'BBSRC LiDo',  
    # r'.*BBSRC Lido.*': 'BBSRC LiDo',  
    # r'.*LiDo BBSRC.*': 'BBSRC LiDo',  
    # r'.*Lido BBSRC.*': 'BBSRC LiDo',  
    # r'.*LIDo BBSRC.*': 'BBSRC LiDo',  
    
    r'.*LiDO.*': 'UKRI', 
    r'.*LIDo.*': 'UKRI',  
    r'.*LIDo.*': 'UKRI',  
    r'.*LiDO.*': 'UKRI',  
    r'.*LiDo.*': 'UKRI', 
    r'.*LIDO.*': 'UKRI', 
    r'nan': 'n/a', 
    r'?': 'n/a', 

# nan
}          

# EPSRC
# LISS as ESRC/AHRC 
# CDA supplement AHRC -->  LAHP
# Duchess of Botany --> LAHP
# EPSRC/AHRC as a separate one 

# Iterate over the dictionary and perform replacements
for pattern, replacement in replacements.items():
    mask = df_melted['Source'].str.contains(pattern, case=True, regex=True, na=False)
    df_melted.loc[mask, 'Source'] = replacement


print(len(df_melted["Source"].unique().tolist()))
# print("\n\n")
(sorted(df_melted["Source"].unique().tolist()))

194


  mask = df_melted['Source'].str.contains(pattern, case=True, regex=True, na=False)


['50% LMK Thermosafe',
 '50% S&E matched funding',
 '50:50 SPA:ISIS',
 '?',
 'A* studentship',
 'AIM CDT - top-up',
 'ANTRUK (Antibiotic Research UK)',
 'ARSACS',
 'AWE',
 'Acutus Medical',
 'Animal Free Research UK',
 'Arthritis Research UK',
 'Artios Pharma',
 'Asthma UK Centre for Applied Research',
 'Aston Martin Formula One Team',
 'AstraZeneca',
 'Astro Brain Tumour Funds',
 'BAME studentship',
 'BCSC',
 'BLT Studentship',
 'BT',
 'Balearic Islands Doctoral Studentship in Catalan Studies (Institut Ramon Llull)',
 'Barry Reed Foundation',
 'Bela - 25%',
 'Belgian Rsch Inst, VITO',
 'Birla Carbon USA Inc',
 'Bowel Research UK',
 'Bowel and Cancer Research',
 'Brain Tumour Research',
 'Bridgestone Elastomer Research',
 'British Journal Anaesthesia/Royal College for Anaesthesia',
 'Byte Dance',
 'CCLS Cloud Project',
 'CDT',
 'COLA',
 'CRUK',
 'CRUK + CoL Centre',
 'CRUK + RadNet',
 'CRUK + RadNet + CoL Centre',
 'CRUK Accelerator',
 'Carbon Numbers Ltd and match funded',
 'Charities

In [12]:
# Group by 'Budget code' and sum the 'Amount' for each budget code
budget_totals = df_melted.groupby('Budget code')['Total_Amount'].sum().reset_index()
budget_totals['Formatted Amount'] = budget_totals['Total_Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol
budget_totals.rename(columns={'Budget code': 'Budget code', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code
# print(budget_totals)

#### --- if requested by quartal: --- ####
# Apply custom date intervals to 'Date' column
df_melted['Quartal'] = df_melted['Date'].apply(get_quartal)

# Group by 'Budget code' and 'Quartal' and sum the 'Amount' for each combination
budget_quartal_totals = df_melted.groupby(['Budget code', 'Quartal'])['Total_Amount'].sum().reset_index()

# Display the total amounts per budget code and quartal
# print(budget_quartal_totals)

# Group by 'School' and sum the 'Amount' for each School
studentship_payments_by_school = df_melted.groupby('School')['Total_Amount'].sum().reset_index()

# Format the 'Amount' column
studentship_payments_by_school['Formatted Amount'] = studentship_payments_by_school['Total_Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol

# Rename columns for better display
studentship_payments_by_school.rename(columns={'School': 'School Name', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code
# print(studentship_payments_by_school)


# Group by 'Budget code' and 'Quartal' and sum the 'Amount' for each combination
studentship_payments_by_school_quartal_totals = df_melted.groupby(['School', 'Quartal'])['Total_Amount'].sum().reset_index()

# Format the 'Amount' column
studentship_payments_by_school_quartal_totals['Formatted Amount'] = studentship_payments_by_school_quartal_totals['Total_Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol

# Rename columns for better display
studentship_payments_by_school.rename(columns={'School': 'School Name', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code and quartal
# print(studentship_payments_by_school_quartal_totals)

# Group by 'Source' and sum the 'Amount' for each School
studentship_payments_by_source = df_melted.groupby('Source')['Total_Amount'].sum().reset_index()

# Format the 'Amount' column
studentship_payments_by_source['Formatted Amount'] = studentship_payments_by_source['Total_Amount'].apply(lambda x: f'£{x:,.2f}')  # Format amount as currency with pounds (£) symbol

# Rename columns for better display
studentship_payments_by_source.rename(columns={'Source': 'Source Name', 'Formatted Amount': 'Total Amount (£)'}, inplace=True)

# Display the total amounts per budget code
studentship_payments_by_source

studentship_payments_by_source[studentship_payments_by_source['Source Name'].str.contains('Alan Turing', case=False)]


Unnamed: 0,Source Name,Total_Amount,Total Amount (£)
131,QM & Alan Turing,13475.0,"£13,475.00"
132,QM & Alan Turing.,8625.0,"£8,625.00"


In [13]:
budget_totals

Unnamed: 0,Budget code,Total_Amount,Total Amount (£)
0,ART4020B,10309.750000,"£10,309.75"
1,ART4020B,344086.730000,"£344,086.73"
2,ART4040B,20069.500000,"£20,069.50"
3,ASTF1A9R,0.000000,£0.00
4,ASTF1B3R,14751.000000,"£14,751.00"
...,...,...,...
553,TMTL1D5R,24142.000000,"£24,142.00"
554,TMTP1A4R,6020.666667,"£6,020.67"
555,TMTW1A2R,3200.000000,"£3,200.00"
556,TMTY1A3R,20069.500000,"£20,069.50"


In [14]:
# studentship_payments_by_source[studentship_payments_by_source['Source Name'] == "LIDo (BBSRC)"]
studentship_payments_by_source

Unnamed: 0,Source Name,Total_Amount,Total Amount (£)
0,50% LMK Thermosafe,9031.000000,"£9,031.00"
1,50% S&E matched funding,11038.500000,"£11,038.50"
2,50:50 SPA:ISIS,3010.333333,"£3,010.33"
3,?,25600.000000,"£25,600.00"
4,A* studentship,0.000000,£0.00
...,...,...,...
189,Versus arthritis,11930.000000,"£11,930.00"
190,Zhoukou Tianjiukang Pharmaceutical Co. Ltd.,3750.000000,"£3,750.00"
191,iCASE,2760.000000,"£2,760.00"
192,,144249.756667,"£144,249.76"


In [15]:
studentship_payments_by_school

Unnamed: 0,School Name,Total_Amount,Total Amount (£)
0,BCI,1437206.0,"£1,437,205.62"
1,Blizard Institute,1144177.0,"£1,144,176.92"
2,DCE_Maths,11626.25,"£11,626.25"
3,EECS,2928362.0,"£2,928,361.71"
4,IPHS,14973.0,"£14,973.00"
5,IoD,49784.8,"£49,784.80"
6,IoD_BCI,20507.5,"£20,507.50"
7,SBBS,1320697.0,"£1,320,697.31"
8,SBM,682162.2,"£682,162.25"
9,SED,479718.2,"£479,718.25"


In [16]:
studentship_payments_by_school_quartal_totals

Unnamed: 0,School,Quartal,Total_Amount,Formatted Amount
0,BCI,Q1,374000.533333,"£374,000.53"
1,BCI,Q2,336123.750000,"£336,123.75"
2,BCI,Q3,330234.000000,"£330,234.00"
3,BCI,Q4,396847.333333,"£396,847.33"
4,Blizard Institute,Q1,291210.881900,"£291,210.88"
...,...,...,...,...
83,WHRI,Q4,304288.947500,"£304,288.95"
84,WIPH,Q1,266884.265000,"£266,884.27"
85,WIPH,Q2,268415.407527,"£268,415.41"
86,WIPH,Q3,256744.450000,"£256,744.45"
