In [1]:
# Import necessary libraries
import pandas as pd
import seaborn as sb
import re

In [2]:
# Read csv into dataframe
df = pd.read_csv('tickets.csv')
names = pd.read_csv('names.csv')
names = names['Full Name'].str.upper().to_frame()
names = names.rename(columns={"Full Name" : "name"})
display(names)

Unnamed: 0,name
0,JORGEN LAW
1,CHONG YI AN
2,PERI ADHITYAN SIVALINGAM
3,JEREMY LEONG
4,HONG JUN HENG CLARENCE
...,...
68,LEONG JUN SENG
69,TENG ZI YI
70,LIANG ZHEN YANG
71,MOK JING YI


In [3]:
# Retrieve and rename useful columns
# Replace "Which of our Members..." with the column name of the column that indicates the member's name
memberQuestion = "Which of our Members(NTUSB) did you buy the ticket from? (Put NIL if inapplicable)"
data = df[["Quantity", memberQuestion]]
data = data.rename({memberQuestion: 'member'}, axis=1)
data['member'] = data['member'].str.upper().to_frame()
print("Total number of tickets sold =", data['Quantity'].sum())
display(data)

Total number of tickets sold = 279


Unnamed: 0,Quantity,member
0,1,ALICIA
1,1,ALICIA
2,1,RYU
3,1,NIL
4,1,NIL
...,...,...
274,1,HARRISON
275,1,HARRISON
276,1,MOK JING YI
277,1,MOK JING YI


In [17]:
# Clean up data obtained from Eventbrite
# Warning: run this cell once only!
# To run this cell again, run the previous cell then this one

# Update rows with Matthew's name because why does his name have a fucking comma
data.loc[data['member'] == 'JOSE, MATTHEW ABRAM MENDOZA', 'member'] = 'MATTHEW'

# Split multiple member names into separate rows
data['member'] = data['member'].str.split(r"AND|,|/", expand=False).to_frame()
data['Quantity'] = data['Quantity'] / data['member'].str.len()
data = data.explode('member')
# data['member'] = data['member'].str.strip()

# Update that one entry that has parenthesis wtf
data['member'] = data['member'].str.replace(r"\(.*\)", "")

# Remove <3 and :) because no love for you
data['member'] = data['member'].str.replace("<|3|\(|\)|:", "")

# Update that one entry that misspelt Sue Jean's name
data.loc[data['member'] == 'BLUE JEAN', 'member'] = 'SUE JEAN'

# Update that one entry that put Carol's surname wrong (boo to that friend)
data.loc[data['member'] == 'CAROL LIN', 'member'] = 'CAROL'

# Update that one entry that clarified which Carol they're buying from, thank you but there is only one Carol
data['member'] = data['member'].str.replace("FROM PERCUSSION", "")

# Update that one entry that has Malcolm's middle (?) name, so extra
data.loc[data['member'] == 'MALCOLM PIUS TAN', 'member'] = 'MALCOLM TAN JINHUI'

# Update all entries for Jimmy because Jimmy is not actually in his name
data.loc[data['member'] == 'JIMMY ANG', 'member'] = 'JIMMY'

# Update that one entry of Brenda so it doesn't count into Brendan
data.loc[data['member'] == 'BRENDA', 'member'] = 'BRENDA POH'

# Update any entry that puts Ynez' surname last
data['member'] = data['member'].str.replace("HOOI", "")

# Remove all spaces
data['member'] = data['member'].str.replace(' ', '')

# Display cleaned up data
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None):
    display(data)

print(data['Quantity'].sum())

Unnamed: 0,Quantity,member
0,1.0,ALICIA
1,1.0,ALICIA
2,1.0,RYU
3,1.0,NIL
4,1.0,NIL
5,1.0,LOWTHIENSHAW
6,1.0,YUSHUANG
7,1.0,FADILAH
8,1.0,HENGHONGHWEE
9,1.0,CAROL


279.0


In [18]:
# Remove spaces from both data frames
# This helps in string comparison in order to do the total sum later on
names['name'] = names['name'].str.replace(' |,|\(|\)', '')

In [19]:
# Get total tickets sold by each member based on entry of their names
rawTicketCount = data.groupby('member')['Quantity'].sum().reset_index()
rawTicketCount = rawTicketCount.sort_values(by=['Quantity'], ascending=False)

# Display all rows of ticketCount
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3):
    display(rawTicketCount)

# Check that total number of tickets sold corresponds to what we know
print(rawTicketCount['Quantity'].sum())

Unnamed: 0,member,Quantity
42,NIL,35.0
0,ALICIA,11.333
32,LINGYI,10.5
22,JORGENLAW,10.0
58,XAVIERYU,9.0
60,YUSHUANG,9.0
6,CHARLOTTECHIA,8.5
8,CHARMAINEYAOW,8.5
44,RIYADH,8.0
14,HENGHONGHWEE,8.0


278.99999999999994


In [20]:
# Create temporary column for Cartesian join
names['join'] = 1
rawTicketCount['join'] = 1

# Do a Cartesian join with names and data to consolidate tickets by name
merged = names.merge(rawTicketCount, on='join').drop('join', axis=1)
rawTicketCount.drop('join', axis=1, inplace=True)

# Compare substrings and merge both names and rawTicketCount together
merged['match'] = merged.apply(lambda x: x['name'].find(x['member']), axis=1).ge(0)

# Filter results
filtered = merged.groupby(['name', 'Quantity']).max().reset_index()[['name', 'Quantity', 'match']]
filtered = filtered[filtered['match']][['name', 'Quantity']]

# Do a groupby to get total tickets sold by each member
ticketCount = filtered.groupby('name')['Quantity'].sum().reset_index()

# Clean up ticketCount to remove unnecessary characters
ticketCount['name'] = ticketCount['name'].str.replace('b|\'', '')
ticketCount = ticketCount.sort_values(by=['Quantity'], ascending=False)

# Display all rows of ticketCount
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3):
    display(ticketCount)

# Check that total number of tickets sold corresponds to sum we know
print(ticketCount['Quantity'].sum())

Unnamed: 0,name,Quantity
0,ALICIATHE,17.333
7,GOHLINGYI,14.5
37,WONGYUSHUANG,14.0
3,CHARMAINEYAOWYUSHANE,13.5
13,JORGENLAW,13.0
19,LEESUEJEAN,12.333
38,XAVIERYUZHENGWEI,10.5
23,MALCOLMTANJINHUI,9.5
27,RIYADHAL-MUTTAQIN,8.0
10,HENGHONGHWEE,8.0


213.5
