In [439]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pypdf import PdfReader
from tabula import read_pdf
from tabulate import tabulate

from tqdm import tqdm
import os 

from multielo import MultiElo, Player, Tracker

In [440]:
def sanitize(i):
    i = i.replace('\r', '')
    i = i.replace('- ', '-')
    i = i.replace('? ', '?')
    i = i.replace('( ', '(')
    i = i.replace(' )', ')')
    i = i.replace(' ̄', '-')
    i = i.replace('¯', '-')
    i = i.replace(chr(772), '-')
    i = i.replace(chr(64257), 'fi')
    i = i.replace(" ", "")
    return i

In [441]:
def read_tabs(pdfs):
    tab_dict_list = []
    for pdf in tqdm(pdfs):
        reader = PdfReader(pdf)
        team_names = []
        number_of_pages = len(reader.pages)
        for i in np.arange(number_of_pages):
            page = reader.pages[int(i)]
            text = page.extract_text()
            team_names+=[x[1:x.find('\n')] for x in text.split("Team:")[1:]]
            #print(team_names)
            team_names = [sanitize(x) for x in team_names]
            #print(team_names)
            #reads table from pdf file
        df = read_pdf(pdf, pages="all", lattice=True) #address| of pdf file
        tab_dict = dict(zip(team_names, df))
        new_keys=[]
        for i in list(tab_dict.keys()):
            if 'R G/O' in i:
                new_keys.append(i[0:i.find('R G/O')-1])
            elif 'RG/OW/L' in i:
                new_keys.append(i[0:i.find('RG/OW/L')])
            else:
                new_keys.append(i)
        tab_dict = dict(zip(new_keys, list(tab_dict.values())))
        tab_dict_list.append(tab_dict)
    return tab_dict_list

In [442]:
def read_tab(pdf):
    if isinstance(pdf, list):
        return read_tabs(pdf)
    reader = PdfReader(pdf)
    team_names = []
    number_of_pages = len(reader.pages)
    for i in np.arange(number_of_pages):
        page = reader.pages[int(i)]
        text = page.extract_text()
        team_names+=[x[1:x.find('\n')] for x in text.split("Team:")[1:]]
        #print(team_names)
        team_names = [sanitize(x) for x in team_names]
        #print(team_names)
        #reads table from pdf file
    df = read_pdf(pdf, pages="all", lattice=True) #address| of pdf file
    tab_dict = dict(zip(team_names, df))
    new_keys = []
    for i in list(tab_dict.keys()):
        if 'R G/O' in i:
            new_keys.append(sanitize(i[0:i.find('R G/O')-1]))
        elif 'RG/OW/L' in i:
            new_keys.append(sanitize(i[0:i.find('RG/OW/L')-1]))
        else:
            new_keys.append(sanitize(i))
    tab_dict = dict(zip(new_keys, list(tab_dict.values())))
            
    return tab_dict

In [446]:
def get_rates(td):
    team_names_dic = {}
    name_ranks_dic = {}
    n_rounds = 0
    for i in list(td.keys()):
        test = td[i]
        #print(test)
        test.dropna(axis=0, how='all', inplace=True)
        test = test[test.columns[~test.columns.str.contains('Unnamed')]]
        if len(test)==0:
            continue
        p1 = test.columns[5]
        p1_name = p1[:-4]
        p1_age = p1[-2:-1]
        p1_name = p1_name.replace('\r', ' ')
        p1_first = p1_name.split(" ")[0]
        p1_second = ''.join(p1_name.split(" ")[1:])
        p1_ranks = [eval(x)[1] for x in list(test[p1].fillna('(-1,99)'))[:-1]]

        p2 = test.columns[6]
        p2_name = p2[:-4]
        p2_age = p2[-2:-1]
        p2_name = p2_name.replace('\r', ' ')
        p2_first = p2_name.split(" ")[0]
        p2_second = ''.join(p2_name.split(" ")[1:])

        p2_ranks = [eval(x)[1] for x in list(test[p2].fillna('(-1,99)'))[:-1]]
        i = sanitize(i)
        team_names_dic[i] = (p1_first+' '+p1_second, p2_first+' '+p2_second)
        name_ranks_dic[p1_first+' '+p1_second] = p1_ranks
        name_ranks_dic[p2_first+' '+p2_second] = p2_ranks
    n_rounds = np.median([len(x) for x in list(name_ranks_dic.values())])

    rounds = []
    for i in list(td.keys()):
        #print(i)
        test = td[i]
        test.dropna(axis=0, how='all', inplace=True)
        #print(test)
        if len(test)==0:
            continue
        oppos = test[test.columns[3]].dropna()[:-1]
        test = test[test.columns[~test.columns.str.contains('Unnamed')]]
        if len(oppos)==n_rounds:
            #print(oppos)
            #print(oppos)
            oppos = [sanitize(x) for x in oppos]
            unq_rounds = [(i,x) for x in oppos if (x,i) not in rounds]
            rounds+= unq_rounds
    
    ranks_df = pd.DataFrame(columns=['date', '1st', '2nd', '3rd', '4th'])
    date_list = []
    rank_names = []
    date_dic = dict(zip(np.unique(rounds), [0]*len(np.unique(rounds))))
    bad_count = 0
    #print(rounds)
    for r in rounds:
        #print(repr(r))
        date_dic[r[0]]+=1
        date_dic[r[1]]+=1
        rd = max(date_dic[r[0]], date_dic[r[1]])
        date_list.append(rd)
        #print(r[1])
        debaters = np.append(team_names_dic[r[0]], team_names_dic[r[1]])
        #print(debaters)
        #print(rd)
        #print([name_ranks_dic[x] for x in debaters])
#         print(rd)
#         print([name_ranks_dic[x] for x in debaters])
#         print(r[0])
#         print(r[1])
        ranks = [name_ranks_dic[x][rd-1] for x in debaters]
        deb_in_rank = [x for _, x in sorted(zip(ranks, debaters))]

        rank_names.append(deb_in_rank)
    ranks_df.date=date_list
    ranks_df[ranks_df.columns[1:]]=rank_names
    return ranks_df

In [444]:

pdfs = ['./all_tabs/to_use/'+x for x in np.sort(os.listdir('./all_tabs/to_use/'))[1:]]

In [456]:
td = read_tab(list(pdfs))

 10%|████▍                                       | 1/10 [00:00<00:06,  1.30it/s]Got stderr: Mar 07, 2023 10:42:04 AM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode

 90%|███████████████████████████████████████▌    | 9/10 [00:10<00:01,  1.33s/it]Got stderr: Mar 07, 2023 10:42:14 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Mar 07, 2023 10:42:14 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4

100%|███████████████████████████████████████████| 10/10 [00:11<00:00,  1.14s/it]


In [447]:
rates_l = []
for n,i in enumerate(tqdm(td)):
#     rates_l.append(get_rates(i))
    try:
        rates_l.append(get_rates(i))
    except:
        print(n)

 30%|█████████████▏                              | 3/10 [00:00<00:00, 26.85it/s]

1
4


100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 12.12it/s]

8





In [458]:
get_rates(td[4])

ValueError: Columns must be same length as key

In [448]:
for n,i in enumerate(rates_l):
    i.date=[str(n)+"_"+str(x) for x in i.date]

In [449]:
all_rates = pd.concat(rates_l)
all_rates.replace('Griffin Badlamente', 'Griffin Badalamente', inplace=True)

In [450]:
tracker = Tracker()
tracker.process_data(all_rates)

In [451]:
rates = tracker.get_current_ratings()

In [454]:
rates

Unnamed: 0,rank,player_id,n_games,rating
0,1,Anish Welde,10,1169.003912
1,2,Parker Kelly,10,1161.901373
2,3,Nicole Kagan,15,1141.349468
3,4,Austin Goldsmith-Lachut,10,1136.452953
4,5,Paloma O'Connor,15,1132.054286
...,...,...,...,...
335,336,Sid Gupta,5,891.184737
336,337,Joe Cox,15,882.135295
337,338,Mehul Agarwal,10,881.409543
338,339,Michel Nehme,15,878.638407


In [452]:
rates[rates.player_id.str.contains("Gri")]

Unnamed: 0,rank,player_id,n_games,rating
120,121,Griffin Badalamente,5,1020.529098


In [453]:
td = read_tab('./all_tabs/brandeis tab cards.pdf')


In [254]:
team_names_dic = {}
name_ranks_dic = {}
n_rounds = 0
tname = 'old'
for i in list(td.keys()):
    test = td[i]
    if 'old' in tname:
        test.columns = list(test.columns[:3])+list(test.iloc[0])[3:]
        if all(pd.Series(list(test[test.columns[6]])[1:6]).isnull()):
            continue
        test.columns=[test.columns[x] for x in [0,4,4,3,4,4,1,2,7]]
        test.drop(0,inplace=True)
        test.dropna(axis=1, how='all', inplace=True)
        print(i)
        i = i[0:i.find('(')]
    else:
        test = test[test.columns[~test.columns.str.contains('Unnamed')]]
    p1 = test.columns[5]
    p1_name = p1[:-4]
    p1_age = p1[-2:-1]
    p1_name = p1_name.replace('\r', ' ')
    p1_first = p1_name.split(" ")[0]
    p1_second = ''.join(p1_name.split(" ")[1:])
    p1_ranks = [eval(x)[1] for x in list(test[p1].fillna('(-1,99)'))[:-1]]

    p2 = test.columns[6]
    p2_name = p2[:-4]
    p2_age = p2[-2:-1]
    p2_name = p2_name.replace('\r', ' ')
    p2_first = p2_name.split(" ")[0]
    p2_second = ''.join(p2_name.split(" ")[1:])

    p2_ranks = [eval(x)[1] for x in list(test[p2].fillna('(-1,99)'))[:-1]]
    i = sanitize(i)
    team_names_dic[i] = (p1_first+' '+p1_second, p2_first+' '+p2_second)
    name_ranks_dic[p1_first+' '+p1_second] = p1_ranks
    name_ranks_dic[p2_first+' '+p2_second] = p2_ranks
n_rounds = np.median([len(x) for x in list(name_ranks_dic.values())])

Brown Space Grails (Brown) Ann
Y ale SB (Y ale)Shreeya


In [257]:
'abc'.find('b')

1

In [256]:
td.keys()

dict_keys(['Brown Space Grails (Brown) Ann', 'Brown LP (Brown)Anagha Lokhande', 'Brown Holy Grail (Brown)Elaine He', 'Brown Unholy Grail (Brown)Michael', 'Y ale PF (Y ale)Lorenzo', 'Y ale KS (Y ale)Jack', 'Y ale SS (Y ale) Arun', 'Y ale WM (Y ale)Michael', 'Y ale SB (Y ale)Shreeya', 'Y ale SA (Y ale)Xavier', 'Harvard Finance Bros (Harvard) Tejal', 'Harvard BBC (Harvard)Aditya', 'Harvard LO (Harvard) Romina', 'BU Problematic Memez (BU) Ian', 'NU/Deis Angriest Orchard (NU)Dennis Su', 'Greendale A (Brandeis)Roy Lee', 'Mr . PB and Princess Carolyn (Brandeis)Noam', 'Brandeis Sex Whistles (Brandeis)Sagie', 'BSD (Brandeis)Jackson', 'Atletico Brandeis (Brandeis)Arman', 'Brandeis Jolkien RR T olkien (Brandeis) Shira', 'Brandeis Snu Snu Gang (Brandeis) Ryan', 'Brandeis QD (Brandeis)Winnie Qin'])

In [248]:
rounds = []
for i in list(td.keys()):
    #print(i)
    test = td[i]
    oppos = test[test.columns[3]].dropna()[:-1]
    #test = test[test.columns[~test.columns.str.contains('Unnamed')]]
    if len(oppos)==n_rounds:
        #print(oppos)
        #print(oppos)
        oppos = [sanitize(x) for x in oppos]
        unq_rounds = [(i,x) for x in oppos if (x,i) not in rounds]
        rounds+= unq_rounds

In [235]:
ranks_df = pd.DataFrame(columns=['date', '1st', '2nd', '3rd', '4th'])
date_list = []
rank_names = []
date_dic = dict(zip(np.unique(rounds), [0]*len(np.unique(rounds))))
bad_count = 0
for r in rounds:
    #print(repr(r))
    date_dic[r[0]]+=1
    date_dic[r[1]]+=1
    rd = max(date_dic[r[0]], date_dic[r[1]])
    date_list.append(rd)
    #print(r[1])
    debaters = np.append(team_names_dic[r[0]], team_names_dic[r[1]])
#     try:
#         debaters = np.append(team_names_dic[r[0]], team_names_dic[r[1]])
#     except:
#         try:
#             team_names_dic[r[0]]
#         except:
#             team_names_dic[r[0]] = ('temp_a'+str(bad_count), 'temp_b'+str(bad_count))
#             name_ranks_dic['temp_a'+str(bad_count)]=[99]*int(n_rounds)
#             name_ranks_dic['temp_b'+str(bad_count)]=[99]*int(n_rounds)
#             bad_count+=1
#         try:
#             team_names_dic[r[1]]
#         except:
#             team_names_dic[r[1]] = ('temp_a'+str(bad_count), 'temp_b'+str(bad_count))
#             name_ranks_dic['temp_a'+str(bad_count)]=[99]*int(n_rounds)
#             name_ranks_dic['temp_b'+str(bad_count)]=[99]*int(n_rounds)
#             bad_count+=1
#         debaters = np.append(team_names_dic[r[0]], team_names_dic[r[1]])
    #print(debaters)
    ranks = [name_ranks_dic[x][rd-1] for x in debaters]
    deb_in_rank = [x for _, x in sorted(zip(ranks, debaters))]
    rank_names.append(deb_in_rank)
ranks_df.date=date_list
ranks_df[ranks_df.columns[1:]]=rank_names

KeyError: 'Harvard Finance Bros'

In [180]:
tracker = Tracker()
tracker.process_data(ranks_df)

In [181]:
rates = tracker.get_current_ratings()

In [186]:
rates

Unnamed: 0,rank,player_id,n_games,rating
0,1,Catherine Wang,5,1100.477769
1,2,An LanhLe,5,1091.547856
2,3,Ethan Liu,5,1091.214851
3,4,Anish Welde,5,1090.764300
4,5,Albi Manfredi,5,1089.358917
...,...,...,...,...
127,128,Zoe Rose,5,910.254572
128,129,Julia Cunningham,5,910.237936
129,130,Phuong AnhNguyenLe(Kem),5,908.684884
130,131,Jacob Sher,5,901.862093
