## Download of gamelogs from pokerbots scrimmage server

### Initialize: Open a new browser instance

In [1]:
import os
import tqdm

import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By

# Start a fresh Chrome session and open the scrimmage login page. The
# certificate prompt in the new window has to be answered by hand.
LOGIN_URL = "https://scrimmage.pokerbots.org/login"

driver = webdriver.Chrome()
driver.get(LOGIN_URL)

After the browser instance opens:
1. Manually select a certificate to log in (required every time a new browser instance is opened)
2. Add accessibility permissions (only needed once)

### Task: Download log from range of pages

In [None]:
PAGE_MIN, PAGE_MAX = 1, 64
OVERWRITE = False
OUTPUTDIR = './scrimmage_data'

# The games table has 12 <td> cells per row; columns 10 and 11 hold the
# "Log" links for the game log and the player log respectively.
N_COLUMNS = 12
LOG_TYPE_BY_COLUMN = {10: 'game_log', 11: 'player_log'}

# Slice applied to driver.page_source to drop the markup around the log text.
# 84 and 20 are the lengths of
#   '<html><head></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">'
# and '</pre></body></html>' -- presumably the wrapper Chrome puts around a
# plain-text response (TODO confirm against the Chrome version in use).
WRAPPER_PREFIX_LEN, WRAPPER_SUFFIX_LEN = 84, 20

# open(..., 'w') does not create missing directories.
os.makedirs(OUTPUTDIR, exist_ok=True)

for p in tqdm.tqdm(range(PAGE_MIN, PAGE_MAX + 1)):

    driver.get(f"https://scrimmage.pokerbots.org/team/games?page={p}")
    cell_data = driver.find_elements(By.TAG_NAME, "td")
    assert len(cell_data) % 12 == 0, f'expected 12 columns including Logs, but there are {len(cell_data)} cells, which is not divisible by 12'

    # Pass 1: read every needed link while the games page is still loaded.
    # Element handles stop being usable once the browser navigates away, so
    # only plain strings (href, target file name) are carried into pass 2.
    pending = []
    for i, cell in enumerate(cell_data):
        type_str = LOG_TYPE_BY_COLUMN.get(i % N_COLUMNS)
        if type_str is None:
            continue  # not a log column

        links = cell.find_elements(By.LINK_TEXT, "Log")
        href = links[0].get_attribute('href') if links else None
        if not href:
            continue  # this game has no log of that type

        gameid = href.split('/')[-2]
        savefilename = f'{OUTPUTDIR}/{type_str}_{gameid}.txt'

        # Skip logs that are already on disk unless overwriting was requested,
        # so no page load is spent on them.
        if OVERWRITE or not os.path.exists(savefilename):
            pending.append((href, savefilename))

    # Pass 2: fetch each missing log and write it out.
    for href, savefilename in pending:
        try:
            driver.get(href)
            log_text = driver.page_source[WRAPPER_PREFIX_LEN:-WRAPPER_SUFFIX_LEN]
        except Exception as exc:
            # Best effort: report the failure and move on to the next log
            # (KeyboardInterrupt is not an Exception, so Ctrl-C still stops the run).
            tqdm.tqdm.write(f'page {p}: could not download {href}: {exc!r}')
            continue

        # Write only after the download succeeded, so a failed fetch never
        # leaves an empty file that a later run would treat as "already done".
        with open(savefilename, 'w', encoding='utf-8') as f:
            f.write(log_text)

  3%|█▍                                          | 2/64 [01:39<50:38, 49.01s/it]

### Task: Download log from one page

In [192]:
PAGE = 1  # page of the games list to open
# Index into the "Log" links found on that page. Per the original note, even
# indices are Game Logs and odd indices are Player Logs (presumably this only
# holds while every row has both links -- verify before relying on it).
LOGN = 0

driver.get(f"https://scrimmage.pokerbots.org/team/games?page={PAGE}")
test = driver.find_elements(By.LINK_TEXT, "Log")

# The game id is the second-to-last path segment of the link target.
chosen_link = test[LOGN]
game_id = chosen_link.get_attribute('href').split('/')[-2]
chosen_link.click()

# Saving the opened log is left disabled.
# NOTE(review): `outputdir` (lowercase) is not defined in the visible cells.
# f = open(f'{outputdir}/scrimmage_data/game_log_{game_id}.txt','w')
# f.write(driver.page_source[84:-20])
# f.close()

### Task: Generate Metadata Table with GameID lookup Key

In [342]:
PAGE_MIN, PAGE_MAX = 1, 52
# NOTE(review): absolute, machine-specific path; the scraping loop below does not use it.
OUTPUTDIR = '/Users/hblim/Dropbox (MIT)/Pokerbots 2022/scrimmage_data'

# 'page' is filled in by this cell; the remaining 12 names map one-to-one onto
# the 12 <td> cells of each table row (cell c -> columns[c + 1]).
columns = ['page','Challenger','C_ELO','Opponent','O_ELO','Creation_Time','Your_bot','Status','C_Score','O_Score','Winner','Game Log','Player Log']
table_data = {h: [] for h in columns}

N_COLUMNS = 12
LOG_COLUMNS = {10, 11}  # cells that hold a "Log" link instead of text

for p in tqdm.tqdm(range(PAGE_MIN, PAGE_MAX + 1)):

    driver.get(f"https://scrimmage.pokerbots.org/team/games?page={p}")
    cell_data = driver.find_elements(By.TAG_NAME, "td")
    assert len(cell_data) % 12 == 0, f'expected 12 columns including Logs, but there are {len(cell_data)} cells, which is not divisible by 12'

    for i, cell in enumerate(cell_data):

        c = i % N_COLUMNS  # column index within the row

        # First cell of a row: record which page the row came from.
        if c == 0:
            table_data['page'].append(p)

        if c in LOG_COLUMNS:
            # Store the game id taken from the link target (second-to-last
            # path segment), or None when the cell has no "Log" link. Only
            # the missing-link case maps to None; any other browser error
            # propagates instead of silently becoming a missing value.
            links = cell.find_elements(By.LINK_TEXT, "Log")
            href = links[0].get_attribute('href') if links else None
            value = href.split('/')[-2] if href else None
        else:
            value = cell.text

        table_data[columns[c + 1]].append(value)

100%|████████████████████████████████████████████████████████████████████████████████| 52/52 [02:38<00:00,  3.05s/it]


In [447]:
# Turn the scraped column lists into a table, one column per header.
metadata = pd.DataFrame(data=table_data)

Check for bad values.
(1) Use the Game Log id as the game id; where the Player Log is absent, set it to 0

In [448]:
# Number of missing ids in each log column, as a (Game Log, Player Log) pair.
tuple(metadata[col].isnull().sum() for col in ('Game Log', 'Player Log'))

(0, 10)

In [449]:
# The Game Log id doubles as the lookup key for a game.
metadata['game_id'] = metadata['Game Log']

# Rows without a Player Log link get the placeholder id 0. A single .loc call
# on the frame is used so the write is guaranteed to land in `metadata`;
# assigning through metadata['Player Log'].loc[...] may only update a
# temporary copy.
metadata.loc[metadata['Player Log'].isnull(), 'Player Log'] = 0

(2) Fix Missing Score

In [450]:
# Number of scores scraped as the literal text 'None', as a (C_Score, O_Score) pair.
tuple((metadata[col] == 'None').sum() for col in ('C_Score', 'O_Score'))

(10, 0)

In [451]:
# Replace the literal text 'None' in C_Score with 0. A single .loc call on the
# frame is used so the write is guaranteed to land in `metadata`; assigning
# through metadata['C_Score'].loc[...] may only update a temporary copy.
metadata.loc[metadata['C_Score'] == 'None', 'C_Score'] = 0

Cast columns to their dtypes

In [453]:
# Target dtype for each column of `metadata`, in column order: the 13 scraped
# columns listed in `columns`, followed by the derived 'game_id' column.
dtypes = ['int','string','float','string','float','datetime64[ns]','string','string','int','int','string','int','int','int']

# `columns` does not include 'game_id', so pair the dtypes with the full
# column list; otherwise the final dtype would never be applied.
typed_columns = columns + ['game_id']
assert len(typed_columns) == len(dtypes), 'every column needs exactly one dtype'

metadata = metadata.astype(dict(zip(typed_columns, dtypes)))

In [482]:
# Directory the metadata pickle is read from in the next cell.
# NOTE(review): absolute, machine-specific path.
OUTPUTDIR = '/Users/hblim/Dropbox (MIT)/Pokerbots 2022/scrimmage_data'
# Saving is left disabled; un-comment to write `metadata` to metadata.pkl in OUTPUTDIR.
# metadata.to_pickle(f'{OUTPUTDIR}/metadata.pkl')

In [486]:
# Read the saved table back from disk and preview its first rows.
# NOTE: unpickling can execute code stored in the file -- only load pickles you created.
metadata_path = f'{OUTPUTDIR}/metadata.pkl'
pd.read_pickle(metadata_path).head()

Unnamed: 0,page,Challenger,C_ELO,Opponent,O_ELO,Creation_Time,Your_bot,Status,C_Score,O_Score,Winner,Game Log,Player Log,game_id
0,1,Sweaters & Jetters,1566.0,TwoPocketAces,1673.0,2022-01-18 23:15:39.973619,bugfixed nongreedy CFR,Completed,-661,661,TwoPocketAces,21709,21709,21709
1,1,Qian & Li,1437.0,TwoPocketAces,1706.0,2022-01-18 22:41:56.901903,bugfixed nongreedy CFR,Completed,602,-602,Qian & Li,21698,21698,21698
2,1,Qian & Li,1445.0,TwoPocketAces,1698.0,2022-01-18 22:41:52.292646,bugfixed nongreedy CFR,Completed,-405,405,TwoPocketAces,21697,21697,21697
3,1,Qian & Li,1453.0,TwoPocketAces,1690.0,2022-01-18 22:33:45.431833,bugfixed nongreedy CFR,Completed,-2521,2521,TwoPocketAces,21696,21696,21696
4,1,NotAFlop,1681.0,TwoPocketAces,1712.0,2022-01-18 21:26:36.402023,bugfixed nongreedy CFR,Completed,1371,-1371,NotAFlop,21667,21667,21667


In [484]:
# NOTE(review): `test2` is not defined in any visible cell, so this cell raises
# NameError on a fresh kernel -- presumably left over from a deleted cell.
test2

Unnamed: 0,page,Challenger,C_ELO,Opponent,O_ELO,Creation_Time,Your_bot,Status,C_Score,O_Score,Winner,Game Log,Player Log,game_id
0,1,Sweaters & Jetters,1566.0,TwoPocketAces,1673.0,2022-01-18 23:15:39.973619,bugfixed nongreedy CFR,Completed,-661,661,TwoPocketAces,21709,21709,21709
1,1,Qian & Li,1437.0,TwoPocketAces,1706.0,2022-01-18 22:41:56.901903,bugfixed nongreedy CFR,Completed,602,-602,Qian & Li,21698,21698,21698
2,1,Qian & Li,1445.0,TwoPocketAces,1698.0,2022-01-18 22:41:52.292646,bugfixed nongreedy CFR,Completed,-405,405,TwoPocketAces,21697,21697,21697
3,1,Qian & Li,1453.0,TwoPocketAces,1690.0,2022-01-18 22:33:45.431833,bugfixed nongreedy CFR,Completed,-2521,2521,TwoPocketAces,21696,21696,21696
4,1,NotAFlop,1681.0,TwoPocketAces,1712.0,2022-01-18 21:26:36.402023,bugfixed nongreedy CFR,Completed,1371,-1371,NotAFlop,21667,21667,21667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029,52,TwoPocketAces,1518.0,Grand Central,1754.0,2022-01-07 17:51:30.747386,test CFR,Completed,-458,458,Grand Central,2156,2156,2156
1030,52,TwoPocketAces,1526.0,NaruhodoRyuichi,1764.0,2022-01-07 17:51:29.805720,test CFR,Completed,-136,136,NaruhodoRyuichi,2155,2155,2155
1031,52,TwoPocketAces,1533.0,6A6,1814.0,2022-01-07 17:51:10.145965,test CFR,Completed,-733,733,6A6,2154,2154,2154
1032,52,TwoPocketAces,1518.0,lecture-2-reference-bot (staff bot),1418.0,2022-01-07 17:49:52.916379,test CFR,Completed,39,-39,TwoPocketAces,2153,2153,2153
