## Download of gamelogs from pokerbots scrimmage server

### Initialize: Open a new browser instance

In [13]:
import os
import tqdm

import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://scrimmage.pokerbots.org/login")

After browser instance opens
1. Manually select a certificate to login (must do every time a new browser instance is opened)
2. Add accessibility permissions (only need to do once)

### Task: Download log from range of pages

In [14]:
PAGE_MIN, PAGE_MAX = 1, 64
OVERWRITE = False
OUTPUTDIR = './scrimmage_data'

for p in tqdm.tqdm(range(PAGE_MIN,PAGE_MAX+1)):
            
    driver.get(f"https://scrimmage.pokerbots.org/team/games?page={p}")
    cell_data = driver.find_elements(By.TAG_NAME, "td") 
    assert len(cell_data) % 12 == 0, f'expected 12 columns including Logs, but there are {len(cell_data)} cells, which is not divisible by 12'
    
    Nrows = len(cell_data) // 12
    
    for i in range(len(cell_data)):
        cell = driver.find_elements(By.TAG_NAME, "td")[i]

        c = i % 12 # get column index
        
        if c < 10:
            continue
            
        type_str = None
        if c == 10:
            type_str = 'game_log'
        elif c == 11:
            type_str = 'player_log'            

        try:
            href = cell.find_elements(By.LINK_TEXT, "Log")[0].get_attribute('href')
            gameid = href.split('/')[-2]
            savefilename = f'{OUTPUTDIR}/{type_str}_{gameid}.txt'        
            
            if os.path.exists(savefilename) == False or OVERWRITE == True:            
                driver.get(href)           
                f = open(savefilename,'w')
                f.write(driver.page_source[84:-20])
                f.close()                    
                driver.back()                
                
        except:
            continue

100%|███████████████████████████████████████████| 64/64 [03:04<00:00,  2.88s/it]


### Task: Download log from one page

In [192]:
PAGE = 1 # select page number
# Game Logs correspond to i % 2 == 0, Player Logs corresponds to i % 2 == 1
LOGN = 0 # log number on page; 

driver.get(f"https://scrimmage.pokerbots.org/team/games?page={PAGE}")
test = driver.find_elements(By.LINK_TEXT, "Log")
game_id = test[LOGN].get_attribute('href').split('/')[-2]
test[LOGN].click()

# f = open(f'{outputdir}/scrimmage_data/game_log_{game_id}.txt','w')
# f.write(driver.page_source[84:-20])
# f.close()

### Task: Generate Metadata Table with GameID lookup Key

In [15]:
PAGE_MIN, PAGE_MAX = 1, 64
OUTPUTDIR = './scrimmage_data'

columns = ['page','Challenger','C_ELO','Opponent','O_ELO','Creation_Time','Your_bot','Status','C_Score','O_Score','Winner','Game Log','Player Log']
table_data = {h: [] for h in columns}

for p in tqdm.tqdm(range(PAGE_MIN,PAGE_MAX+1)):
            
    driver.get(f"https://scrimmage.pokerbots.org/team/games?page={p}")
    cell_data = driver.find_elements(By.TAG_NAME, "td") 
    assert len(cell_data) % 12 == 0, f'expected 12 columns including Logs, but there are {len(cell_data)} cells, which is not divisible by 12'
    
    Nrows = len(cell_data) // 12
    
    for i,cell in enumerate(cell_data):
        
        c = i % 12 # get column index
            
        if c == 0:
            table_data['page'].append(p)
            
        if c in {10,11}:
            
            try:
                gameid = cell.find_elements(By.LINK_TEXT, "Log")[0].get_attribute('href').split('/')[-2]
            except:
                gameid = None
            
            table_data[columns[c+1]].append(gameid)
            
        else:
            table_data[columns[c+1]].append(cell.text)

100%|███████████████████████████████████████████| 64/64 [03:36<00:00,  3.38s/it]


In [16]:
metadata = pd.DataFrame(table_data)

Check bad values.
(1) Determine game id, if absent player log set to 0

In [17]:
metadata['Game Log'].isnull().sum(),metadata['Player Log'].isnull().sum()

(0, 10)

In [18]:
metadata['game_id'] = metadata['Game Log']
metadata['Player Log'].loc[metadata['Player Log'].isnull()] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


(2) Fix Missing Score

In [19]:
(metadata['C_Score'] == 'None').sum(),(metadata['O_Score'] == 'None').sum()

(10, 0)

In [20]:
metadata['C_Score'].loc[metadata['C_Score'] == 'None'] = 0

Add headers

In [21]:
dtypes = ['int','string','float','string','float','datetime64[ns]','string','string','int','int','string','int','int','int']
metadata = metadata.astype({columns[i]: dtypes[i] for i in range(len(columns))})

Write dataframe

In [22]:
OUTPUTDIR = './scrimmage_data'
metadata.to_pickle(f'{OUTPUTDIR}/metadata.pkl')