# Miami Data Science Meetup
# Project 1: Poker Data
### October 4, 2015

This set of functions takes multiple Texas Hold-'Em hand files downloaded from the [University of Alberta Computer Poker Research group's website](http://poker.cs.ualberta.ca/IRCdata/) to the local machine, parses them into a list of Python dictionaries, and outputs a JSON file called cprg_hands.json. 

This JSON file can then be imported into MongoDB (assuming you have an instance running) by running the following command from a terminal (shown for macOS; not tested on Windows):
>mongoimport --host 127.0.0.1:27017 --db <db name> --collection <collection name> --file cprg_hands.json

In [None]:
'''
An element of the Python dictionary list that this code produces looks like this:

{'_id': 'holdem_199601_820830094',
 'board': [{'stage': 'flop', 'suit': 'clubs', 'value': 'queen'},
           {'stage': 'flop', 'suit': 'spades', 'value': '4'},
           {'stage': 'flop', 'suit': 'spades', 'value': '6'},
           {'stage': 'turn', 'suit': 'diamonds', 'value': '5'},
           {'stage': 'river', 'suit': 'diamonds', 'value': '4'}],
 'dealer': '20',
 'game_type': 'holdem',
 'hand_num': '1163',
 'num_players': '2',
 'players': {'Jak': {'action': 40,
                     'bankroll': 850,
                     'bet_actions': [{'actions': ['blind bet', 'call'],
                                      'stage': 'pre-flop'},
                                     {'actions': ['check', 'call'],
                                      'stage': 'flop'},
                                     {'actions': ['check', 'call'],
                                      'stage': 'turn'},
                                     {'actions': ['check'],
                                      'stage': 'river'}],
                     'player_cards': [{'suit': 'clubs', 'value': '7'},
                                      {'suit': 'clubs', 'value': 'ace'}],
                     'position': 1,
                     'username': 'Jak',
                     'winnings': 80},
             'num': {'action': 40,
                     'bankroll': 1420,
                     'bet_actions': [{'actions': ['blind bet', 'check'],
                                      'stage': 'pre-flop'},
                                     {'actions': ['bet'], 'stage': 'flop'},
                                     {'actions': ['bet'], 'stage': 'turn'},
                                     {'actions': ['check'],
                                      'stage': 'river'}],
                     'player_cards': [{'suit': 'hearts', 'value': '9'},
                                      {'suit': 'hearts', 'value': 'king'}],
                     'position': 2,
                     'username': 'num',
                     'winnings': 0}},
 'pots': [{'num_players': '2', 'stage': 'flop', 'starting_pot_size': '20'},
          {'num_players': '2', 'stage': 'turn', 'starting_pot_size': '40'},
          {'num_players': '2', 'stage': 'river', 'starting_pot_size': '80'},
          {'num_players': '2',
           'stage': 'showdown',
           'starting_pot_size': '80'}]}

'''


# Parses the "hdb", "hroster" and "pdb" files from the IRC Poker Database http://poker.cs.ualberta.ca/irc_poker_database.html
import xlrd
import os
import csv
from tarfile import TarFile
import pprint
import re
import codecs
import json

# Directory that is walked for downloaded .tgz archives and for the extracted
# hand folders; the script also passes it to loop_tgz() as the extraction target.
# NOTE(review): machine-specific absolute path -- edit before running elsewhere.
tgz_extract_directory = "/Users/davidsmith/Downloads/"
# Output file written by process_hands(): one JSON document per line.
OUTFILE = tgz_extract_directory + "cprg_hands.json"

# Unused zip helper, left commented out by the original author.
#def open_zip(datafile):
#    with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
#        myzip.extractall()

# Global variables

# Stage labels for the four "players/pot" fields of an hdb line, in file order.
pot_cats = ["flop", "turn", "river", "showdown"]
# Card rank letters that are spelled out; any other rank character is kept as is.
deck = {'A': 'ace', 'K': 'king', 'Q': 'queen', 'J': 'jack', 'T': '10'}
# Card suit letter -> suit name.
suits = {'c': 'clubs', 's': 'spades', 'h': 'hearts', 'd': 'diamonds'}
# Single-character betting codes found in pdb files -> readable action name.
bet_action_codes = {
            '-' : 'no action',
            'B' : 'blind bet',
            'f' : 'fold',
            'k' : 'check',
            'b' : 'bet',
            'c' : 'call',
            'r' : 'raise',
            'A' : 'all-in',
            'Q' : 'quits game',
            'K' : 'kicked from game'
            }
# Stage labels for the four betting-round fields of a pdb line, in file order.
bet_action_cats = ["pre-flop", "flop", "turn", "river"]
# Matches a directory path whose last six characters are digits.
folder_search_re = re.compile(r'\d{6}$', re.IGNORECASE)
# Matches archive names of the form "<name>.<six digits>.tgz".  The dots are
# escaped so they only match a literal "."; loop_tgz() later splits the name
# on "." and relies on those separators really being present.
tgz_search_re = re.compile(r'^\S*\.\d{6}\.tgz$', re.IGNORECASE)
# Game types whose archives are extracted; all others are skipped.
valid_game_types = {"holdem", "holdem1", "holdem2", "holdem3", "holdemii", "holdempot", "nolimit", "tourney"}

def parse_hdb_file(hdb_file, hands):
    """Parse one "hdb" file and add one entry per line to ``hands``.

    Every non-blank line is split on whitespace and read positionally:
    field 0 is the hand identifier, fields 1-3 are stored as ``dealer``,
    ``hand_num`` and ``num_players``, fields 4-7 are ``players/pot`` pairs
    (labelled with ``pot_cats`` in order) and any remaining fields are
    board cards written as rank character + suit character.

    Args:
        hdb_file: "/"-separated path to the file.  The two directory names
            directly above the file become the id prefix
            ``<dir[-3]>_<dir[-2]>_``, and ``dir[-3]`` is stored as
            ``game_type``.
        hands: dict keyed by hand ``_id``; updated in place.

    Returns:
        Tuple ``(hands, id_prefix)``.

    Raises:
        IndexError: if a non-blank line has fewer than four fields or a pot
            field has no "/".
        KeyError: if a board card uses an unknown suit character.
    """
    split_filename = hdb_file.split("/")
    game_type = split_filename[-3]
    id_prefix = game_type + "_" + split_filename[-2] + "_"
    # Stage of each board card by its position on the line; cards beyond the
    # fifth get no "stage" key.
    board_stages = ("flop", "flop", "flop", "turn", "river")

    with open(hdb_file, "r") as hdb:
        for line in hdb:
            # split() with no argument drops repeated spaces, tabs and any
            # trailing "\r\n" in one step.
            line_parts = line.split()
            if not line_parts:
                # Blank (or whitespace-only) line: nothing to parse.
                continue

            _id = id_prefix + line_parts[0]

            pots = []
            for stage, pot_field in zip(pot_cats, line_parts[4:8]):
                pot_parts = pot_field.split("/")
                pots.append({
                    "stage": stage,
                    "num_players": pot_parts[0],
                    "starting_pot_size": pot_parts[1],
                })

            board = []
            # The stage comes from the card's real position, not from
            # list.index(), which returns the first equal token.
            for card_pos, card in enumerate(line_parts[8:]):
                board_card = {}
                if card_pos < len(board_stages):
                    board_card["stage"] = board_stages[card_pos]
                board_card["value"] = deck.get(card[0], card[0])
                board_card["suit"] = suits[card[1]]
                board.append(board_card)

            hands[_id] = {
                "_id": _id,
                "game_type": game_type,
                "dealer": line_parts[1],
                "hand_num": line_parts[2],
                "num_players": line_parts[3],
                "pots": pots,
                "board": board,
            }
    return hands, id_prefix


def parse_hroster_file(hroster_file, id_prefix, hands):
    """Attach the player roster from an "hroster" file to existing hands.

    Every non-blank line is split on whitespace: field 0 is the hand
    identifier, field 1 is not used, and every remaining field is a
    username.  For each line, ``hands[id_prefix + field 0]["players"]`` is
    replaced by a dict mapping each username to ``{"username": <name>}``.

    Args:
        hroster_file: path to the roster file.
        id_prefix: prefix that turns a hand identifier into a key of ``hands``.
        hands: dict keyed by hand ``_id``; updated in place.

    Returns:
        The same ``hands`` dict.

    Raises:
        KeyError: if a roster line refers to a hand that is not in ``hands``.
    """
    with open(hroster_file, "r") as hroster:
        for line in hroster:
            # split() with no argument also removes a trailing "\r", so the
            # last username on a CRLF line is not corrupted.
            line_parts = line.split()
            if not line_parts:
                # Blank line (e.g. at end of file): skip instead of crashing.
                continue
            _id = id_prefix + line_parts[0]
            hands[_id]["players"] = {
                name: {"username": name} for name in line_parts[2:]
            }
    return hands

    
def parse_pdb_file(pdb_file, id_prefix, hands, invalid_keys):
    """Merge one player's "pdb" file into the matching hands.

    The username is the text after the last "." in ``pdb_file``.  Every
    non-blank line is split on whitespace and read positionally: field 1 is
    the hand identifier, field 3 the position, fields 4-7 the betting codes
    for the stages in ``bet_action_cats``, fields 8-10 the bankroll, action
    and winnings, and, only when the line has exactly 13 fields, fields
    11-12 the player's two cards.  Fields 0 and 2 are not used.

    If the username is not in the hand's roster, the hand id is added to
    ``invalid_keys`` and the hand is left unchanged.

    Args:
        pdb_file: path to the player's file.
        id_prefix: prefix that turns a hand identifier into a key of ``hands``.
        hands: dict keyed by hand ``_id``; each hand needs a "players" dict.
            Updated in place.
        invalid_keys: set collecting ids of hands with a username mismatch.

    Returns:
        The same ``hands`` dict.

    Raises:
        KeyError: on an unknown betting code or card suit, or when the hand
            id (or its "players" entry) is missing from ``hands``.
        IndexError, ValueError: when a non-blank line has too few fields or
            a non-integer bankroll/action/winnings/position.
    """
    username = pdb_file.split(".")[-1]
    with open(pdb_file, "r") as pdb:
        for line in pdb:
            line_parts = line.split()
            if not line_parts:
                # Blank line: nothing to parse.
                continue

            _id = id_prefix + line_parts[1]
            position = line_parts[3]

            # Take the betting columns by position.  Looking them up with
            # list.index() picks the first equal token on the line, which
            # mislabels or drops a stage when a value repeats elsewhere.
            bet_actions = []
            for stage, codes in zip(bet_action_cats, line_parts[4:8]):
                bet_actions.append({
                    "actions": [bet_action_codes[code] for code in codes],
                    "stage": stage,
                })

            bankroll, action, winnings = line_parts[8:11]

            player_cards = []
            if len(line_parts) == 13:
                for card in line_parts[11:13]:
                    player_cards.append({
                        "value": deck.get(card[0], card[0]),
                        "suit": suits[card[1]],
                    })

            players = hands[_id]["players"]
            if username not in players:
                invalid_keys.add(_id)
                continue

            player = players[username]
            player["bet_actions"] = bet_actions
            player["bankroll"] = int(bankroll)
            player["action"] = int(action)
            player["winnings"] = int(winnings)
            player["player_cards"] = player_cards
            player["position"] = int(position)
    return hands


def loop_pdb_files(pdb_file_directory, hands_col, id_prefix, invalid_keys):
    """Run parse_pdb_file() on every file found under ``pdb_file_directory``.

    The directory tree is walked bottom-up and each file is parsed into
    ``hands_col`` (updated in place); username mismatches are collected in
    ``invalid_keys``.  A progress line is printed once the walk is done.

    Returns:
        The same ``hands_col`` dict.
    """
    for folder, _subdirs, filenames in os.walk(pdb_file_directory, topdown=False):
        for filename in filenames:
            parse_pdb_file(
                os.path.join(folder, filename),
                id_prefix,
                hands_col,
                invalid_keys,
            )
    print("...Finished processing " + pdb_file_directory)
    return hands_col


def loop_file_groups():
    """Parse every extracted hand folder below ``tgz_extract_directory``.

    A folder qualifies when its path matches ``folder_search_re``.  For each
    such folder the "hdb" file, the "hroster" file and the "pdb/" directory
    are parsed, in that order, into one shared dict of hands.  Hands flagged
    as invalid while reading the pdb files are dropped at the end.

    Returns:
        Tuple ``(hands_list, invalid_keys)``: the values of the remaining
        hands and the set of rejected hand ids.
    """
    hands = {}
    invalid_keys = set()
    for root, _dirs, _files in os.walk(tgz_extract_directory, topdown=False):
        if not folder_search_re.search(root):
            continue
        hdb_file = root + "/" + "hdb"
        print("Processing " + hdb_file + "...")
        hands, idp = parse_hdb_file(hdb_file, hands)
        hands = parse_hroster_file(root + "/" + "hroster", idp, hands)
        hands = loop_pdb_files(root + "/" + "pdb/", hands, idp, invalid_keys)

    # Keep only the hands that were never flagged as invalid.
    kept = {}
    for key in hands:
        if key not in invalid_keys:
            kept[key] = hands[key]
    return kept.values(), invalid_keys


def loop_tgz(extract_dir):
    """Extract every recognised poker archive into ``extract_dir``.

    The tree below the module-level ``tgz_extract_directory`` (not
    ``extract_dir``) is walked bottom-up.  A file is considered when its name
    matches ``tgz_search_re``; its game type is the third-from-last
    dot-separated part of the file name.  Archives whose game type is in
    ``valid_game_types`` are extracted, all others are reported and skipped.

    Args:
        extract_dir: directory the archive contents are extracted into.
    """
    for root, _dirs, filenames in os.walk(tgz_extract_directory, topdown=False):
        for filename in filenames:
            if not tgz_search_re.search(filename):
                continue
            tgz_file = os.path.join(root, filename)

            # Derive the game type from the file name only, so dots in the
            # directory path cannot shift the parts; a name with fewer than
            # three parts has no game type and is skipped below.
            name_parts = filename.split(".")
            game_type = name_parts[-3] if len(name_parts) >= 3 else None

            if game_type not in valid_game_types:
                print("Skipping " + tgz_file + " because it is for an invalid game type")
                continue

            print("Extracting " + tgz_file)
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only run this on archives from a trusted source.
            # The with-block closes the archive even if extraction fails.
            with TarFile.open(tgz_file) as tar:
                tar.extractall(extract_dir)

                
def process_hands(hands_list):
    """Write each hand in ``hands_list`` to OUTFILE as one JSON document per line."""
    with codecs.open(OUTFILE, "w") as out:
        # The with-block closes the file; no explicit close() is needed.
        out.writelines(json.dumps(hand) + "\n" for hand in hands_list)
    
    
# Script driver: extract the archives, parse every hand folder, then write
# the surviving hands to OUTFILE and report the totals.
loop_tgz(tgz_extract_directory)
hnds, ik = loop_file_groups()
print("Processed " + str(len(hnds)) + " total hands")
process_hands(hnds)
print("Rejected " + str(len(ik)) + " distinct invalid hands due to username mismatches")
print("All done!")


# Still to do 9/28/15:
# Download the files directly from the Internet (http://poker.cs.ualberta.ca/IRCdata/)
# DONE Loop through hdb and hroster files, DONE --> be sure to tack YearMonth onto front of timestamp field
# DONE Loop through tgz files
# DONE Change working directory