In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import datetime
import warnings
import requests
import json
import sys
import os

# Show up to 500 columns when a DataFrame is displayed, and silence all warnings.
pd.options.display.max_columns = 500
warnings.simplefilter('ignore')

In [2]:
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium import webdriver

# Address of a local proxy (presumably the Tor SOCKS listener -- confirm).
# NOTE(review): no other cell shown in this notebook reads this value.
proxy_address = "127.0.0.1:9150"

# Setting up my Tor initialization
# The per-platform defaults below are machine-specific paths to the Firefox
# binary bundled with Tor Browser.  Setting TOR_FIREFOX_BINARY in the
# environment overrides them without editing the notebook.
if sys.platform == "win32":
    default_tor = r'C:\Users\cliffchew84\Desktop\Tor Browser\Browser\firefox'
else:
    default_tor = r'/Applications/TorBrowser.app/Contents/MacOS/firefox'
tor = os.environ.get("TOR_FIREFOX_BINARY", default_tor)

#### Functions 

In [3]:
def json_to_table(json):
    """Split a decoded stats payload into one raw DataFrame per result set.

    Parameters
    ----------
    json : dict
        Decoded payload holding a "resultSets" list.  Each entry provides
        "headers" (the column labels) and "rowSet" (the data rows).  The
        parameter name shadows the ``json`` module inside this function only;
        it is kept so existing callers keep working.

    Returns
    -------
    dict
        Maps the position of each result set to a DataFrame whose first row
        holds the header labels and whose remaining rows hold the data.  The
        column labels are still positional integers at this stage.

    Raises
    ------
    KeyError
        If "resultSets", "headers" or "rowSet" is missing.
    """
    tables = dict()
    for position, result_set in enumerate(json["resultSets"]):
        # One-row frame holding the labels, followed by the data rows.
        header_row = pd.DataFrame(result_set["headers"]).T
        data_rows = pd.DataFrame(result_set["rowSet"])

        # DataFrame.append was removed in pandas 2.0.  pd.concat stacks the
        # frames the same way and keeps the original row labels (the header
        # row is labelled 0, then the data rows are labelled 0..n-1).
        tables[position] = pd.concat([header_row, data_rows])
    return tables

In [4]:
def format_table(table):
    """Promote the first row of ``table`` to column labels and drop that row.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame whose first row holds the header labels, as built by
        ``json_to_table``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the data rows, labelled with the header
        values.  ``table`` itself is left untouched.  An empty ``table`` has
        no header row to promote and is returned as an empty copy.
    """
    if len(table) == 0:
        return table.copy()

    # Copy before relabelling: the caller's frame must not be renamed as a
    # side effect, and callers add columns to the result, which on a slice of
    # the input triggers pandas' SettingWithCopyWarning.
    formatted = table.iloc[1:].copy()
    formatted.columns = table.iloc[0]
    return formatted

### Load existing tables if they exist!

In [5]:
def _load_table(csv_name):
    """Read one saved table, keeping GAME_ID as a zero-padded 10-character string."""
    table = pd.read_csv(csv_name, dtype={'GAME_ID': str})
    table["GAME_ID"] = [str(value).zfill(10) for value in table["GAME_ID"]]
    return table


# Resume from the CSVs of a previous run when every one of them can be read;
# otherwise start from empty tables and the default game id.
try:
    # First part
    main_players = _load_table("main_players_2017.csv")
    main_team = _load_table("team_2017.csv")
    main_start_bench = _load_table("start_bench_2017.csv")

    # Second part
    main_more_team_stats = _load_table("more_team_stats_2017.csv")
    main_referees = _load_table("referees_2017.csv")
    main_game_date = _load_table("game_date_2017.csv")

    # Third Part
    main_home_away = _load_table("more_home_away_2017.csv")
    main_wins_losses = _load_table("more_wins_losses_2017.csv")

    # One past the GAME_ID stored in the last row of the game-date table.
    # .iloc[-1] avoids calling int() on a Series, which pandas has deprecated.
    game_id_num = int(main_game_date["GAME_ID"].iloc[-1]) + 1

except (IOError, OSError, ValueError, KeyError, IndexError) as load_error:
    # Expected failures only: missing file, empty or unparsable CSV, absent
    # GAME_ID column, or a table with no rows.  A bare ``except`` would also
    # swallow KeyboardInterrupt and programming errors.
    # NOTE(review): after this fallback the save cell writes to the same file
    # names, so any CSVs that did load are replaced -- confirm that is intended.
    print("Could not load saved tables ({}: {}); starting from empty tables.".format(
        type(load_error).__name__, load_error))

    main_players = pd.DataFrame()
    main_team = pd.DataFrame()
    main_start_bench = pd.DataFrame()

    main_more_team_stats = pd.DataFrame()
    main_referees = pd.DataFrame()
    main_game_date = pd.DataFrame()

    main_wins_losses = pd.DataFrame()
    main_home_away = pd.DataFrame()

    game_id_num = 21700001

### Establishing the browser connection (currently Chrome via chromedriver; the Tor/Firefox launch is commented out) --> Only need to do this once at the start!
- https://stackoverflow.com/questions/28307469/chrome-driver-needs-to-be-available-in-the-path-error-on-mac

In [6]:
# Start the browser session that the scraping cells drive.
if sys.platform == "win32":
    # Raw string: in a plain literal "\U" starts a unicode escape on Python 3
    # and the cell fails to compile.  The path value itself is unchanged.
    browser = webdriver.Chrome(r'C:\Users\cliffchew84\Desktop\chromedriver_win32\chromedriver.exe')
else:
    # Built but unused while the Firefox launch below stays commented out.
    firefox_binary = FirefoxBinary(tor)
    # browser = webdriver.Firefox(firefox_binary=firefox_binary)
    browser = webdriver.Chrome(r"/usr/local/bin/chromedriver")

In [7]:
# Game ids (10-character strings) that the scraping cells fetch.
test_list = ["0021700803"]

In [8]:
# Fetch the traditional box score of every game in test_list and add its
# player, team and starter/bench tables to the running main_* tables.
for game_id in test_list:
    # NOTE(review): Season=2013-14 is hard-coded although the output files are
    # named *_2017 -- confirm the endpoint ignores it when GameID is given.
    indepth_game_stats = "http://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=28800&GameID={}&RangeType=0&Season=2013-14&SeasonType=Regular+Season&StartPeriod=1&StartRange=0".format(game_id)

    browser.get(indepth_game_stats)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = bs(browser.page_source, "html.parser")
    if soup.text == u'An error has occurred.':
        continue

    # The JSON payload is read from the last <pre> element of the page.
    # Without one there is nothing to parse; skip the game instead of reusing
    # the payload of a previous iteration (or failing on an undefined name).
    pre_tags = soup.find_all("pre")
    if not pre_tags:
        print("No <pre> payload found for game {}; skipping.".format(game_id))
        continue
    testson = json.loads(pre_tags[-1].get_text())

    df = json_to_table(testson)

    players = format_table(df[0])
    team = format_table(df[1])
    start_bench = format_table(df[2])

    # Combine to main tables (DataFrame.append was removed in pandas 2.0)
    main_players = pd.concat([main_players, players])
    main_team = pd.concat([main_team, team])
    main_start_bench = pd.concat([main_start_bench, start_bench])

##### Tables with broader data

In [9]:
# Fetch the box-score summary of every game in test_list and add the selected
# result sets, each tagged with its GAME_ID, to the running main_* tables.
for game_id in test_list:
    game_gen = "http://stats.nba.com/stats/boxscoresummaryv2?GameID={}".format(game_id)

    browser.get(game_gen)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup2 = bs(browser.page_source, "html.parser")
    if soup2.text == u'An error has occurred.':
        # Same error-page check as the traditional box-score cell.
        continue

    #### Process table!
    # The JSON payload is read from the last <pre> element of the page.
    # Without one, skip the game instead of reusing a previous payload.
    pre_tags = soup2.find_all("pre")
    if not pre_tags:
        print("No <pre> payload found for game {}; skipping.".format(game_id))
        continue
    test_test = json.loads(pre_tags[-1].get_text())

    df = json_to_table(test_test)

    # ``unicode`` exists only on Python 2.  Formatting into a u"" literal
    # yields the same text type there and a plain str on Python 3.
    game_id_text = u"{}".format(game_id)

    ##### Tracking useful games...
    # NOTE(review): the positions 1, 2, 4, 0 and 5 presumably follow the
    # result-set order returned by the endpoint -- confirm against a payload.
    # .assign returns a new frame, so nothing is written into a slice of the
    # raw table.
    more_team_stats = format_table(df[1]).assign(GAME_ID=game_id_text)
    referees = format_table(df[2]).assign(GAME_ID=game_id_text)
    game_date = format_table(df[4]).assign(GAME_ID=game_id_text)

    # Part 3
    home_away = format_table(df[0]).assign(GAME_ID=game_id_text)
    wins_losses = format_table(df[5]).assign(GAME_ID=game_id_text)

    # Combine to the main tables (DataFrame.append was removed in pandas 2.0)
    main_more_team_stats = pd.concat([main_more_team_stats, more_team_stats])
    main_referees = pd.concat([main_referees, referees])
    main_game_date = pd.concat([main_game_date, game_date])

    main_home_away = pd.concat([main_home_away, home_away])
    main_wins_losses = pd.concat([main_wins_losses, wins_losses])

#### Save all files locally! 

In [10]:
# Write every accumulated table back to its CSV without the row index.
# Only the team table is de-duplicated before it is written.
tables_to_save = [
    (main_players, "main_players_2017.csv"),
    (main_team.drop_duplicates(), "team_2017.csv"),
    (main_start_bench, "start_bench_2017.csv"),
    (main_more_team_stats, "more_team_stats_2017.csv"),
    (main_referees, "referees_2017.csv"),
    (main_game_date, "game_date_2017.csv"),
    (main_home_away, "more_home_away_2017.csv"),
    (main_wins_losses, "more_wins_losses_2017.csv"),
]
for frame, csv_name in tables_to_save:
    frame.to_csv(csv_name, index=False)

##### Playoff games started on 16 April 2016 - 1230 is the last game!
http://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=28800&GameID=0041500111&RangeType=0&Season=2015-16&SeasonType=Playoffs&StartPeriod=1&StartRange=0

In [11]:
# quit() ends the whole WebDriver session, including the driver process.
# close() only closes the current window and can leave the driver running.
browser.quit()