In [4]:
import pandas as pd,seaborn as sns,matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
import json
import requests
from bs4 import BeautifulSoup
from modules.functions import get_schedule,create_home_and_away_simple_dataframe,plot_game_trends
from modules.functions import calculate_possessions,get_agg_boxscore,get_game_timestamp_half,clean_dataframe
from IPython.display import clear_output
import time

In [None]:
# Call get_schedule (imported from modules.functions) for "Purdue"; as the last
# expression of the cell, its return value is what the notebook displays.
get_schedule("Purdue")

In [9]:
def _prepare_players(players):
    """Turn one raw single-column ESPN player table into Player/Position columns."""
    players.columns = ['Player']

    # Drop the first row and the "bench" separator row; neither is a player.
    players = players.iloc[1:]
    # .copy() so the column assignments below act on an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    players = players.loc[players.Player != "bench"].copy()

    # The last character of the raw cell becomes the position (G, F, C);
    # the name is whatever precedes the final two characters.
    players['Position'] = [raw[-1] for raw in players.Player]
    players['Player'] = [raw[:-2].strip() for raw in players.Player]
    return players


def create_home_and_away_simple_dataframe(game_id:int,
                                          disp: bool = False) -> tuple:
    """Scrape an ESPN men's college basketball box score into per-team DataFrames.

    Note: this notebook-level definition shadows the function of the same name
    imported from ``modules.functions`` in the imports cell.

    Parameters
    ----------
    game_id : int
        ESPN game id. It is only interpolated into the box score URL, so a
        numeric string produces the same request.
    disp : bool, default False
        When True, display the away frame followed by the home frame and
        return ``None`` instead of the frames.

    Returns
    -------
    tuple or None
        ``(home_df, away_df)``, each carrying an outer index level keyed by the
        team name taken from the page title. ``None`` when ``disp`` is True.
        If the stats tables contain neither an ``FG`` nor a ``MIN`` column, the
        partially processed ``(home_stats, away_stats)`` are returned as-is
        after printing a message.

    Raises
    ------
    Exception
        If the box score request does not return HTTP status 200.
    """
    url = f'https://www.espn.com/mens-college-basketball/boxscore/_/gameId/{game_id}'
    r = requests.get(url)

    # Fail fast: do not parse an error page.
    if r.status_code != 200:
        raise Exception("Possibly invalid game_id.  Request did not return status code 200")

    soup = BeautifulSoup(r.content,'lxml')

    # Isolate the home team, away team, and game date from the <title> tag.
    # The away team is always first.
    # NOTE(review): the title is split on '-', so a team name containing a
    # hyphen would be cut short -- confirm against real pages.
    title_text = str(soup.find('title'))
    matchup = [part for part in title_text.split('-') if " vs. " in part][0]
    away_team,home_team = [team.strip().title()
                           for team in matchup.replace('helmet="true">',"").split(" vs.")]
    game_date = title_text.split("-")[-1].split("|")[0].strip()  # parsed but not used below

    # Infer tables with pandas.
    # NOTE(review): this downloads the page a second time rather than reusing `r`.
    dfs = pd.read_html(url)

    # pandas pulls in a lot of dataframes;
    # keep only the four entries we're interested in.
    away_players,away_stats,home_players,home_stats = dfs[1:5]

    away_players = _prepare_players(away_players)
    home_players = _prepare_players(home_players)

    # pandas doesn't recognize the first row as a header, so assign it manually
    away_stats.columns = away_stats.iloc[0,:].tolist()
    home_stats.columns = home_stats.iloc[0,:].tolist()

    # Remove the repeated header rows that break up each stats table
    if "FG" in home_stats.columns:
        home_stats = home_stats.loc[home_stats.FG != "FG"]
        away_stats = away_stats.loc[away_stats.FG != "FG"]
    elif "MIN" in home_stats.columns:
        home_stats = home_stats.loc[home_stats.MIN != "MIN"]
        away_stats = away_stats.loc[away_stats.MIN != "MIN"]
    else:
        print("Neither Column Exists")
        return home_stats,away_stats

    # Remove the last row as it's all null values
    home_stats = home_stats.iloc[:-1]
    away_stats = away_stats.iloc[:-1]

    # Merge the players and stats together (index-aligned join), dropping the
    # final joined row
    home_df = home_players.join(home_stats).iloc[:-1].fillna("")
    away_df = away_players.join(away_stats).iloc[:-1].fillna("")

    home_df = clean_dataframe(home_df)
    away_df = clean_dataframe(away_df)

    # Create an outer index level holding the team name
    away_df = pd.concat({away_team:away_df})
    home_df = pd.concat({home_team:home_df})

    # Set the Team row's PTS/FGA to zero
    # (the PTS/FGA column is presumably added by clean_dataframe -- confirm)
    home_df.loc[home_df.Player == "Team",'PTS/FGA'] = 0
    away_df.loc[away_df.Player == "Team",'PTS/FGA'] = 0

    if disp:
        display(away_df,home_df)
        return

    return home_df,away_df

# Show both box scores for one game. The id is passed as an int to match the
# function's `game_id: int` annotation; with disp=True the frames are displayed
# and the call itself returns None.
create_home_and_away_simple_dataframe(401484843,disp=True)

Unnamed: 0,Unnamed: 1,Player,PTS,FGM,FGA,3PM,3PA,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS/FGA,Position
Purdue,0,C. Furst,2,1,3,0,0,0,0,0,0,0,0,0,0,2,3,0.67,F
Purdue,1,Z. Edey,11,5,7,0,0,1,4,7,10,17,0,0,7,5,2,1.57,C
Purdue,2,B. Smith,8,1,4,1,2,5,6,1,5,6,4,1,0,3,3,2.0,G
Purdue,3,F. Loyer,22,9,21,2,10,2,3,0,3,3,0,1,0,0,1,1.05,G
Purdue,4,E. Morton,7,2,6,2,6,1,2,0,3,3,3,0,0,2,3,1.17,G
Purdue,5,T. Kaufman-Renn,6,2,4,1,2,1,3,3,2,5,2,1,0,0,1,1.5,F
Purdue,6,B. Waddell,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0.0,F
Purdue,7,B. Newman,4,1,6,0,4,2,2,2,5,7,0,0,0,0,3,0.67,G
Purdue,8,D. Jenkins Jr.,5,2,7,1,5,0,0,0,3,3,1,0,0,0,1,0.71,G
Purdue,9,Team,65,23,58,7,29,12,20,13,32,45,10,3,7,12,17,0.0,


Unnamed: 0,Unnamed: 1,Player,PTS,FGM,FGA,3PM,3PA,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS/FGA,Position
Nebraska,0,J. Gary,6,2,7,2,6,0,1,2,6,8,2,2,1,1,4,0.86,F
Nebraska,1,D. Walker,14,7,13,0,0,0,0,6,4,10,1,0,1,2,4,1.08,F
Nebraska,2,E. Bandoumel,14,4,13,3,10,3,3,0,1,1,0,2,0,4,4,1.08,G
Nebraska,3,C.J. Wilcher,0,0,6,0,3,0,0,1,0,1,3,0,0,0,3,0.0,G
Nebraska,4,S. Griesel,5,2,14,0,3,1,3,1,7,8,4,2,1,1,2,0.36,G
Nebraska,5,B. Keita,2,1,2,0,0,0,0,2,3,5,0,0,0,0,2,1.0,F
Nebraska,6,W. Breidenbach,2,1,3,0,1,0,0,1,0,1,0,0,0,0,1,0.67,F
Nebraska,7,J. Lawrence,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0.0,G
Nebraska,8,D. Dawson,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0.0,G
Nebraska,9,K. Tominaga,19,6,12,4,8,3,3,0,3,3,1,0,0,0,1,1.58,G


In [1]:
# Poll a game and accumulate one aggregated box score snapshot per clock change.
#
# NOTE: run the imports cell first. This cell relies on pd, np, time,
# clear_output, get_game_timestamp_half and get_agg_boxscore; on a fresh kernel
# it fails with "NameError: name 'pd' is not defined".
game_id = 401484843

MAX_IDLE_POLLS = 10             # stop after this many consecutive polls with an unchanged timestamp
ACTIVE_SLEEP_RANGE = (12, 17)   # seconds, passed to np.random.randint (upper bound exclusive)
IDLE_SLEEP_SECONDS = 25         # wait between polls while the timestamp is unchanged

n_tries = 0

# Keep snapshots collected by an earlier run of this cell; start empty otherwise.
try:
    test_df
except NameError:
    test_df = pd.DataFrame()

last_reported_time = None

while n_tries < MAX_IDLE_POLLS:

    timestamp,half = get_game_timestamp_half(game_id)

    if timestamp != last_reported_time:
        # New timestamp: reset the idle counter and record a fresh snapshot.
        last_reported_time = timestamp
        n_tries = 0
        clear_output()

        temp_df = get_agg_boxscore(game_id = game_id,disp = True)
        # Transpose and move the resulting index into a "Team" column.
        temp_df = temp_df.transpose().rename_axis("Team").reset_index()
        temp_df['Half'] = half
        temp_df['Timestamp'] = timestamp
        # Appended on every change so test_df is current if the cell is interrupted.
        test_df = pd.concat([test_df,temp_df])

        create_home_and_away_simple_dataframe(game_id,disp = True)
        time.sleep(np.random.randint(*ACTIVE_SLEEP_RANGE))
    else:
        time.sleep(IDLE_SLEEP_SECONDS)
        n_tries +=1

NameError: name 'pd' is not defined

In [None]:
def plot_game_trends(test_df,half = 1,color1 = 'black',color2 = 'blue'):
    """Draw one line chart per stat column for the requested half.

    Note: this notebook-level definition shadows the ``plot_game_trends``
    imported from ``modules.functions`` in the imports cell.

    After ``reset_index()``, the columns at positions 1 through 17 are each
    plotted against ``Timestamp`` with one line per ``Team``. Only rows whose
    ``Half`` equals ``half`` are used, and the first of those rows is skipped.

    Parameters
    ----------
    test_df : DataFrame
        Must provide ``Half``, ``Timestamp`` and ``Team`` columns.
    half : default 1
        Half to plot. The x-axis reads "1st Half" when this equals 1 and
        "2nd Half" for any other value.
    color1, color2 : str
        Line colors; the palette is passed to seaborn as ``[color2, color1]``.
    """
    frame = test_df.reset_index()
    stat_columns = frame.columns.tolist()[1:18]

    if half == 1:
        x_caption = 'Time Remaining, 1st Half'
    else:
        x_caption = 'Time Remaining, 2nd Half'

    for stat in stat_columns:
        plt.figure(figsize = (14,8))

        # Rows for the requested half, minus the first one.
        half_rows = frame[frame.Half == half].iloc[1:,:]
        sns.lineplot(data = half_rows,
                     x = 'Timestamp',
                     y = stat,
                     hue = 'Team',
                     palette = [color2,color1])

        # The title names the first two Team values of the whole frame.
        teams = frame.Team.tolist()
        plt.title(f"{teams[0]} vs {teams[1]}: {stat.replace('_',' ')}")
        plt.ylabel(stat.replace('_',' '))
        plt.xlabel(x_caption)
        plt.show()

In [None]:
# Plot the trends from the globals left by the polling cell: `test_df` (the
# accumulated snapshots) and `half` (the value returned by the most recent
# get_game_timestamp_half call). That cell must have run first.
plot_game_trends(test_df,half = half,color1='red',color2='black')