In [2]:
import requests
import fitz  # PyMuPDF
from io import BytesIO
import pandas as pd
import numpy as np
import camelot

In [55]:
pd.set_option('display.max_rows', None)

# Download PDF

In [3]:
# URL of the hosted PDF
pdf_url = "https://ak-static.cms.nba.com/referee/injury/Injury-Report_2024-11-08_12AM.pdf"

# Download the PDF
response = requests.get(pdf_url)
response.raise_for_status()

pdf_path = "temp.pdf"

with open(pdf_path, "wb") as f:
    f.write(response.content)

# Get PDF in Useful Format

In [96]:
columns = ["119,199,264,424,585,666"]

all_stream_tables = camelot.read_pdf(pdf_path, pages = 'all', flavor = 'stream', row_tol = 18, columns = columns)

dfs = []

for x in range(len(all_stream_tables)):
    table = all_stream_tables[x].df
    if x == 1:
        injury_report_datetime = table.iloc[0, 4]
        #print(datetime)
    table = table.iloc[1:]
    
    dfs.append(table)

column_names = ["game_date", "game_start_time", "Matchup", "Team", "Player Name", "status", "reason"]

concat_df = pd.concat(dfs, ignore_index = True)

concat_df.columns = column_names

concat_df['injury_report_datetime'] = injury_report_datetime

concat_df[concat_df.select_dtypes(include=['object']).columns] = concat_df.select_dtypes(include=['object']).apply(lambda x: x.str.strip())

concat_df.replace("", np.nan, inplace = True)

concat_df.iloc[:, 0:6] = concat_df.iloc[:, 0:6].ffill()

not_submitted_df = concat_df.loc[concat_df['reason'] == 'NOT YET SUBMITTED']

concat_df = concat_df.loc[concat_df['reason'] != 'NOT YET SUBMITTED']

concat_df['game_date'] = pd.to_datetime(concat_df['game_date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')

concat_df['game_start_time'] = concat_df['game_start_time'].str.replace(' (ET)', '')

concat_df['injury_report_datetime'] = concat_df['injury_report_datetime'].str.replace("Injury Report: ", "")

concat_df['injury_report_datetime'] = pd.to_datetime(concat_df['injury_report_datetime'], format = '%m/%d/%y %I:%M %p').dt.strftime('%Y-%m-%d %H:%M')

concat_df = concat_df[['injury_report_datetime', 'game_date', 'Matchup', 'Team', 'Player Name', 'status', 'reason']]

concat_df = concat_df.groupby(concat_df.columns[:-1].to_list(), as_index=False).agg({'reason': ' '.join})

concat_df[['player_last_name','player_first_name']] = concat_df['Player Name'].str.split(",", expand = True)

concat_df['player_name'] = concat_df['player_first_name'] + ' ' + concat_df['player_last_name']

concat_df[['visitor_team_id', 'home_team_id']] = concat_df['Matchup'].str.split('@', expand = True)

In [101]:
def get_nba_injury_pdf_data(pdf_path):
    columns = ["119,199,264,424,585,666"]

    all_stream_tables = camelot.read_pdf(pdf_path, pages = 'all', flavor = 'stream', row_tol = 18, columns = columns)

    dfs = []

    for x in range(len(all_stream_tables)):
        table = all_stream_tables[x].df
        if x == 1:
            injury_report_datetime = table.iloc[0, 4]
            #print(datetime)
        table = table.iloc[1:]
        
        dfs.append(table)

    column_names = ["game_date", "game_start_time", "Matchup", "Team", "Player Name", "status", "reason"]

    concat_df = pd.concat(dfs, ignore_index = True)

    concat_df.columns = column_names

    concat_df['injury_report_datetime'] = injury_report_datetime

    return concat_df

In [134]:
def fill_and_filter_nba_injury_pdf_data(concat_df):
    concat_df[concat_df.select_dtypes(include=['object']).columns] = concat_df.select_dtypes(include=['object']).apply(lambda x: x.str.strip())

    concat_df.replace("", np.nan, inplace = True)

    concat_df.iloc[:, 0:6] = concat_df.iloc[:, 0:6].ffill()

    not_submitted_df = concat_df.loc[concat_df['reason'] == 'NOT YET SUBMITTED']

    not_submitted_df = not_submitted_df.copy()

    concat_df = concat_df.loc[concat_df['reason'] != 'NOT YET SUBMITTED']

    return concat_df, not_submitted_df

In [123]:
def clean_nba_injury_df(concat_df):

    concat_df['game_date'] = pd.to_datetime(concat_df['game_date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')

    concat_df['game_start_time'] = concat_df['game_start_time'].str.replace(' (ET)', '')

    concat_df['injury_report_datetime'] = concat_df['injury_report_datetime'].str.replace("Injury Report: ", "")

    concat_df['injury_report_datetime'] = pd.to_datetime(concat_df['injury_report_datetime'], format = '%m/%d/%y %I:%M %p').dt.strftime('%Y-%m-%d %H:%M')

    concat_df = concat_df[['injury_report_datetime', 'game_date', 'Matchup', 'Team', 'Player Name', 'status', 'reason']]

    concat_df = concat_df.groupby(concat_df.columns[:-1].to_list(), as_index=False).agg({'reason': ' '.join})

    concat_df[['player_last_name','player_first_name']] = concat_df['Player Name'].str.split(",", expand = True)

    concat_df['player_name'] = concat_df['player_first_name'] + ' ' + concat_df['player_last_name']

    concat_df[['visitor_team_id', 'home_team_id']] = concat_df['Matchup'].str.split('@', expand = True)

    concat_df = concat_df[['injury_report_datetime', 'game_date', 'visitor_team_id', 'home_team_id', 'Team', 'player_name', 'player_first_name', 'player_last_name', 'status', 'reason']]

    return concat_df
    

In [130]:
def clean_nba_injury_missing(not_submitted_df):

    not_submitted_df.loc[not_submitted_df['Player Name'] != np.nan, 'player_name'] = np.nan

    not_submitted_df.loc[not_submitted_df['status'] != np.nan, 'status'] = np.nan
    
    not_submitted_df['game_date'] = pd.to_datetime(not_submitted_df['game_date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')

    not_submitted_df['game_start_time'] = not_submitted_df['game_start_time'].str.replace(' (ET)', '')

    not_submitted_df['injury_report_datetime'] = not_submitted_df['injury_report_datetime'].str.replace("Injury Report: ", "")

    not_submitted_df['injury_report_datetime'] = pd.to_datetime(not_submitted_df['injury_report_datetime'], format = '%m/%d/%y %I:%M %p').dt.strftime('%Y-%m-%d %H:%M')

    not_submitted_df = not_submitted_df[['injury_report_datetime', 'game_date', 'Matchup', 'Team', 'player_name', 'status', 'reason']]

    not_submitted_df[['visitor_team_id', 'home_team_id']] = not_submitted_df['Matchup'].str.split('@', expand = True)

    not_submitted_df['player_first_name'] = np.nan

    not_submitted_df['player_last_name'] = np.nan

    not_submitted_df = not_submitted_df[['injury_report_datetime', 'game_date', 'visitor_team_id', 'home_team_id', 'Team', 'player_name', 'player_first_name', 'player_last_name', 'status', 'reason']]

    return not_submitted_df

In [136]:
nba_pdf_data = get_nba_injury_pdf_data(pdf_path)

submitted_injuries, not_submitted = fill_and_filter_nba_injury_pdf_data(nba_pdf_data)

cleaned_submitted_injuries = clean_nba_injury_df(submitted_injuries)

cleaned_not_submitted = clean_nba_injury_missing(not_submitted)

full_injury_df = pd.concat([cleaned_submitted_injuries, cleaned_not_submitted], ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concat_df['game_date'] = pd.to_datetime(concat_df['game_date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concat_df['game_start_time'] = concat_df['game_start_time'].str.replace(' (ET)', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concat_df['in

In [137]:
full_injury_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   injury_report_datetime  129 non-null    object
 1   game_date               129 non-null    object
 2   visitor_team_id         129 non-null    object
 3   home_team_id            129 non-null    object
 4   Team                    129 non-null    object
 5   player_name             118 non-null    object
 6   player_first_name       118 non-null    object
 7   player_last_name        118 non-null    object
 8   status                  118 non-null    object
 9   reason                  129 non-null    object
dtypes: object(10)
memory usage: 10.2+ KB
