In [557]:
import requests
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import time
import pandas as pd
import numpy as np
import pickle

In [572]:
def web_scrape(inputlist, timeout=30):
    '''
    Scrape the table rows of one game's box-score page into a dataframe.

    INPUT:
        inputlist: the unique game identifier string taken from the pickle.
            Characters 0-3 are read as the year, 4-5 as the month, 6-7 as the
            day and everything from index 9 on as the team code. Index 8 is
            not read; the URL template inserts a literal '0' between the date
            and the team code instead.
        timeout: seconds to wait on the HTTP request before requests raises
            a timeout error, so a stalled connection cannot hang the scrape.

    OUTPUT: a pandas dataframe with one row per <tr> on the page after the
    first two. Columns are named after the <th> cells of the second <tr>
    (minus the first one), plus a 'Player' column holding each row's first
    <th> text. All values are the raw page text.
    '''
    # Only this many leading row labels are copied into 'Player'.
    # NOTE(review): 66 is inherited from the original code; it is presumably
    # sized to the page layout -- confirm before changing it.
    name_row_limit = 66

    year = inputlist[:4]
    month = inputlist[4:6]
    day = inputlist[6:8]
    team = inputlist[9:]
    # from the pickle, had to modify the inputlist single string to its components

    url = f'https://www.basketball-reference.com/boxscores/{year}{month}{day}0{team}.html'
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # scrape the site

    # Collect the rows once: the second row supplies the column headers and
    # everything after it is treated as data.
    all_rows = soup.find_all('tr')
    headers = [th.get_text() for th in all_rows[1].find_all('th')]
    data_rows = all_rows[2:]

    # <td> cells hold the statistics; the row label (player name) lives in a
    # <th> cell, so it has to be collected separately.
    cell_values = [[td.get_text() for td in row.find_all('td')] for row in data_rows]
    row_labels = [[th.get_text() for th in row.find_all('th')] for row in data_rows]

    # The first header belongs to the label column, which is not part of the
    # <td> values, hence headers[1:].
    stats = pd.DataFrame(cell_values, columns=headers[1:])

    # Assignment aligns on the index, so rows past the limit end up with a
    # missing 'Player' value.
    stats['Player'] = pd.DataFrame(row_labels)[0][:name_row_limit]

    return stats

In [574]:
def team_summary(inputlist, timeout=30):
    '''
    Create a summary table with team name, team score and game date.

    INPUT:
        inputlist: the unique game identifier string taken from the pickle.
            Characters 0-3 are read as the year, 4-5 as the month, 6-7 as the
            day and everything from index 9 on as the team code. Index 8 is
            not read; the URL template inserts a literal '0' between the date
            and the team code instead.
        timeout: seconds to wait on the HTTP request before requests raises
            a timeout error, so a stalled connection cannot hang the scrape.

    OUTPUT: a pandas dataframe with columns 'Team_Name', 'Score' and 'Date',
    one row per <strong> element found in the page's first 'scorebox'
    element. All values are text taken from the page.
    '''
    year = inputlist[:4]
    month = inputlist[4:6]
    day = inputlist[6:8]
    team = inputlist[9:]
    # from the pickle, had to modify the inputlist single string to its components

    url = f'https://www.basketball-reference.com/boxscores/{year}{month}{day}0{team}.html'
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # perform the web scrape

    # Everything below is read from the first element with class 'scorebox'.
    scorebox = soup.find_all(class_='scorebox')[0]

    # Team names come from the <strong> tags and scores from the 'scores'
    # elements; the surrounding newlines are stripped from both.
    team_names = [tag.get_text().strip('\n') for tag in scorebox.find_all('strong')]
    team_points = [tag.get_text().strip('\n') for tag in scorebox.find_all(class_='scores')]

    # The date is rebuilt from the first three comma-separated pieces of the
    # 'scorebox_meta' text. Per the original author's note these are the time,
    # the date and the year; only the first five characters of the third
    # piece are kept so that whatever text follows the year is dropped.
    meta_text = scorebox.find_all(class_='scorebox_meta')[0].get_text().strip('\n')
    meta_parts = meta_text.split(',')
    game_date = ' '.join([meta_parts[0], meta_parts[1], meta_parts[2][:5].strip(' ')])

    # Assemble the final dataframe; the same date is repeated once per team
    # (two entries, one for each side of the game).
    teams_scores = pd.DataFrame(team_names, columns=['Team_Name'])
    teams_scores['Score'] = team_points
    teams_scores['Date'] = [game_date] * 2

    return teams_scores
    

In [656]:
def four_factors_output(inputlist):
    '''
    Using the information from the web_scrape and team_summary dataframes,
    create the final per-team dataframe used for the four-factors analysis.

    INPUT: the unique game identifier string taken from the pickle.
    Characters 0-3 are read as the year, 4-5 as the month, 6-7 as the day and
    everything from index 9 on as the team code.

    OUTPUT: a pandas dataframe holding the rows whose 'Player' was relabelled
    to one of the two team names (first team's rows first). Columns are the
    selected box-score columns plus 'Date', the four factors ('eFG',
    'TOV_per', 'ORB_per', 'FTr'), 'id_t' (year + month + day + team code) and
    'loc' (the team code).
    '''
    year = inputlist[:4]
    month = inputlist[4:6]
    day = inputlist[6:8]
    team = inputlist[9:]
    # from the pickle, had to modify the inputlist single string to its components

    stats = web_scrape(inputlist)
    teams_scores = team_summary(inputlist)
    # input information from the web_scrape and team_summary functions

    wanted_cols = ['Player', 'MP', 'FG', 'FGA', '3P', 'FT', 'ORB', 'TOV', 'FTA', 'DRB', 'PTS']
    counting_cols = ['FG', 'FGA', '3P', 'FT', 'ORB', 'TOV', 'FTA', 'DRB']

    # Dropping rows with any null value leaves only the rows of interest.
    # The explicit copy makes the frame independent of `stats`, so the column
    # assignments below are well defined and do not warn.
    box = stats[wanted_cols].dropna().copy()

    # Convert the counting columns to integers so they can be used in the
    # formulas; text that is not numeric becomes 0. 'MP' and 'PTS' stay text.
    for col in counting_cols:
        box[col] = pd.to_numeric(box[col], errors='coerce').fillna(0).astype(int)

    first_name = teams_scores['Team_Name'].iloc[0]
    second_name = teams_scores['Team_Name'].iloc[1]

    # Relabel every row whose point total equals a team's score with that
    # team's name. .loc writes into `box` itself; the former chained form
    # box['Player'][mask] = ... is not guaranteed to modify the frame.
    box.loc[box['PTS'] == teams_scores['Score'].iloc[0], 'Player'] = first_name
    box.loc[box['PTS'] == teams_scores['Score'].iloc[1], 'Player'] = second_name
    box['Date'] = teams_scores['Date'].iloc[0]

    # These are the formulas for the four factors being analysed.
    box['eFG'] = (box['FG'] + 0.5 * box['3P']) / box['FGA']
    box['TOV_per'] = box['TOV'] / (box['FGA'] + 0.44 * box['FTA'] + box['TOV'])
    box['ORB_per'] = box['ORB'] / (box['ORB'] + box['DRB'])
    box['FTr'] = box['FT'] / box['FGA']

    # Keep only the relabelled team rows. pd.concat replaces
    # DataFrame.append, which no longer exists in pandas 2.x.
    append_data = pd.concat([
        box[box['Player'] == first_name],
        box[box['Player'] == second_name],
    ])

    append_data['id_t'] = f'{year}{month}{day}{team}'
    # Per the original author's note, this field identifies the home team.
    append_data['loc'] = team

    return append_data
