# Clean and Combine Player Data

Import the required libraries and set the pandas display preferences.

In [1]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import numpy as np

%matplotlib inline

# pandas display preferences: no cap on shown columns, at most 30 rows, precision 3
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 30),
    ('display.precision', 3),
):
    pd.set_option(option_name, option_value)


#### Connect to the Baseball-Reference Page of an Individual Batter

In [2]:
# Alternate test batter: Keith Hernandez
#url = 'http://www.baseball-reference.com/players/h/hernake01.shtml'

# Test batter with player id alonsyo01 (the original URL had a stray double slash
# after "players"; both sample URLs follow /players/<letter>/<id>.shtml)
url = 'http://www.baseball-reference.com/players/a/alonsyo01.shtml'
us = UserAgent()
# Send a randomly chosen browser User-Agent string with the request
user_agent = {'User-Agent': us.random}

# Fail fast on HTTP errors or a hung connection instead of parsing an error page
response = requests.get(url, headers=user_agent, timeout=30)
response.raise_for_status()
page = response.text
batter_page = BeautifulSoup(page, "lxml")

# Standard Batting Stats

### Write Standard Batting Stats to DF

In [3]:
def get_standard_batting(batter_page):
    """Read the ``batting_standard`` table of a parsed player page into a DataFrame.

    Parameters
    ----------
    batter_page : bs4.BeautifulSoup
        Parsed HTML of a player page (any object exposing a compatible
        ``find`` works).

    Returns
    -------
    pandas.DataFrame
        One row per table row that comes before the first row whose leading
        ``<th>`` contains ``'Yrs'`` (presumably the career-totals row --
        confirm against the live page). Column names are the ``<th>`` texts
        of the table's first row. Every value is kept as stripped text; no
        type conversion happens here.

    Raises
    ------
    ValueError
        If the page has no ``<table id="batting_standard">`` or that table
        contains no rows.
    """
    batter_standard_data = batter_page.find('table', id='batting_standard')
    if batter_standard_data is None:
        raise ValueError("No table with id 'batting_standard' found on the page")

    table_rows = batter_standard_data.find_all('tr')
    if not table_rows:
        raise ValueError("Table 'batting_standard' contains no rows")

    # The first row holds the column headers
    header_name_list = [cell.text.strip() for cell in table_rows[0].find_all('th')]

    data_list = []
    for row in table_rows[1:]:
        label_cell = row.find('th')
        if label_cell is None:
            # Rows without a leading <th> carry no season label; skip them
            # rather than crash on None
            continue
        label = label_cell.text.strip()
        if 'Yrs' in label:
            # Everything from this row on is summary data, not a season row
            break
        data_list.append([label] + [cell.text.strip() for cell in row.find_all('td')])

    return pd.DataFrame(data_list, columns=header_name_list)

### Clean Batter Stats DF

In [4]:
def clean_batter_df(df):
    """Build a typed copy of the scraped batting frame.

    The stat columns listed below are converted with ``pd.to_numeric``; the
    descriptive columns are carried over unchanged. Any other column of
    ``df`` is dropped, and the result lists the stat columns first, followed
    by the descriptive ones. ``df`` itself is not modified.

    Raises ``KeyError`` if an expected column is missing, and whatever
    ``pd.to_numeric`` raises for values it cannot parse.
    """
    stat_columns = (
        'Year', 'Age', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR',
        'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+',
        'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB',
    )
    label_columns = ('Tm', 'Lg', 'Pos', 'Awards')

    # dict insertion order fixes the output column order: stats, then labels
    cleaned = {name: pd.to_numeric(df[name]) for name in stat_columns}
    cleaned.update((name, df[name]) for name in label_columns)

    return pd.DataFrame(cleaned)

### Test Functions

In [6]:
# Scrape the standard batting table from the page fetched earlier, then
# convert its stat columns to numeric dtypes.
batting_df = get_standard_batting(batter_page)
batting_df_clean = clean_batter_df(batting_df)
# Bare last expression so the notebook renders the cleaned frame
batting_df_clean

Unnamed: 0,Year,Age,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Tm,Lg,Pos,Awards
0,2008,21,35,149,123,18,38,10,0,4,23,1,0,25,28,0.309,0.423,0.488,0.911,,60,0,0,0,1,0,CIN-min,"WRk,A+",,"WKI,SAR · HIWB,FLOR"
1,2009,22,123,511,442,61,122,34,1,12,81,1,2,61,76,0.276,0.36,0.439,0.799,,194,9,1,0,7,2,CIN-min,"A+,AA,Fal,FgW,Rk",,"SAR,CAR,PES,CGU,RDL · FLOR,SOUL,AZFL,PRWL,GULF"
2,2010,23,132,566,507,69,147,36,2,15,69,13,3,56,92,0.29,0.362,0.458,0.82,,232,17,2,0,1,4,CIN-min,"AAA,AA",,"LOU,CAR · IL,SOUL"
3,2010,23,22,29,29,2,6,2,0,0,3,0,0,0,10,0.207,0.207,0.276,0.483,28.0,8,1,0,0,0,0,CIN,NL,/3,
4,2011,24,91,409,358,46,106,24,4,12,56,6,5,46,60,0.296,0.374,0.486,0.86,,174,13,1,0,4,6,CIN-min,AAA,,LOU · IL
5,2011,24,47,98,88,9,29,4,0,5,15,0,0,10,21,0.33,0.398,0.545,0.943,153.0,48,2,0,0,0,0,CIN,NL,7/35,
6,2012,25,155,619,549,47,150,39,0,9,62,3,0,62,101,0.273,0.348,0.393,0.741,110.0,216,14,3,1,4,9,SDP,NL,*3,RoY-6
7,2013,26,4,14,14,1,8,0,0,0,2,0,1,0,0,0.571,0.571,0.571,1.143,,8,0,0,0,0,0,SDP-min,AAA,,TUC · PCL
8,2013,26,97,375,334,34,94,11,0,6,45,6,0,32,47,0.281,0.341,0.368,0.71,106.0,123,9,2,0,7,5,SDP,NL,3/457,
9,2014,27,7,25,25,1,7,0,0,1,5,0,0,0,3,0.28,0.28,0.4,0.68,,10,1,0,0,0,0,SDP-min,"AAA,Rk",,"ELP,PDS · PCL,ARIZ"


## Get Salary Stats

In [7]:
def get_salary_stats(batter_page):
    """Read the salary table of a parsed player page into a DataFrame.

    The function scans every ``div.placeholder`` element, looks at the node
    two siblings after it, and keeps the one whose string form contains
    ``'div_br-salaries'`` (presumably the table markup is stored in a
    non-rendered node such as an HTML comment -- confirm against the live
    page). That node is re-parsed with lxml and its first ``<table>`` is read.

    Parameters
    ----------
    batter_page : bs4.BeautifulSoup
        Parsed HTML of a player page.

    Returns
    -------
    pandas.DataFrame
        One row per table row whose leading ``<th>`` text is at most four
        characters long (a year); parsing stops at the first longer label.
        Column names are the ``<th>`` texts of the table's first row. All
        values are kept as stripped text.

    Raises
    ------
    ValueError
        If no node containing ``'div_br-salaries'`` is found, or the
        re-parsed markup holds no table rows.
    """
    salary_table_placeholder = None
    for placeholder in batter_page.find_all('div', class_='placeholder'):
        # Two sibling hops; either hop can run off the end of the parent
        skipped = placeholder.next_sibling
        candidate = skipped.next_sibling if skipped is not None else None
        if candidate is not None and 'div_br-salaries' in str(candidate):
            # No break: the last matching node wins, as in the original loop
            salary_table_placeholder = candidate

    if salary_table_placeholder is None:
        raise ValueError("No 'div_br-salaries' section found on the page")

    salary_data = BeautifulSoup(salary_table_placeholder, 'lxml').find('table')
    table_rows = salary_data.find_all('tr') if salary_data is not None else []
    if not table_rows:
        raise ValueError("The 'div_br-salaries' section contains no table rows")

    # The first row holds the column headers
    header_name_list = [cell.text.strip() for cell in table_rows[0].find_all('th')]

    data_list = []
    for row in table_rows[1:]:
        year_cell = row.find('th')
        if year_cell is None:
            # No leading <th> means no year label; skip instead of crashing
            continue
        year = year_cell.text.strip()
        if len(year) > 4:
            # Labels longer than a four-digit year mark the end of the
            # per-season rows
            break
        data_list.append([year] + [cell.text.strip() for cell in row.find_all('td')])

    return pd.DataFrame(data_list, columns=header_name_list)

### Clean Batter Salary DF

In [8]:
# Scrape the salary table from the player page fetched earlier; values are
# still raw text at this point (e.g. "$50,000", "?").
salary_df = get_salary_stats(batter_page)
# Bare last expression so the notebook renders the frame
salary_df

Unnamed: 0,Year,Age,Tm,Salary,SrvTm,Sources,Notes/Other Sources
0,2008,21,Cincinnati Reds,"$50,000",?,,
1,2009,22,Cincinnati Reds,"$400,000",?,,
2,2010,23,Cincinnati Reds,"$500,000",?,,
3,2011,24,Cincinnati Reds,"$600,000",0.051,,
4,2012,25,San Diego Padres,"$1,000,000",0.116,contracts,
5,2013,26,San Diego Padres,"$1,120,000",1.116,contracts,
6,2014,27,San Diego Padres,"$980,000",2.116,contracts,
7,2015,28,San Diego Padres,"$1,650,000",3.116,contracts,
8,2016,29,Oakland Athletics,"$2,650,000",4.116,,
9,2017,30,Oakland Athletics,"$4,000,000",5.116,,


# Find other tables

In [None]:
# NOTE(review): `batting_table` is first assigned in a later cell (under
# "All Player Data"); on a fresh kernel this cell raises NameError unless that
# cell has been run first.
h1 = batting_table.find('tr')

# Stripped text of every <th> in the table's first row
header_names = [item.text.strip() for item in h1.find_all('th')]

header_names

# All Player Data

In [None]:
# League-wide 2017 value-batting page
url = 'http://www.baseball-reference.com/leagues/MLB/2017-value-batting.shtml'
# `headers` must be a mapping; `us` is the UserAgent object itself, so build
# the header dict from it instead of passing it directly.
response = requests.get(url, headers={'User-Agent': us.random}, timeout=30)
# Fail fast on HTTP errors instead of parsing an error page
response.raise_for_status()

page = response.text
players_2017 = BeautifulSoup(page,"lxml")

In [None]:
tables = players_2017.find_all('table')
# Print one index per <table> found, to see how many tables the parsed page exposes
for table_index in range(len(tables)):
    print(table_index)

In [None]:
# Take the node two siblings after the first 'placeholder' div -- presumably a
# whitespace node followed by the node carrying the table markup (TODO confirm).
placeholder_div = players_2017.find('div', class_='placeholder')
stuff = placeholder_div.next_sibling.next_sibling
# Re-parse that node as its own document and keep its first <table>
new_soup = BeautifulSoup(stuff, 'lxml')
batting_table = new_soup.table

In [None]:
# List every <a> tag inside the batting table extracted from the league page
batting_table.find_all('a')
#test = pd.read_html(batting_table)

In [None]:
#players_2017.findAll(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id'] =='teams_value_batting')

In [None]:
# First <table> after the document root whose id is 'players_value_batting'
# (None when no such table exists in the parsed tree)
x = players_2017.find_next('table', id='players_value_batting')


In [None]:
# Only `class_` needs the trailing underscore (class is a reserved word);
# `id_` would filter on an attribute literally named "id_" and match nothing.
players_2017.find_all('div', id="all_players_value_batting")

In [None]:
# if needed: pip install requests
import requests

# Fetch and print the site's robots.txt.
# NOTE(review): this rebinds `url` and `response`, which earlier cells also assign.
url = 'http://www.baseball-reference.com/robots.txt'

response = requests.get(url)

print(response.text)

In [None]:
# NOTE(review): `soup` is not assigned anywhere in the visible notebook --
# confirm which parsed page this cell is meant to clean.
# Collect every <script> and <form> tag plus any element with id="footer"
# (find_all replaces the deprecated findAll alias, matching the other cells).
tags = soup.find_all(['script', 'form'])
tags.extend(soup.find_all(id="footer"))

# extract() detaches each collected tag from the parse tree
for tag in tags:
    tag.extract()