In [None]:
import re
import string
import warnings
from io import StringIO

import matplotlib.pyplot as plt
import multiprocess as mp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, MinMaxScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
warnings.filterwarnings("ignore")

Link to Baseball Reference (https://www.baseball-reference.com) 

Link to the players - https://www.baseball-reference.com/players.html

Link to Hank Aaron - https://www.baseball-reference.com/players/a/aaronha01.shtml

### Extracting the Player URLs

In [None]:
def grab_urls(base_url, alphabet):
    ''' function to grab all baseball reference player urls
    base_url - string (site root, without a trailing slash)
    alphabet - string (or any iterable) of letters; one index page is
               requested per letter
    
    returns urls (list of unique urls, in the order they were first seen)'''
    
    # compiled once instead of once per letter
    player_href = re.compile("^/players/[a-z]")
    urls = []
    # set used only for the "already collected?" test, so de-duplication stays
    # linear while `urls` keeps first-seen order
    seen = set()
    for letter in alphabet:
        url_let = base_url + '/' + letter + '/'
        r = requests.get(url_let)
        # name the parser explicitly (as the rest of the notebook does) so the
        # result does not depend on which optional parsers are installed
        soup = BeautifulSoup(r.text, 'html.parser')
        for link in soup.find_all('a', attrs={'href': player_href}):
            player_url = base_url + link.get('href')
            if player_url not in seen:
                seen.add(player_url)
                urls.append(player_url)
    return urls

#### Example of the iteration through base URLs to extract player links and create unique URLs

In [None]:
base_url = 'https://www.baseball-reference.com'
alphabet = string.ascii_lowercase
urls = []
# only the letter 'a' is visited here; grab_urls() walks the whole alphabet
for letter in ['a']:
    url_let = base_url + '/' + letter + '/'
    r = requests.get(url_let)
    # explicit parser, matching the later cells, so the links found do not
    # depend on which optional parsers are installed
    soup = BeautifulSoup(r.text, 'html.parser')
    for i, link in enumerate(soup.find_all('a', attrs={'href': re.compile("^/players/[a-z]")})):
        player_url = base_url + link.get('href')
        if player_url not in urls:
            urls.append(player_url)
            print(player_url)
        # stop after the first six matching anchors (i counts from 0)
        if i==5:
            break
    print('..........')

https://www.baseball-reference.com/players/a/aaronha01.shtml
https://www.baseball-reference.com/players/a/abreubo01.shtml
https://www.baseball-reference.com/players/a/abreujo02.shtml
https://www.baseball-reference.com/players/a/adamsma01.shtml
https://www.baseball-reference.com/players/a/adcocjo01.shtml
https://www.baseball-reference.com/players/a/aguilje01.shtml
..........


In [None]:
soup.find('div', itemtype='https://schema.org/Person')

In [None]:
## uncomment and run this cell to collect all of the player URLs

# urls = grab_urls(base_url, alphabet)

### Collecting Data From A Player URL

In [None]:
## for this example we will use the first player URL extracted above [Hank Aaron]

example = urls[0]
print(example)

https://www.baseball-reference.com/players/a/aaronha01.shtml


In [None]:
## to start we will look to grab a players information
page = requests.get(example)
soup = BeautifulSoup(page.content, 'html.parser')
match = soup.find('div', itemtype='https://schema.org/Person')
playername = match.find('h1').text 
print('Player_name: ', playername.strip())
meta = soup.find(id='meta')
# position stays NaN when the page has no "Position(s):" paragraph
position = np.nan
# used when no "Last Game:" paragraph is found
# NOTE(review): presumably the then-current season for active players - confirm
last_game = 2021
for para in meta.findAll('p'):
    if ':' in para.text:
        info = para.text.split(':')
        label = info[0].strip()
        if label=='Position' or label=='Positions':
            position = info[1].strip()
        elif label=='Last Game':
            # keep the first line of the value, then the text after its last comma
            temp = info[1].strip().split('\n')[0]
            year_text = temp.split(',')[-1].strip()
            try:
                last_game = int(year_text)
            except ValueError:
                # not a plain integer: keep the raw text
                last_game = year_text
            break
# position is already stripped above; calling .strip() here would raise
# AttributeError whenever it is still the NaN default
print('Position: ', position)
print('Last Game: ', last_game)

Player_name:  Henry Aaron
Position:  Rightfielder and First Baseman
Last Game:  1976


In [None]:
## next we will collect the summary career stats

career1 = soup.find('div', class_='p1')
keys = []
values = []
try:
    # <p> elements hold the numbers, <h4> elements hold the stat names
    for stats in career1.find_all('p'):
        values.append(stats.text)
    for stats in career1.find_all('h4'):
        keys.append(stats.text)
except AttributeError:
    # soup.find returned None: the page has no div with class 'p1'
    print('No career information for ', example)
    df_season_bat = pd.DataFrame()
    df_season_pitch = pd.DataFrame()
try:
    # when the pullout's first <p> reads '2021', keep every second value
    # NOTE(review): presumably each stat then shows a 2021 number followed by
    # the career number - confirm against a live page
    if soup.find('div', class_='stats_pullout').p.text=='2021':
        values = values[1::2]
except AttributeError:
    # the message previously printed an undefined name (`url`), which turned
    # this handler into a NameError; `example` is the page being parsed
    print('p.text error', example)
summary_stats= dict(zip(keys,values))
print('Summary stats for ', playername.strip(),summary_stats)

Summary stats for  Henry Aaron {'WAR': '143.1', 'AB': '12364', 'H': '3771', 'HR': '755', 'BA': '.305'}


In [None]:
## first thing to try with pandas to pull in the statistics is read_html as it will lower the amount
## of work needed to scrape all of the statistics

print(len(pd.read_html(example)))
pd.read_html(example)

1


[             Year            Age             Tm             Lg     G     PA  \
 0            1952             18        BSN-min              C    87    345   
 1            1953             19        MLN-min              A   137    574   
 2            1954             20            MLN             NL   122    509   
 3            1955             21            MLN             NL   153    665   
 4            1956             22            MLN             NL   153    660   
 5            1957             23            MLN             NL   151    675   
 6            1958             24            MLN             NL   153    664   
 7            1959             25            MLN             NL   154    693   
 8            1960             26            MLN             NL   153    664   
 9            1961             27            MLN             NL   155    671   
 10           1962             28            MLN             NL   156    667   
 11           1963             29       

In [None]:
## because the len of the list created from read_html is 1, we know that only 1 table exists

df = pd.read_html(example)[0]
df

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,1952,18,BSN-min,C,87,345,345,,116,19,4,9,,,,,,.336,.336,.493,.829,,170,,,,,,,EAU · NORL
1,1953,19,MLN-min,A,137,574,574,,208,36,14,22,,,,,,.362,.362,.589,.951,,338,,,,,,,JCK · SALL
2,1954,20,MLN,NL,122,509,468,58,131,27,6,13,69,2,2,28,39,.280,.322,.447,.769,104,209,13,3,6,4,0,*79/H,RoY-4
3,1955,21,MLN,NL,153,665,602,105,189,37,9,27,106,3,1,49,61,.314,.366,.540,.906,141,325,20,3,7,4,5,*974/H,"AS,MVP-9"
4,1956,22,MLN,NL,153,660,609,106,200,34,14,26,92,2,4,37,54,.328,.365,.558,.923,151,340,21,2,5,7,6,*9/H,"AS,MVP-3"
5,1957,23,MLN,NL,151,675,615,118,198,27,6,44,132,1,1,57,58,.322,.378,.600,.978,166,369,13,0,0,3,15,*98/H,"AS,MVP-1"
6,1958,24,MLN,NL,153,664,601,109,196,34,4,30,95,4,1,59,49,.326,.386,.546,.931,153,328,21,1,0,3,16,*98,"AS,MVP-3,GG"
7,1959,25,MLN,NL,154,693,629,116,223,46,7,39,123,8,0,51,54,.355,.401,.636,1.037,183,400,19,4,0,9,17,*98/5,"AS,AS,MVP-3,GG"
8,1960,26,MLN,NL,153,664,590,102,172,20,11,40,126,16,7,60,63,.292,.352,.566,.919,156,334,8,2,0,12,13,*9/84,"AS,AS,MVP-11,GG"
9,1961,27,MLN,NL,155,671,603,115,197,39,10,34,120,21,9,56,64,.327,.381,.594,.974,163,358,16,2,1,9,20,*89/5H,"AS,AS,MVP-8"


#### The four tables that we wanted to extract are:
- all_batting_standard
- all_batting_value
- all_batting_advanced_front
- all_standard_fielding

`pd.read_html` only returns the first of these tables. The others are stored inside HTML comments on the page, so in order to extract them we have to write some code ourselves.

In [None]:
def get_table(soup, tableName):
    ''' function to extract one statistics table from a parsed player page
    soup - output from Beautiful Soup
    tableName - string (id of the div wrapping the table,
                e.g. 'all_batting_standard')
    
    returns df

    The table is first looked for as live markup inside the div whose id is
    tableName and parsed with pandas. If it is not there, or cannot be parsed,
    the copy stored in the HTML comment that follows the div's `.placeholder`
    element is used instead; in that case every cell is a string and the
    first row of the table becomes the column labels.

    raises AttributeError when the fallback finds no placeholder element,
    StopIteration when no comment follows it, and IndexError when the
    comment contains no table rows'''
    id_content = soup.find('div', id=tableName)
    table = id_content.find('table') if id_content is not None else None
    if table is not None:
        try:
            # read_html wants a path or file-like object; wrap the markup in
            # StringIO instead of passing literal HTML (deprecated in pandas)
            return pd.read_html(StringIO(str(table)))[0]
        except Exception:
            # best effort: any parsing problem falls through to the
            # comment-embedded copy below
            pass

    placeholder = soup.select_one('#'+tableName+' .placeholder')
    comment = next(elem for elem in placeholder.next_siblings if isinstance(elem, Comment))
    table = BeautifulSoup(comment, 'html.parser')
    tableData = [[cell.text for cell in row.find_all(["th","td"])]
                            for row in table.find_all("tr")]
    # rows may differ in length; DataFrame pads the short ones, so the header
    # is promoted only after the frame has been built
    df = pd.DataFrame(tableData)
    df.columns = df.iloc[0,:]
    df.drop(index=0,inplace=True)
    df.reset_index(inplace=True, drop=True)        
    return df

In [None]:
page = requests.get(example)
soup = BeautifulSoup(page.content, 'html.parser')

get_table(soup, 'all_batting_standard')

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,1952,18,BSN-min,C,87,345,345,,116,19,4,9,,,,,,.336,.336,.493,.829,,170,,,,,,,EAU · NORL
1,1953,19,MLN-min,A,137,574,574,,208,36,14,22,,,,,,.362,.362,.589,.951,,338,,,,,,,JCK · SALL
2,1954,20,MLN,NL,122,509,468,58,131,27,6,13,69,2,2,28,39,.280,.322,.447,.769,104,209,13,3,6,4,0,*79/H,RoY-4
3,1955,21,MLN,NL,153,665,602,105,189,37,9,27,106,3,1,49,61,.314,.366,.540,.906,141,325,20,3,7,4,5,*974/H,"AS,MVP-9"
4,1956,22,MLN,NL,153,660,609,106,200,34,14,26,92,2,4,37,54,.328,.365,.558,.923,151,340,21,2,5,7,6,*9/H,"AS,MVP-3"
5,1957,23,MLN,NL,151,675,615,118,198,27,6,44,132,1,1,57,58,.322,.378,.600,.978,166,369,13,0,0,3,15,*98/H,"AS,MVP-1"
6,1958,24,MLN,NL,153,664,601,109,196,34,4,30,95,4,1,59,49,.326,.386,.546,.931,153,328,21,1,0,3,16,*98,"AS,MVP-3,GG"
7,1959,25,MLN,NL,154,693,629,116,223,46,7,39,123,8,0,51,54,.355,.401,.636,1.037,183,400,19,4,0,9,17,*98/5,"AS,AS,MVP-3,GG"
8,1960,26,MLN,NL,153,664,590,102,172,20,11,40,126,16,7,60,63,.292,.352,.566,.919,156,334,8,2,0,12,13,*9/84,"AS,AS,MVP-11,GG"
9,1961,27,MLN,NL,155,671,603,115,197,39,10,34,120,21,9,56,64,.327,.381,.594,.974,163,358,16,2,1,9,20,*89/5H,"AS,AS,MVP-8"


In [None]:
get_table(soup, 'all_batting_value')

Unnamed: 0,Year,Age,Tm,Lg,G,PA,Rbat,Rbaser,Rdp,Rfield,Rpos,RAA,WAA,Rrep,RAR,WAR,waaWL%,162WL%,oWAR,dWAR,oRAR,Salary,Pos,Awards
0,1954,20,MLN,NL,122,509,5,-2,-1,-1,-5,-4,-0.5,19,16,1.4,.497,.498,1.6,-0.7,17,"$6,000",*79/H,RoY-4
1,1955,21,MLN,NL,153,665,37,2,-1,7,-5,40,3.8,25,65,6.2,.527,.525,5.8,0.0,58,"$10,000",*974/H,"AS,MVP-9"
2,1956,22,MLN,NL,153,660,41,0,-1,15,-7,47,4.6,24,72,7.2,.533,.531,6.0,0.7,57,"$17,500",*9/H,"AS,MVP-3"
3,1957,23,MLN,NL,151,675,53,2,0,4,-4,55,5.5,25,80,8.0,.538,.535,7.8,-0.1,75,"$22,500",*98/H,"AS,MVP-1"
4,1958,24,MLN,NL,153,664,44,4,-2,8,-6,48,4.8,25,72,7.3,.532,.531,6.6,0.2,64,"$35,000",*98,"AS,MVP-3,GG"
5,1959,25,MLN,NL,154,693,64,6,0,-4,-7,60,6.1,25,85,8.6,.540,.538,9.1,-1.1,89,"$35,000",*98/5,"AS,AS,MVP-3,GG"
6,1960,26,MLN,NL,153,664,41,2,2,14,-7,52,5.5,25,77,8.0,.537,.535,6.5,0.8,63,"$45,000",*9/84,"AS,AS,MVP-11,GG"
7,1961,27,MLN,NL,155,671,44,4,1,23,-4,69,6.9,25,94,9.5,.545,.543,7.1,2.0,71,"$45,000",*89/5H,"AS,AS,MVP-8"
8,1962,28,MLN,NL,156,667,57,0,0,6,-4,59,6.0,25,84,8.5,.538,.537,7.9,0.3,78,"$47,500",*89/H3,"AS,AS,MVP-6"
9,1963,29,MLN,NL,161,714,61,7,0,-5,-7,57,6.4,25,82,9.1,.540,.539,9.6,-1.3,86,"$53,000",*9,"AS,MVP-3"


In [None]:
get_table(soup, 'all_batting_advanced_front')

Unnamed: 0,Unnamed: 1,Unnamed: 2,Batting,Batting Ratios,Win Probability,Baserunning,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,NaN.7,NaN.8,NaN.9,NaN.10,NaN.11
0,Year,Age,Tm,Lg,PA,rOBA,Rbat+,BAbip,ISO,HR%,SO%,BB%,WPA,cWPA,RE24,RS%,SB%,XBT%
1,1954,20,MLN,NL,509,.356,109,.281,.167,2.6%,7.7%,5.5%,1.0,1.1%,7.6,30%,50%,51%
2,1955,21,MLN,NL,665,.406,149,.313,.226,4.1%,9.2%,7.4%,1.6,0.8%,31.7,36%,75%,47%
3,1956,22,MLN,NL,660,.402,157,.325,.230,3.9%,8.2%,5.6%,3.9,22.6%,23.6,38%,33%,53%
4,1957,23,MLN,NL,675,.428,173,.298,.278,6.5%,8.6%,8.4%,7.0,19.4%,73.1,35%,50%,51%
5,1958,24,MLN,NL,664,.416,160,.316,.220,4.5%,7.4%,8.9%,3.4,13.7%,37.2,35%,80%,57%
6,1959,25,MLN,NL,693,.448,184,.338,.281,5.6%,7.8%,7.4%,7.8,24.2%,69.9,32%,100%,60%
7,1960,26,MLN,NL,664,.406,158,.265,.275,6.0%,9.5%,9.0%,4.1,7.0%,56.4,32%,70%,72%
8,1961,27,MLN,NL,671,.415,158,.317,.267,5.1%,9.5%,8.4%,6.7,3.7%,50.5,37%,70%,54%
9,1962,28,MLN,NL,667,.448,176,.304,.296,6.8%,10.9%,9.9%,7.1,1.2%,60.0,38%,68%,51%


In [None]:
get_table(soup, 'all_standard_fielding')

Unnamed: 0,Year,Age,Tm,Pos,Lg,G,GS,CG,Inn,Ch,PO,A,E,DP,Fld%,Rtot,Rtot/yr,RF/9,RF/G,lgFld%,lgRF9,lgRFG,Awards
0,1954,20,MLN,OF,NL,116,113,111,1031.0,235,223,5,7,0,.970,-1,-1,1.99,1.97,.975,2.36,2.34,RoY-4
1,1954,20,MLN,LF,NL,105,102,100,924.1,200,191,3,6,0,.970,3,4,1.89,1.85,.970,2.17,2.15,RoY-4
2,1954,20,MLN,RF,NL,11,11,11,106.2,17,14,2,1,1,.941,-4,-45,1.35,1.45,.969,1.95,1.94,RoY-4
3,1955,21,MLN,OF,NL,126,125,113,1109.2,272,254,9,9,2,.967,7,8,2.13,2.09,.977,2.30,2.28,"AS,MVP-9"
4,1955,21,MLN,RF,NL,104,102,92,899.2,216,200,8,8,2,.963,7,9,2.08,2.00,.975,1.98,1.96,"AS,MVP-9"
5,1955,21,MLN,LF,NL,30,23,21,210.0,48,46,1,1,0,.979,0,0,2.01,1.57,.972,2.04,2.02,"AS,MVP-9"
6,1955,21,MLN,2B,NL,27,26,25,236.1,176,86,84,6,23,.966,0,1,6.47,6.30,.974,5.48,5.43,"AS,MVP-9"
7,1956,22,MLN,OF,NL,152,152,148,1349.1,346,316,17,13,4,.962,15,13,2.22,2.19,.977,2.28,2.25,"AS,MVP-3"
8,1956,22,MLN,RF,NL,152,152,148,1349.1,345,315,17,13,4,.962,15,13,2.21,2.18,.974,1.99,1.96,"AS,MVP-3"
9,1957,23,MLN,OF,NL,150,150,144,1356.1,361,346,9,6,0,.983,4,4,2.36,2.37,.980,2.29,2.31,"AS,MVP-1"


In [None]:
## if we wanted to extract the pitcher stats, we would run these commands

# get_table(soup, 'all_pitching_standard')
# get_table(soup, 'all_pitching_value')
# get_table(soup, 'all_pitching_advanced_front')

### Now that we can extract the statistics we might need later on, it's time to clean the data

In [None]:
def df_cleaner(df, table_type=None, link=None):
    ''' function to clean one scraped season table
    df - dataframe returned by get_table (left unmodified)
    table_type - string, the table id passed to get_table; selects the
                 table-specific clean-up at the end (None skips that step)
    link - optional string, only used in the "duplicate year" message
    
    returns df1 (cleaned dataframe)

    Steps: promote the first row to header for the "advanced" tables, keep
    rows whose Year is an integer and whose Lg is NL, AL or MLB, strip '$',
    ',' and '%' decoration, drop rows with 0 in PA/GS/IP, turn blank cells
    into NaN, convert each column to int, else float, else str, and finally
    run the helper that matches table_type.'''

    # work on a copy so the caller's frame is untouched and re-running the
    # cell on the same input gives the same result
    df = df.copy()

    if table_type == 'all_batting_advanced_front' or table_type == 'all_pitching_advanced_front':
        # these tables carry a grouping header; the real labels are in row 0
        df.columns = df.iloc[0,:]
        df = df.drop(index=0).reset_index(drop=True)

    def yearConvert(x):
        try:
            return int(x)
        except (ValueError, TypeError, OverflowError):
            return np.nan
    
    df['Year'] = df['Year'].apply(yearConvert)
    df1 = df[df['Year'].notna()]
    df1 = df1.loc[:, ~df1.columns.duplicated()]
    df1 = df1[df1['Lg'].isin(['NL', 'AL', 'MLB'])].copy()

    # Salary: drop '$' and thousands separators, keep the first run of digits.
    # (The earlier pattern '[$]' with regex=False looked for that literal
    # three-character text and therefore never removed anything.)
    if 'Salary' in df1.columns and not pd.api.types.is_numeric_dtype(df1['Salary']):
        df1['Salary'] = (df1['Salary']
                         .str.replace('$', '', regex=False)
                         .str.replace(',', '', regex=False)
                         .str.extract(r'(\d+)', expand=False))
    
    for col in df1.columns:
        if not isinstance(col, str):
            # padded header cells give non-string labels; nothing to match
            continue
        if '%' in col:
            # only columns whose *name* contains '%' are stripped
            try:
                df1[col] = df1[col].str.replace('%', '', regex=False)
            except AttributeError:
                # column is already numeric, so there is no .str accessor
                pass
        elif col == 'PA' or col == 'GS' or col== 'IP':
            # compare as text so both '0' (comment-parsed tables) and integer
            # 0 (read_html-parsed tables) are dropped
            df1 = df1[df1[col].astype(str) != '0'].copy()
    df1 = df1.replace(r'^\s*$', np.nan, regex=True)
                
    for col in df1.columns:
        as_text = df1[col].astype('str')
        # assign the whole column (not df1.loc[:, col]) so the dtype really
        # changes: since pandas 2.0 a .loc[:, col] assignment writes into the
        # existing array and keeps its old dtype
        try:
            df1[col] = as_text.astype('int')
        except Exception:
            try:
                df1[col] = as_text.astype('float')
            except Exception:
                df1[col] = as_text
                
    if table_type == 'all_batting_standard':
        df1 = df1.rename(columns= {'Pos':'Pos_bat', 'G':'G_bat'})
        df1 = clean_batting_standard(df1)
    elif table_type == 'all_batting_advanced_front' or table_type == 'all_pitching_standard' or table_type == 'all_pitching_advanced_front':        
        df1 = clean_batting_standard(df1)
    elif table_type == 'all_batting_value' or table_type == 'all_pitching_value':
        # keep the numeric columns only; selecting by 'number' also works when
        # text columns are not stored with the object dtype
        df1 = df1.select_dtypes(include='number')
        if 'Pos' in df1.columns:
            df1 = df1.rename(columns= {'Pos':'Pos_bat'})
        df1 = groupby_sum_mean(df1)
    elif table_type == 'all_standard_fielding':
        df1 = df1.loc[:,~df1.columns.duplicated()]
        df1 = clean_standard_feilding(df1)
    
    
    if df1['Year'].duplicated().any():
        # `link` used to be read from an undefined global here
        print('Duplicate year values exist in table ', table_type, ' for link: ', link)
            
    return df1

### support functions used by df_cleaner
def clean_batting_standard(df):
    ''' function to reduce a season table to one row per year
    df - dataframe with Year, Tm and Lg columns (gains a 'marker' column)

    returns a new dataframe with a fresh 0..n-1 index

    Pass 1 keeps years that have a single row, plus every 'TOT' row.
    Pass 2 recounts the rows per year and keeps single-row years plus rows
    whose Lg is 'MLB'.'''

    def rows_per_year(frame):
        # number of non-null Tm entries in each row's Year group
        return frame.groupby('Year')['Tm'].transform('count')

    # written onto the caller's frame, exactly as before
    df['marker'] = rows_per_year(df)
    first_pass = df[(df['Tm'] == 'TOT') | (df['marker'] == 1)]

    first_pass = first_pass.assign(marker=rows_per_year(first_pass))
    keep = (first_pass['Lg'] == 'MLB') | (first_pass['marker'] == 1)
    return first_pass[keep].reset_index(drop=True)

def clean_standard_feilding(df):
    ''' function to reduce the fielding table to one row per year and add a
    games-weighted position score
    df - dataframe with Year, Tm, Pos and numeric G columns (left unmodified)

    returns a dataframe with one row per Year (the row with the most games)
    plus Pos_2 (position weight), Pos_adjust (weight * G) and Pos_mean
    (sum of Pos_adjust / sum of G over the year's non-'TOT' rows)'''
    #pos_adjust = {'P': 0, 'C': 240, '1B':12, '2B': 132, '3B':84, 'SS':168, 'LF':48, 'CF':48, 'RF':48 ,'OF':48, 'DH': 0} 
    pos_adjust = {'P': 0, 'C': 20, '1B':1, '2B': 11, '3B':7, 'SS':14, 'LF':3, 'CF':5, 'RF':4 , 'OF':4,'DH': 0} 
    # sort_values returns a new frame, so the caller's data is never modified
    df = df.sort_values('G', ascending=False)
    # map (not replace): an unlisted position becomes NaN instead of staying a
    # string that would then be "multiplied" by G
    df['Pos_2'] = df['Pos'].map(pos_adjust)
    df['Pos_adjust'] = df['Pos_2']*df['G']

    # True when some season lists OF together with both LF and RF, i.e. the
    # OF row repeats games already counted under the individual fields
    split_outfield = any(
        {'OF', 'RF', 'LF'} <= set(season['Pos'])
        for _, season in df.groupby('Year')
    )
    not_total = df['Tm']!='TOT'
    if split_outfield:
        df2 = df[not_total & (df['Pos']!= 'OF')]
    else:
        # OF is the only outfield information available, so keep it.
        # An unconditional reassignment used to follow this branch and always
        # dropped the OF rows, which made the test above dead code and removed
        # OF-only seasons from the result through the inner merge below.
        df2 = df[not_total]
    # rows with an unknown position carry no weight; leave them out of both sums
    df2 = df2[df2['Pos_adjust'].notna()]
    df2 = df2[['Year', 'G', 'Pos_adjust']].groupby(['Year']).agg('sum').reset_index()
    df2['Pos_mean'] = df2['Pos_adjust']/df2['G']

    # rows are ordered by G descending, so the first row per year is the one
    # with the most games
    df = df.drop_duplicates(['Year']).sort_values('Year').reset_index(drop=True)
    df = pd.merge(df,df2[['Year', 'Pos_mean']], on='Year')
    return df

def clean_batting_value(df):
    ''' function to keep, for every Year, the row with the largest G_bat
    df - dataframe with Year and G_bat columns (modified in place)

    returns df (the same object), ordered by Year with a 0..n-1 index'''
    # largest G_bat first, so the de-duplication below keeps that row
    df.sort_values(by='G_bat', ascending=False, inplace=True)
    df.drop_duplicates(subset=['Year'], inplace=True)
    # back to chronological order with a clean index
    df.sort_values(by='Year', inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def clean_career(df, player_id):
    ''' function to turn a table into a player-keyed one
    df - dataframe containing Year, Age, Tm and Lg columns (modified in place)
    player_id - value written to every row of the new Player column

    returns df (the same object)'''
    season_columns = ['Year', 'Age', 'Tm', 'Lg']
    df.drop(columns=season_columns, inplace=True)
    df['Player'] = player_id
    return df

def groupby_sum_mean(df, force=None):
    ''' function to collapse a table to one row per Year
    df - dataframe with a Year column
    force - optional dict {column: aggregation} overriding the defaults

    returns a dataframe grouped by Year, where columns whose name contains
    '%' are averaged and every other column is summed'''
    # force defaults to None rather than a shared mutable {}
    mod = {
        # str(col) keeps the '%' test from failing on non-string labels
        col: 'mean' if '%' in str(col) else 'sum'
        for col in df.columns
        if col != 'Year'
    }
    if force:
        mod.update(force)
    return df.groupby('Year').agg(mod).reset_index()

In [None]:
## original scraped dataframe

df1 = get_table(soup, 'all_batting_standard')
df1

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,1952,18,BSN-min,C,87,345,345,,116,19,4,9,,,,,,.336,.336,.493,.829,,170,,,,,,,EAU · NORL
1,1953,19,MLN-min,A,137,574,574,,208,36,14,22,,,,,,.362,.362,.589,.951,,338,,,,,,,JCK · SALL
2,1954,20,MLN,NL,122,509,468,58,131,27,6,13,69,2,2,28,39,.280,.322,.447,.769,104,209,13,3,6,4,0,*79/H,RoY-4
3,1955,21,MLN,NL,153,665,602,105,189,37,9,27,106,3,1,49,61,.314,.366,.540,.906,141,325,20,3,7,4,5,*974/H,"AS,MVP-9"
4,1956,22,MLN,NL,153,660,609,106,200,34,14,26,92,2,4,37,54,.328,.365,.558,.923,151,340,21,2,5,7,6,*9/H,"AS,MVP-3"
5,1957,23,MLN,NL,151,675,615,118,198,27,6,44,132,1,1,57,58,.322,.378,.600,.978,166,369,13,0,0,3,15,*98/H,"AS,MVP-1"
6,1958,24,MLN,NL,153,664,601,109,196,34,4,30,95,4,1,59,49,.326,.386,.546,.931,153,328,21,1,0,3,16,*98,"AS,MVP-3,GG"
7,1959,25,MLN,NL,154,693,629,116,223,46,7,39,123,8,0,51,54,.355,.401,.636,1.037,183,400,19,4,0,9,17,*98/5,"AS,AS,MVP-3,GG"
8,1960,26,MLN,NL,153,664,590,102,172,20,11,40,126,16,7,60,63,.292,.352,.566,.919,156,334,8,2,0,12,13,*9/84,"AS,AS,MVP-11,GG"
9,1961,27,MLN,NL,155,671,603,115,197,39,10,34,120,21,9,56,64,.327,.381,.594,.974,163,358,16,2,1,9,20,*89/5H,"AS,AS,MVP-8"


In [None]:
## cleaned dataframe
df1 = df_cleaner(df1, 'all_batting_standard')
df1

Unnamed: 0,Year,Age,Tm,Lg,G_bat,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos_bat,Awards,marker
0,1954.0,20,MLN,NL,122,509,468,58,131,27,6,13,69,2,2,28,39,0.28,0.322,0.447,0.769,104,209,13,3,6,4,0,*79/H,RoY-4,1
1,1955.0,21,MLN,NL,153,665,602,105,189,37,9,27,106,3,1,49,61,0.314,0.366,0.54,0.906,141,325,20,3,7,4,5,*974/H,"AS,MVP-9",1
2,1956.0,22,MLN,NL,153,660,609,106,200,34,14,26,92,2,4,37,54,0.328,0.365,0.558,0.923,151,340,21,2,5,7,6,*9/H,"AS,MVP-3",1
3,1957.0,23,MLN,NL,151,675,615,118,198,27,6,44,132,1,1,57,58,0.322,0.378,0.6,0.978,166,369,13,0,0,3,15,*98/H,"AS,MVP-1",1
4,1958.0,24,MLN,NL,153,664,601,109,196,34,4,30,95,4,1,59,49,0.326,0.386,0.546,0.931,153,328,21,1,0,3,16,*98,"AS,MVP-3,GG",1
5,1959.0,25,MLN,NL,154,693,629,116,223,46,7,39,123,8,0,51,54,0.355,0.401,0.636,1.037,183,400,19,4,0,9,17,*98/5,"AS,AS,MVP-3,GG",1
6,1960.0,26,MLN,NL,153,664,590,102,172,20,11,40,126,16,7,60,63,0.292,0.352,0.566,0.919,156,334,8,2,0,12,13,*9/84,"AS,AS,MVP-11,GG",1
7,1961.0,27,MLN,NL,155,671,603,115,197,39,10,34,120,21,9,56,64,0.327,0.381,0.594,0.974,163,358,16,2,1,9,20,*89/5H,"AS,AS,MVP-8",1
8,1962.0,28,MLN,NL,156,667,592,127,191,28,6,45,128,15,7,66,73,0.323,0.39,0.618,1.008,170,366,14,3,0,6,14,*89/H3,"AS,AS,MVP-6",1
9,1963.0,29,MLN,NL,161,714,631,121,201,29,4,44,130,31,5,78,94,0.319,0.391,0.586,0.977,179,370,11,0,0,5,18,*9,"AS,MVP-3",1


In [None]:
df2 = get_table(soup, 'all_batting_value')
df2 = df_cleaner(df2, 'all_batting_value')
df2

Unnamed: 0,Year,Age,G,PA,Rbat,Rbaser,Rdp,Rfield,Rpos,RAA,WAA,Rrep,RAR,WAR,waaWL%,162WL%,oWAR,dWAR,oRAR,Salary
0,1954.0,20,122,509,5,-2,-1,-1,-5,-4,-0.5,19,16,1.4,0.497,0.498,1.6,-0.7,17,6000
1,1955.0,21,153,665,37,2,-1,7,-5,40,3.8,25,65,6.2,0.527,0.525,5.8,0.0,58,10000
2,1956.0,22,153,660,41,0,-1,15,-7,47,4.6,24,72,7.2,0.533,0.531,6.0,0.7,57,17500
3,1957.0,23,151,675,53,2,0,4,-4,55,5.5,25,80,8.0,0.538,0.535,7.8,-0.1,75,22500
4,1958.0,24,153,664,44,4,-2,8,-6,48,4.8,25,72,7.3,0.532,0.531,6.6,0.2,64,35000
5,1959.0,25,154,693,64,6,0,-4,-7,60,6.1,25,85,8.6,0.54,0.538,9.1,-1.1,89,35000
6,1960.0,26,153,664,41,2,2,14,-7,52,5.5,25,77,8.0,0.537,0.535,6.5,0.8,63,45000
7,1961.0,27,155,671,44,4,1,23,-4,69,6.9,25,94,9.5,0.545,0.543,7.1,2.0,71,45000
8,1962.0,28,156,667,57,0,0,6,-4,59,6.0,25,84,8.5,0.538,0.537,7.9,0.3,78,47500
9,1963.0,29,161,714,61,7,0,-5,-7,57,6.4,25,82,9.1,0.54,0.539,9.6,-1.3,86,53000


In [None]:
df3 = get_table(soup, 'all_batting_advanced_front')
df3 = df_cleaner(df3, 'all_batting_advanced_front')
df3

Unnamed: 0,Year,Age,Tm,Lg,PA,rOBA,Rbat+,BAbip,ISO,HR%,SO%,BB%,WPA,cWPA,RE24,RS%,SB%,XBT%,marker
0,1954.0,20,MLN,NL,509,0.356,109,0.281,0.167,2.6,7.7,5.5,1.0,1.1%,7.6,30,50,51,1
1,1955.0,21,MLN,NL,665,0.406,149,0.313,0.226,4.1,9.2,7.4,1.6,0.8%,31.7,36,75,47,1
2,1956.0,22,MLN,NL,660,0.402,157,0.325,0.23,3.9,8.2,5.6,3.9,22.6%,23.6,38,33,53,1
3,1957.0,23,MLN,NL,675,0.428,173,0.298,0.278,6.5,8.6,8.4,7.0,19.4%,73.1,35,50,51,1
4,1958.0,24,MLN,NL,664,0.416,160,0.316,0.22,4.5,7.4,8.9,3.4,13.7%,37.2,35,80,57,1
5,1959.0,25,MLN,NL,693,0.448,184,0.338,0.281,5.6,7.8,7.4,7.8,24.2%,69.9,32,100,60,1
6,1960.0,26,MLN,NL,664,0.406,158,0.265,0.275,6.0,9.5,9.0,4.1,7.0%,56.4,32,70,72,1
7,1961.0,27,MLN,NL,671,0.415,158,0.317,0.267,5.1,9.5,8.4,6.7,3.7%,50.5,37,70,54,1
8,1962.0,28,MLN,NL,667,0.448,176,0.304,0.296,6.8,10.9,9.9,7.1,1.2%,60.0,38,68,51,1
9,1963.0,29,MLN,NL,714,0.451,188,0.315,0.268,6.2,13.2,10.9,6.7,3.3%,84.8,33,86,60,1


In [None]:
df4 = get_table(soup, 'all_standard_fielding')
df4 = df_cleaner(df4, 'all_standard_fielding')
df4

Unnamed: 0,Year,Age,Tm,Pos,Lg,G,GS,CG,Inn,Ch,PO,A,E,DP,Fld%,Rtot,Rtot/yr,RF/9,RF/G,lgFld%,lgRF9,lgRFG,Awards,Pos_2,Pos_adjust,Pos_mean
0,1954.0,20,MLN,OF,NL,116,113,111.0,1031.0,235.0,223.0,5.0,7.0,0.0,0.97,-1.0,-1.0,1.99,1.97,0.975,2.36,2.34,RoY-4,4,464,3.094828
1,1955.0,21,MLN,OF,NL,126,125,113.0,1109.2,272.0,254.0,9.0,9.0,2.0,0.967,7.0,8.0,2.13,2.09,0.977,2.3,2.28,"AS,MVP-9",4,504,4.987578
2,1956.0,22,MLN,OF,NL,152,152,148.0,1349.1,346.0,316.0,17.0,13.0,4.0,0.962,15.0,13.0,2.22,2.19,0.977,2.28,2.25,"AS,MVP-3",4,608,4.0
3,1957.0,23,MLN,OF,NL,150,150,144.0,1356.1,361.0,346.0,9.0,6.0,0.0,0.983,4.0,4.0,2.36,2.37,0.98,2.29,2.31,"AS,MVP-1",4,600,4.45098
4,1958.0,24,MLN,OF,NL,153,153,147.0,1361.0,322.0,305.0,12.0,5.0,0.0,0.984,8.0,7.0,2.1,2.07,0.98,2.26,2.25,"AS,MVP-3,GG",4,612,4.246835
5,1959.0,25,MLN,OF,NL,152,149,142.0,1326.2,278.0,261.0,12.0,5.0,3.0,0.982,-5.0,-4.0,1.85,1.8,0.977,2.21,2.19,"AS,AS,MVP-3,GG",4,608,4.17284
6,1960.0,26,MLN,OF,NL,153,153,149.0,1376.1,339.0,320.0,13.0,6.0,6.0,0.982,14.0,12.0,2.18,2.18,0.978,2.2,2.2,"AS,AS,MVP-11,GG",4,612,4.032258
7,1961.0,27,MLN,OF,NL,154,154,144.0,1367.0,397.0,377.0,13.0,7.0,3.0,0.982,23.0,21.0,2.57,2.53,0.977,2.16,2.14,"AS,AS,MVP-8",4,616,4.515528
8,1962.0,28,MLN,OF,NL,153,153,147.0,1339.2,358.0,340.0,11.0,7.0,1.0,0.98,6.0,5.0,2.36,2.29,0.974,2.07,2.05,"AS,AS,MVP-6",4,612,4.538961
9,1963.0,29,MLN,RF,NL,161,161,157.0,1446.0,282.0,266.0,11.0,5.0,1.0,0.982,-5.0,-4.0,1.72,1.72,0.976,1.83,1.82,"AS,MVP-3",4,644,4.0


#### Time to merge the dataframes together

In [None]:
def df_merger(df1,df2):
    ''' function to append the columns of df2 that df1 does not have yet
    df1 - dataframe; its columns decide the join key ('Year' if present,
          otherwise 'Player')
    df2 - dataframe containing the same key column

    returns df (inner merge on the key; columns present in both frames are
    taken from df1 only)

    raises ValueError when df1 has neither key column or df2 lacks the key'''
    key = next((name for name in ('Year', 'Player') if name in df1.columns), None)
    if key is None:
        # previously this fell through both branches and failed on an
        # unassigned local
        raise ValueError("df1 must contain a 'Year' or 'Player' column")
    if key not in df2.columns:
        raise ValueError("df2 has no '" + key + "' column to merge on")

    # columns that exist in both frames, apart from the key itself
    common = (set(df2.columns) & set(df1.columns)) - {key}
    # df2's own column order is kept
    uncommon = [x for x in df2.columns if x not in common]
    return pd.merge(df1, df2[uncommon], on=key)

In [None]:
dfFinal = df_merger(df1, df2)
dfFinal = df_merger(dfFinal, df3)
dfFinal = df_merger(dfFinal, df4)
dfFinal

Unnamed: 0,Year,Age,Tm,Lg,G_bat,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos_bat,Awards,marker,G,Rbat,Rbaser,Rdp,Rfield,Rpos,RAA,WAA,Rrep,RAR,WAR,waaWL%,162WL%,oWAR,dWAR,oRAR,Salary,rOBA,Rbat+,BAbip,ISO,HR%,SO%,BB%,WPA,cWPA,RE24,RS%,SB%,XBT%,Pos,GS,CG,Inn,Ch,PO,A,E,DP,Fld%,Rtot,Rtot/yr,RF/9,RF/G,lgFld%,lgRF9,lgRFG,Pos_2,Pos_adjust,Pos_mean
0,1954.0,20,MLN,NL,122,509,468,58,131,27,6,13,69,2,2,28,39,0.28,0.322,0.447,0.769,104,209,13,3,6,4,0,*79/H,RoY-4,1,122,5,-2,-1,-1,-5,-4,-0.5,19,16,1.4,0.497,0.498,1.6,-0.7,17,6000,0.356,109,0.281,0.167,2.6,7.7,5.5,1.0,1.1%,7.6,30,50,51,OF,113,111.0,1031.0,235.0,223.0,5.0,7.0,0.0,0.97,-1.0,-1.0,1.99,1.97,0.975,2.36,2.34,4,464,3.094828
1,1955.0,21,MLN,NL,153,665,602,105,189,37,9,27,106,3,1,49,61,0.314,0.366,0.54,0.906,141,325,20,3,7,4,5,*974/H,"AS,MVP-9",1,153,37,2,-1,7,-5,40,3.8,25,65,6.2,0.527,0.525,5.8,0.0,58,10000,0.406,149,0.313,0.226,4.1,9.2,7.4,1.6,0.8%,31.7,36,75,47,OF,125,113.0,1109.2,272.0,254.0,9.0,9.0,2.0,0.967,7.0,8.0,2.13,2.09,0.977,2.3,2.28,4,504,4.987578
2,1956.0,22,MLN,NL,153,660,609,106,200,34,14,26,92,2,4,37,54,0.328,0.365,0.558,0.923,151,340,21,2,5,7,6,*9/H,"AS,MVP-3",1,153,41,0,-1,15,-7,47,4.6,24,72,7.2,0.533,0.531,6.0,0.7,57,17500,0.402,157,0.325,0.23,3.9,8.2,5.6,3.9,22.6%,23.6,38,33,53,OF,152,148.0,1349.1,346.0,316.0,17.0,13.0,4.0,0.962,15.0,13.0,2.22,2.19,0.977,2.28,2.25,4,608,4.0
3,1957.0,23,MLN,NL,151,675,615,118,198,27,6,44,132,1,1,57,58,0.322,0.378,0.6,0.978,166,369,13,0,0,3,15,*98/H,"AS,MVP-1",1,151,53,2,0,4,-4,55,5.5,25,80,8.0,0.538,0.535,7.8,-0.1,75,22500,0.428,173,0.298,0.278,6.5,8.6,8.4,7.0,19.4%,73.1,35,50,51,OF,150,144.0,1356.1,361.0,346.0,9.0,6.0,0.0,0.983,4.0,4.0,2.36,2.37,0.98,2.29,2.31,4,600,4.45098
4,1958.0,24,MLN,NL,153,664,601,109,196,34,4,30,95,4,1,59,49,0.326,0.386,0.546,0.931,153,328,21,1,0,3,16,*98,"AS,MVP-3,GG",1,153,44,4,-2,8,-6,48,4.8,25,72,7.3,0.532,0.531,6.6,0.2,64,35000,0.416,160,0.316,0.22,4.5,7.4,8.9,3.4,13.7%,37.2,35,80,57,OF,153,147.0,1361.0,322.0,305.0,12.0,5.0,0.0,0.984,8.0,7.0,2.1,2.07,0.98,2.26,2.25,4,612,4.246835
5,1959.0,25,MLN,NL,154,693,629,116,223,46,7,39,123,8,0,51,54,0.355,0.401,0.636,1.037,183,400,19,4,0,9,17,*98/5,"AS,AS,MVP-3,GG",1,154,64,6,0,-4,-7,60,6.1,25,85,8.6,0.54,0.538,9.1,-1.1,89,35000,0.448,184,0.338,0.281,5.6,7.8,7.4,7.8,24.2%,69.9,32,100,60,OF,149,142.0,1326.2,278.0,261.0,12.0,5.0,3.0,0.982,-5.0,-4.0,1.85,1.8,0.977,2.21,2.19,4,608,4.17284
6,1960.0,26,MLN,NL,153,664,590,102,172,20,11,40,126,16,7,60,63,0.292,0.352,0.566,0.919,156,334,8,2,0,12,13,*9/84,"AS,AS,MVP-11,GG",1,153,41,2,2,14,-7,52,5.5,25,77,8.0,0.537,0.535,6.5,0.8,63,45000,0.406,158,0.265,0.275,6.0,9.5,9.0,4.1,7.0%,56.4,32,70,72,OF,153,149.0,1376.1,339.0,320.0,13.0,6.0,6.0,0.982,14.0,12.0,2.18,2.18,0.978,2.2,2.2,4,612,4.032258
7,1961.0,27,MLN,NL,155,671,603,115,197,39,10,34,120,21,9,56,64,0.327,0.381,0.594,0.974,163,358,16,2,1,9,20,*89/5H,"AS,AS,MVP-8",1,155,44,4,1,23,-4,69,6.9,25,94,9.5,0.545,0.543,7.1,2.0,71,45000,0.415,158,0.317,0.267,5.1,9.5,8.4,6.7,3.7%,50.5,37,70,54,OF,154,144.0,1367.0,397.0,377.0,13.0,7.0,3.0,0.982,23.0,21.0,2.57,2.53,0.977,2.16,2.14,4,616,4.515528
8,1962.0,28,MLN,NL,156,667,592,127,191,28,6,45,128,15,7,66,73,0.323,0.39,0.618,1.008,170,366,14,3,0,6,14,*89/H3,"AS,AS,MVP-6",1,156,57,0,0,6,-4,59,6.0,25,84,8.5,0.538,0.537,7.9,0.3,78,47500,0.448,176,0.304,0.296,6.8,10.9,9.9,7.1,1.2%,60.0,38,68,51,OF,153,147.0,1339.2,358.0,340.0,11.0,7.0,1.0,0.98,6.0,5.0,2.36,2.29,0.974,2.07,2.05,4,612,4.538961
9,1963.0,29,MLN,NL,161,714,631,121,201,29,4,44,130,31,5,78,94,0.319,0.391,0.586,0.977,179,370,11,0,0,5,18,*9,"AS,MVP-3",1,161,61,7,0,-5,-7,57,6.4,25,82,9.1,0.54,0.539,9.6,-1.3,86,53000,0.451,188,0.315,0.268,6.2,13.2,10.9,6.7,3.3%,84.8,33,86,60,RF,161,157.0,1446.0,282.0,266.0,11.0,5.0,1.0,0.982,-5.0,-4.0,1.72,1.72,0.976,1.83,1.82,4,644,4.0


### Putting it all together

In [None]:
def _merge_season_tables(soup, table_ids, link):
    """Extract, clean and merge the per-season tables named in ``table_ids``.

    Tables are processed in order and each stage stops at its first failure,
    so the result is the merge of the leading tables that made it through
    every stage. At least two tables are needed for a merge.

    soup - parsed player page
    table_ids - sequence of table container ids, in merge order
    link - player URL, used only in the diagnostic messages

    returns the merged DataFrame, or None when nothing could be merged
    """
    raw_tables = []
    try:
        for table_id in table_ids:
            raw_tables.append(get_table(soup, table_id))
    except Exception:
        print('Error in extracting tables for ', link)

    cleaned = []
    try:
        for table_id, raw in zip(table_ids, raw_tables):
            cleaned.append(df_cleaner(raw, table_id))
    except Exception:
        print('Error in cleaning DFs for ', link)

    if len(cleaned) < 2:
        print('Error in merging season data for ', link)
        return None

    merged = None
    try:
        merged = df_merger(cleaned[0], cleaned[1])
        for extra in cleaned[2:]:
            # keep whatever has been merged so far if a later merge fails
            merged = df_merger(merged, extra)
    except Exception:
        print('Error in merging season data for ', link)
    return merged


def _tag_player(frame, player_id, playername, HOF, position, last_game):
    """Add the player-level columns to a merged season frame.

    returns the tagged frame, or an empty DataFrame when ``frame`` is None
    or any of the column assignments fails
    """
    if frame is None:
        print('Adding empty dataframe')
        return pd.DataFrame()
    try:
        frame['Player'] = player_id
        frame['Player_name'] = playername.strip()
        frame['Season'] = frame.index + 1  # 1-based row counter
        frame['HOF'] = HOF
        frame['Type'] = position
        frame['last_game'] = last_game
    except Exception:
        print('Adding empty dataframe')
        return pd.DataFrame()
    return frame


def individual_info(link, limit=100, current_season=2021):
    ''' function to scrape one player page into season-level dataframes
    link - string (url of the player page)
    limit - int (minimum career hits a non-pitcher needs to be kept;
            a falsy value disables the check; pitchers are never filtered)
    current_season - int (season used as the default last_game when the page
            has no "Last Game" entry, and as the label that marks a stats
            pullout holding current-season values next to career values)

    returns (df_season_bat, df_season_pitch); exactly one of the two can be
    non-empty, and both are empty when the player is filtered out or the
    tables could not be processed'''
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    match = soup.find('div', itemtype='https://schema.org/Person')
    playername = match.find('h1').text
    player_id = link.split("/")[-1].split('.')[0]

    # Hall of Fame flag: only the first entry of the "bling" list is checked
    bling = soup.find(id='bling')
    first_badge = bling.find('li') if bling is not None else None
    HOF = 0
    if first_badge is not None and first_badge.text == 'Hall of Fame':
        HOF = 1

    meta = soup.find(id='meta')
    position = np.nan
    last_game = current_season
    for para in meta.find_all('p'):
        if ':' not in para.text:
            continue
        info = para.text.split(':')
        label = info[0].strip()
        if label == 'Position' or label == 'Positions':
            position = info[1].strip()
        elif label == 'Last Game':
            temp = info[1].strip().split('\n')[0]
            year_text = temp.split(',')[-1].strip()
            try:
                last_game = int(year_text)
            except ValueError:
                # keep the raw text when it is not a plain year
                last_game = year_text
            break

    is_pitcher = position == 'Pitcher'

    #keep only batters with Hits above(or =) the given threshold
    if limit and not is_pitcher:
        career1 = soup.find('div', class_='p1')
        if career1 is None:
            print('No career information for ', link)
            return pd.DataFrame(), pd.DataFrame()
        values = [stats.text for stats in career1.find_all('p')]
        keys = [stats.text for stats in career1.find_all('h4')]

        try:
            # when the pullout starts with the current season, every stat has
            # two values; keep every second one
            if soup.find('div', class_='stats_pullout').p.text == str(current_season):
                values = values[1::2]
        except AttributeError:
            print('p.text error', link)
        summary_stats = dict(zip(keys, values))

        if 'H' not in summary_stats or int(summary_stats['H']) < limit:
            return pd.DataFrame(), pd.DataFrame()

    if not is_pitcher:
        table_ids = ('all_batting_standard', 'all_batting_value',
                     'all_batting_advanced_front', 'all_standard_fielding')
    else:
        table_ids = ('all_pitching_standard', 'all_pitching_value',
                     'all_pitching_advanced_front')

    merged = _merge_season_tables(soup, table_ids, link)
    df_season = _tag_player(merged, player_id, playername, HOF, position, last_game)

    if is_pitcher:
        return (pd.DataFrame(), df_season)
    return df_season, pd.DataFrame()

In [None]:
# Scrape a single player with the default hit threshold.
# NOTE(review): `example` is presumably a player-page URL defined in an earlier cell — confirm.
df_bat, df_pitch = individual_info(example)
# Bare last expression so the notebook renders the batting frame.
df_bat

Unnamed: 0,Year,Age,Tm,Lg,G_bat,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos_bat,Awards,marker,G,Rbat,Rbaser,Rdp,Rfield,Rpos,RAA,WAA,Rrep,RAR,WAR,waaWL%,162WL%,oWAR,dWAR,oRAR,Salary,rOBA,Rbat+,BAbip,ISO,HR%,SO%,BB%,WPA,cWPA,RE24,RS%,SB%,XBT%,Pos,GS,CG,Inn,Ch,PO,A,E,DP,Fld%,Rtot,Rtot/yr,RF/9,RF/G,lgFld%,lgRF9,lgRFG,Pos_2,Pos_adjust,Pos_mean,Player,Player_name,Season,HOF,Type,last_game
0,1954.0,20,MLN,NL,122,509,468,58,131,27,6,13,69,2,2,28,39,0.28,0.322,0.447,0.769,104,209,13,3,6,4,0,*79/H,RoY-4,1,122,5,-2,-1,-1,-5,-4,-0.5,19,16,1.4,0.497,0.498,1.6,-0.7,17,6000,0.356,109,0.281,0.167,2.6,7.7,5.5,1.0,1.1%,7.6,30,50,51,OF,113,111.0,1031.0,235.0,223.0,5.0,7.0,0.0,0.97,-1.0,-1.0,1.99,1.97,0.975,2.36,2.34,4,464,3.094828,aaronha01,Henry Aaron,1,1,Rightfielder and First Baseman,1976
1,1955.0,21,MLN,NL,153,665,602,105,189,37,9,27,106,3,1,49,61,0.314,0.366,0.54,0.906,141,325,20,3,7,4,5,*974/H,"AS,MVP-9",1,153,37,2,-1,7,-5,40,3.8,25,65,6.2,0.527,0.525,5.8,0.0,58,10000,0.406,149,0.313,0.226,4.1,9.2,7.4,1.6,0.8%,31.7,36,75,47,OF,125,113.0,1109.2,272.0,254.0,9.0,9.0,2.0,0.967,7.0,8.0,2.13,2.09,0.977,2.3,2.28,4,504,4.987578,aaronha01,Henry Aaron,2,1,Rightfielder and First Baseman,1976
2,1956.0,22,MLN,NL,153,660,609,106,200,34,14,26,92,2,4,37,54,0.328,0.365,0.558,0.923,151,340,21,2,5,7,6,*9/H,"AS,MVP-3",1,153,41,0,-1,15,-7,47,4.6,24,72,7.2,0.533,0.531,6.0,0.7,57,17500,0.402,157,0.325,0.23,3.9,8.2,5.6,3.9,22.6%,23.6,38,33,53,OF,152,148.0,1349.1,346.0,316.0,17.0,13.0,4.0,0.962,15.0,13.0,2.22,2.19,0.977,2.28,2.25,4,608,4.0,aaronha01,Henry Aaron,3,1,Rightfielder and First Baseman,1976
3,1957.0,23,MLN,NL,151,675,615,118,198,27,6,44,132,1,1,57,58,0.322,0.378,0.6,0.978,166,369,13,0,0,3,15,*98/H,"AS,MVP-1",1,151,53,2,0,4,-4,55,5.5,25,80,8.0,0.538,0.535,7.8,-0.1,75,22500,0.428,173,0.298,0.278,6.5,8.6,8.4,7.0,19.4%,73.1,35,50,51,OF,150,144.0,1356.1,361.0,346.0,9.0,6.0,0.0,0.983,4.0,4.0,2.36,2.37,0.98,2.29,2.31,4,600,4.45098,aaronha01,Henry Aaron,4,1,Rightfielder and First Baseman,1976
4,1958.0,24,MLN,NL,153,664,601,109,196,34,4,30,95,4,1,59,49,0.326,0.386,0.546,0.931,153,328,21,1,0,3,16,*98,"AS,MVP-3,GG",1,153,44,4,-2,8,-6,48,4.8,25,72,7.3,0.532,0.531,6.6,0.2,64,35000,0.416,160,0.316,0.22,4.5,7.4,8.9,3.4,13.7%,37.2,35,80,57,OF,153,147.0,1361.0,322.0,305.0,12.0,5.0,0.0,0.984,8.0,7.0,2.1,2.07,0.98,2.26,2.25,4,612,4.246835,aaronha01,Henry Aaron,5,1,Rightfielder and First Baseman,1976
5,1959.0,25,MLN,NL,154,693,629,116,223,46,7,39,123,8,0,51,54,0.355,0.401,0.636,1.037,183,400,19,4,0,9,17,*98/5,"AS,AS,MVP-3,GG",1,154,64,6,0,-4,-7,60,6.1,25,85,8.6,0.54,0.538,9.1,-1.1,89,35000,0.448,184,0.338,0.281,5.6,7.8,7.4,7.8,24.2%,69.9,32,100,60,OF,149,142.0,1326.2,278.0,261.0,12.0,5.0,3.0,0.982,-5.0,-4.0,1.85,1.8,0.977,2.21,2.19,4,608,4.17284,aaronha01,Henry Aaron,6,1,Rightfielder and First Baseman,1976
6,1960.0,26,MLN,NL,153,664,590,102,172,20,11,40,126,16,7,60,63,0.292,0.352,0.566,0.919,156,334,8,2,0,12,13,*9/84,"AS,AS,MVP-11,GG",1,153,41,2,2,14,-7,52,5.5,25,77,8.0,0.537,0.535,6.5,0.8,63,45000,0.406,158,0.265,0.275,6.0,9.5,9.0,4.1,7.0%,56.4,32,70,72,OF,153,149.0,1376.1,339.0,320.0,13.0,6.0,6.0,0.982,14.0,12.0,2.18,2.18,0.978,2.2,2.2,4,612,4.032258,aaronha01,Henry Aaron,7,1,Rightfielder and First Baseman,1976
7,1961.0,27,MLN,NL,155,671,603,115,197,39,10,34,120,21,9,56,64,0.327,0.381,0.594,0.974,163,358,16,2,1,9,20,*89/5H,"AS,AS,MVP-8",1,155,44,4,1,23,-4,69,6.9,25,94,9.5,0.545,0.543,7.1,2.0,71,45000,0.415,158,0.317,0.267,5.1,9.5,8.4,6.7,3.7%,50.5,37,70,54,OF,154,144.0,1367.0,397.0,377.0,13.0,7.0,3.0,0.982,23.0,21.0,2.57,2.53,0.977,2.16,2.14,4,616,4.515528,aaronha01,Henry Aaron,8,1,Rightfielder and First Baseman,1976
8,1962.0,28,MLN,NL,156,667,592,127,191,28,6,45,128,15,7,66,73,0.323,0.39,0.618,1.008,170,366,14,3,0,6,14,*89/H3,"AS,AS,MVP-6",1,156,57,0,0,6,-4,59,6.0,25,84,8.5,0.538,0.537,7.9,0.3,78,47500,0.448,176,0.304,0.296,6.8,10.9,9.9,7.1,1.2%,60.0,38,68,51,OF,153,147.0,1339.2,358.0,340.0,11.0,7.0,1.0,0.98,6.0,5.0,2.36,2.29,0.974,2.07,2.05,4,612,4.538961,aaronha01,Henry Aaron,9,1,Rightfielder and First Baseman,1976
9,1963.0,29,MLN,NL,161,714,631,121,201,29,4,44,130,31,5,78,94,0.319,0.391,0.586,0.977,179,370,11,0,0,5,18,*9,"AS,MVP-3",1,161,61,7,0,-5,-7,57,6.4,25,82,9.1,0.54,0.539,9.6,-1.3,86,53000,0.451,188,0.315,0.268,6.2,13.2,10.9,6.7,3.3%,84.8,33,86,60,RF,161,157.0,1446.0,282.0,266.0,11.0,5.0,1.0,0.982,-5.0,-4.0,1.72,1.72,0.976,1.83,1.82,4,644,4.0,aaronha01,Henry Aaron,10,1,Rightfielder and First Baseman,1976


In [None]:
# Save the batting frame to batters.csv (relative path); with the default
# to_csv settings the row index is written as the first column.
df_bat.to_csv('batters.csv')