# Analysis of Lahman Data

Calculate distributions of statistics across positions by year

## STEP 1: Import Packages

In [58]:
# import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## STEP 2: Read in data

In [4]:
batting_df = pd.read_csv('./data/core/Batting.csv',
                        delimiter=',')
#                         usecols=['playerID','yearID','stint','teamID'])
batting_df

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105856,zimmebr01,2018,1,CLE,AL,34,106,14,24,5,...,9.0,4.0,1.0,7,44.0,0.0,1.0,0.0,0.0,1.0
105857,zimmejo02,2018,1,DET,AL,25,2,0,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
105858,zimmery01,2018,1,WAS,NL,85,288,33,76,21,...,51.0,1.0,1.0,30,55.0,1.0,3.0,0.0,2.0,10.0
105859,zobribe01,2018,1,CHN,NL,139,455,67,139,28,...,58.0,3.0,4.0,55,60.0,1.0,2.0,1.0,7.0,8.0


In [5]:
fielding_df = pd.read_csv('./data/core/Fielding.csv',
                        delimiter=',')
fielding_df

Unnamed: 0,playerID,yearID,stint,teamID,lgID,POS,G,GS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR
0,abercda01,1871,1,TRO,,SS,1,1.0,24.0,1,3,2.0,0,,,,,
1,addybo01,1871,1,RC1,,2B,22,22.0,606.0,67,72,42.0,5,,,,,
2,addybo01,1871,1,RC1,,SS,3,3.0,96.0,8,14,7.0,0,,,,,
3,allisar01,1871,1,CL1,,2B,2,0.0,18.0,1,4,0.0,0,,,,,
4,allisar01,1871,1,CL1,,OF,29,29.0,729.0,51,3,7.0,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140916,zimmery01,2018,1,WAS,NL,1B,73,72.0,1861.0,538,39,2.0,41,,,,,
140917,zobribe01,2018,1,CHN,NL,1B,4,3.0,65.0,18,2,0.0,1,,,,,
140918,zobribe01,2018,1,CHN,NL,2B,63,42.0,1187.0,84,122,0.0,34,,,,,
140919,zobribe01,2018,1,CHN,NL,OF,84,62.0,1670.0,127,2,1.0,1,,,,,


## Count games played at each position by playerID, yearID, stint

In [6]:
list(fielding_df['POS'].unique())

['SS', '2B', 'OF', 'C', '1B', '3B', 'P']

In [7]:
# Sum total games for each player, year, stint
all_df = fielding_df.groupby(['playerID','yearID','stint'], as_index=False)['G'].sum().rename(columns={"G": "gAll"}, errors="raise")

# Dataframe for each position
ss_df = fielding_df[fielding_df['POS']=='SS'][['playerID','yearID','stint','G']].rename(columns={"G": "gSS"}, errors="raise")
second_df = fielding_df[fielding_df['POS']=='2B'][['playerID','yearID','stint','G']].rename(columns={"G": "g2B"}, errors="raise")
of_df = fielding_df[fielding_df['POS']=='OF'][['playerID','yearID','stint','G']].rename(columns={"G": "gOF"}, errors="raise")
c_df = fielding_df[fielding_df['POS']=='C'][['playerID','yearID','stint','G']].rename(columns={"G": "gC"}, errors="raise")
first_df = fielding_df[fielding_df['POS']=='1B'][['playerID','yearID','stint','G']].rename(columns={"G": "g1B"}, errors="raise")
third_df = fielding_df[fielding_df['POS']=='3B'][['playerID','yearID','stint','G']].rename(columns={"G": "g3B"}, errors="raise")
p_df = fielding_df[fielding_df['POS']=='P'][['playerID','yearID','stint','G']].rename(columns={"G": "gP"}, errors="raise")

# Merge all of the positions back with the total games
merged_pos_df = pd.merge(all_df, ss_df,  how='left', left_on=['playerID','yearID','stint'], right_on = ['playerID','yearID','stint'])
merged_pos_df = pd.merge(merged_pos_df, second_df,  how='left', left_on=['playerID','yearID','stint'], right_on = ['playerID','yearID','stint'])
merged_pos_df = pd.merge(merged_pos_df, of_df,  how='left', left_on=['playerID','yearID','stint'], right_on = ['playerID','yearID','stint'])
merged_pos_df = pd.merge(merged_pos_df, c_df,  how='left', left_on=['playerID','yearID','stint'], right_on = ['playerID','yearID','stint'])
merged_pos_df = pd.merge(merged_pos_df, first_df,  how='left', left_on=['playerID','yearID','stint'], right_on = ['playerID','yearID','stint'])
merged_pos_df = pd.merge(merged_pos_df, third_df,  how='left', left_on=['playerID','yearID','stint'], right_on = ['playerID','yearID','stint'])
merged_pos_df = pd.merge(merged_pos_df, p_df,  how='left', left_on=['playerID','yearID','stint'], right_on = ['playerID','yearID','stint'])
merged_pos_df

Unnamed: 0,playerID,yearID,stint,gAll,gSS,g2B,gOF,gC,g1B,g3B,gP
0,aardsda01,2004,1,11,,,,,,,11.0
1,aardsda01,2006,1,45,,,,,,,45.0
2,aardsda01,2007,1,25,,,,,,,25.0
3,aardsda01,2008,1,47,,,,,,,47.0
4,aardsda01,2009,1,73,,,,,,,73.0
...,...,...,...,...,...,...,...,...,...,...,...
104421,zwilldu01,1915,1,151,,,148.0,,3.0,,
104422,zwilldu01,1916,1,10,,,10.0,,,,
104423,zychto01,2015,1,13,,,,,,,13.0
104424,zychto01,2016,1,12,,,,,,,12.0


In [8]:
# Merge batting with the games by position
merged_batting_df = pd.merge(batting_df, merged_pos_df, how='left', left_on=['playerID','yearID','stint'], right_on = ['playerID','yearID','stint'])

# All the rows without any games by position data
merged_batting_df[~pd.notna(merged_batting_df['gAll'])]

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,SF,GIDP,gAll,gSS,g2B,gOF,gC,g1B,g3B,gP
6768,fostere01,1896,1,NY1,NL,1,1,0,0,0,...,,,,,,,,,,
7416,laddhi01,1898,1,PIT,NL,1,1,0,0,0,...,,,,,,,,,,
7515,stallge01,1898,1,PHI,NL,1,0,1,0,0,...,,,,,,,,,,
7623,croftha01,1899,1,LS3,NL,2,2,0,0,0,...,,,,,,,,,,
8561,burnscb01,1902,1,BLA,AL,1,1,0,1,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104031,sanchto01,2017,1,ATL,NL,1,1,0,0,0,...,0.0,0.0,,,,,,,,
105096,lavarry01,2018,1,PIT,NL,6,6,1,4,1,...,0.0,0.0,,,,,,,,
105224,mejiafr01,2018,1,CLE,AL,1,2,0,0,0,...,0.0,0.0,,,,,,,,
105383,peterdu01,2018,1,ATL,NL,2,2,0,0,0,...,0.0,0.0,,,,,,,,


## Distribute HR over positions based on game counts

In [9]:
# Calculate plate appearances and reduce the amount of columns
merged_batting_df['PA'] = merged_batting_df['AB'].fillna(0) + merged_batting_df['BB'].fillna(0) + merged_batting_df['IBB'].fillna(0) + merged_batting_df['HBP'].fillna(0) + merged_batting_df['SH'].fillna(0) + merged_batting_df['SF'].fillna(0)
reduced_batting_df = merged_batting_df[['playerID','yearID','stint','G','HR','PA','gAll','gSS','g2B','gOF','gC','g1B','g3B','gP']]
reduced_batting_df

Unnamed: 0,playerID,yearID,stint,G,HR,PA,gAll,gSS,g2B,gOF,gC,g1B,g3B,gP
0,abercda01,1871,1,1,0,4.0,1.0,1.0,,,,,,
1,addybo01,1871,1,25,0,122.0,25.0,3.0,22.0,,,,,
2,allisar01,1871,1,29,0,139.0,31.0,,2.0,29.0,,,,
3,allisdo01,1871,1,27,2,133.0,27.0,,,,27.0,,,
4,ansonca01,1871,1,25,0,122.0,29.0,,2.0,1.0,5.0,1.0,20.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105856,zimmebr01,2018,1,34,2,114.0,34.0,,,34.0,,,,
105857,zimmejo02,2018,1,25,0,2.0,25.0,,,,,,,25.0
105858,zimmery01,2018,1,85,13,324.0,73.0,,,,,73.0,,
105859,zobribe01,2018,1,139,9,521.0,151.0,,63.0,84.0,,4.0,,


In [10]:
# diff_games = reduced_batting_df['G'].fillna(0) - reduced_batting_df['gAll'].fillna(0)
# diff_games
dist_hr_df = reduced_batting_df.copy()
dist_hr_df['hrSS'] = (dist_hr_df['HR'].fillna(0)/dist_hr_df['gAll'].fillna(0))*dist_hr_df['gSS'].fillna(0)
dist_hr_df['hr2B'] = (dist_hr_df['HR'].fillna(0)/dist_hr_df['gAll'].fillna(0))*dist_hr_df['g2B'].fillna(0)
dist_hr_df['hrOF'] = (dist_hr_df['HR'].fillna(0)/dist_hr_df['gAll'].fillna(0))*dist_hr_df['gOF'].fillna(0)
dist_hr_df['hrC'] = (dist_hr_df['HR'].fillna(0)/dist_hr_df['gAll'].fillna(0))*dist_hr_df['gC'].fillna(0)
dist_hr_df['hr1B'] = (dist_hr_df['HR'].fillna(0)/dist_hr_df['gAll'].fillna(0))*dist_hr_df['g1B'].fillna(0)
dist_hr_df['hr3B'] = (dist_hr_df['HR'].fillna(0)/dist_hr_df['gAll'].fillna(0))*dist_hr_df['g3B'].fillna(0)
dist_hr_df['hrP'] = (dist_hr_df['HR'].fillna(0)/dist_hr_df['gAll'].fillna(0))*dist_hr_df['gP'].fillna(0)
dist_hr_df

Unnamed: 0,playerID,yearID,stint,G,HR,PA,gAll,gSS,g2B,gOF,...,g1B,g3B,gP,hrSS,hr2B,hrOF,hrC,hr1B,hr3B,hrP
0,abercda01,1871,1,1,0,4.0,1.0,1.0,,,...,,,,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
1,addybo01,1871,1,25,0,122.0,25.0,3.0,22.0,,...,,,,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
2,allisar01,1871,1,29,0,139.0,31.0,,2.0,29.0,...,,,,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
3,allisdo01,1871,1,27,2,133.0,27.0,,,,...,,,,0.0,0.000000,0.000000,2.0,0.000000,0.0,0.0
4,ansonca01,1871,1,25,0,122.0,29.0,,2.0,1.0,...,1.0,20.0,,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105856,zimmebr01,2018,1,34,2,114.0,34.0,,,34.0,...,,,,0.0,0.000000,2.000000,0.0,0.000000,0.0,0.0
105857,zimmejo02,2018,1,25,0,2.0,25.0,,,,...,,,25.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
105858,zimmery01,2018,1,85,13,324.0,73.0,,,,...,73.0,,,0.0,0.000000,0.000000,0.0,13.000000,0.0,0.0
105859,zobribe01,2018,1,139,9,521.0,151.0,,63.0,84.0,...,4.0,,,0.0,3.754967,5.006623,0.0,0.238411,0.0,0.0


In [11]:
# Sum of HRs by position grouped by year
year_dist_hr_df = dist_hr_df.groupby(["yearID"], as_index=False).sum()
year_dist_hr_df

Unnamed: 0,yearID,stint,G,HR,PA,gAll,gSS,g2B,gOF,gC,g1B,g3B,gP,hrSS,hr2B,hrOF,hrC,hr1B,hr3B,hrP
0,1871,115,2296,47,11215.0,2437.0,265.0,263.0,809.0,284.0,259.0,280.0,277.0,3.330770,2.103395,18.788858,6.315221,5.361282,9.042132,2.058342
1,1872,172,3305,37,15926.0,3473.0,378.0,383.0,1160.0,395.0,371.0,381.0,405.0,4.748400,8.551609,16.249465,1.743280,0.287395,4.413898,1.005952
2,1873,128,3604,47,17294.0,3826.0,420.0,424.0,1264.0,439.0,422.0,423.0,434.0,5.357259,6.177879,19.036728,5.036479,3.091500,6.801825,1.498330
3,1874,126,4199,40,19342.0,4449.0,485.0,492.0,1493.0,532.0,478.0,479.0,490.0,4.067946,1.422054,17.861690,6.642180,7.213672,2.355117,0.437340
4,1875,248,6248,40,27082.0,6621.0,722.0,731.0,2197.0,758.0,733.0,717.0,763.0,4.961115,4.026851,13.753634,2.483768,8.202965,4.960003,1.611664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,2014,1558,69564,4186,184891.0,63251.0,5304.0,5499.0,16630.0,5354.0,5573.0,5570.0,19321.0,326.360078,355.084326,1592.147875,500.344572,870.609105,519.916805,16.537239
144,2015,1630,70534,4909,184546.0,64318.0,5315.0,5519.0,16822.0,5408.0,5691.0,5599.0,19964.0,366.637990,415.643981,1899.349670,500.576827,1012.196506,687.907114,26.687913
145,2016,1620,70451,5610,185471.0,64420.0,5290.0,5587.0,16740.0,5407.0,5704.0,5533.0,20159.0,510.944224,591.385829,2052.126475,581.009003,1060.412055,779.864918,25.257495
146,2017,1638,70743,6105,186222.0,64869.0,5335.0,5629.0,16726.0,5514.0,5590.0,5558.0,20517.0,539.628708,572.437303,2323.081515,646.404118,1164.563120,818.993703,29.891533


In [71]:
# Compare 2018 and 1970 HR distributions by position
filtered_1_df = year_dist_hr_df[year_dist_hr_df['yearID']==2018]
filtered_1_df = filtered_1_df.iloc[0]

filtered_2_df = year_dist_hr_df[year_dist_hr_df['yearID']==1970]
filtered_2_df = filtered_2_df.iloc[0]

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'polar'},{'type': 'polar'}]])

fig.add_trace(go.Barpolar(
    r=[filtered_1_df.hrC, filtered_1_df.hr1B, filtered_1_df.hr2B, filtered_1_df.hr3B, filtered_1_df.hrSS, filtered_1_df.hrOF/3],
    theta=[30,90,150,210,270,330],
    width=[60,60,60,60,60,60],
    marker_color=["#003f5c", '#444e86', '#955196', '#dd5182', '#ff6e54', '#ffa600'],
    marker_line_color="black",
    marker_line_width=1,
    opacity=0.8
),1,1)
fig.add_trace(go.Barpolar(
    r=[filtered_2_df.hrC, filtered_2_df.hr1B, filtered_2_df.hr2B, filtered_2_df.hr3B, filtered_2_df.hrSS, filtered_2_df.hrOF/3],
    theta=[30,90,150,210,270,330],
    width=[60,60,60,60,60,60],
    marker_color=["#003f5c", '#444e86', '#955196', '#dd5182', '#ff6e54', '#ffa600'],
    marker_line_color="black",
    marker_line_width=1,
    opacity=0.8
),1,2)

fig.update_layout(
    showlegend=False,
    template=None,
    polar = dict(
        radialaxis = dict(range=[0, 1200], visible = False),
        angularaxis = dict(
            thetaunit = "degrees",
            dtick = 60,
            showticklabels=False,
            ticks=''
        )
    ),
    polar2 = dict(
        radialaxis = dict(range=[0, 1200], visible = False),
        angularaxis = dict(
            thetaunit = "degrees",
            dtick = 60,
            showticklabels=False,
            ticks=''
        )
    )
)

fig.show()

# TODO: animate by year? not possible for polar, but can for other plots - https://plot.ly/python/animations/