# Analysis of Lahman Data

Check how runs scored, win %, estimated runs (OBP × TB), and Pythagorean win expectation correlate with one another.

## STEP 1: Import Packages

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# import numpy as np
# from scipy.stats import pearsonr

## STEP 2: Read in data

In [31]:
# Load only the team-season columns the later cells rely on.
# Note: `usecols` picks columns but does not reorder them; the resulting
# frame keeps the column order found in the CSV file.
teams_df = pd.read_csv(
    './data/core/Teams.csv',
    sep=',',
    usecols=[
        'yearID', 'teamID',            # season / team identifiers
        'G', 'W', 'L',                 # games, wins, losses
        'R', 'RA',                     # runs scored / runs allowed
        'AB', 'H', '2B', '3B', 'HR',   # at-bats and hits by type
        'BB', 'HBP', 'SF',             # remaining on-base percentage inputs
    ],
)
teams_df

Unnamed: 0,yearID,teamID,G,W,L,R,AB,H,2B,3B,HR,BB,HBP,SF,RA
0,1871,BS1,31,20,10,401,1372,426,70,37,3,60.0,,,303
1,1871,CH1,28,19,9,302,1196,323,52,21,10,60.0,,,241
2,1871,CL1,29,10,19,249,1186,328,35,40,7,26.0,,,341
3,1871,FW1,19,7,12,137,746,178,19,8,2,33.0,,,243
4,1871,NY2,33,16,17,302,1404,403,43,21,1,33.0,,,313
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2890,2018,SLN,162,88,74,759,5498,1369,248,9,205,525.0,80.0,48.0,691
2891,2018,TBA,162,90,72,716,5475,1415,274,43,150,540.0,101.0,50.0,646
2892,2018,TEX,162,67,95,737,5453,1308,266,24,194,555.0,88.0,34.0,848
2893,2018,TOR,162,73,89,709,5477,1336,320,16,217,499.0,58.0,37.0,832


In [38]:
# Build the analysis frame: one row per team-season from FIRST_SEASON onward,
# with on-base percentage, total bases, estimated runs, win percentage,
# Pythagorean win expectation and per-game rates appended as new columns.

# Filtering to 1970+ because that is when HBP and SF started being tracked
FIRST_SEASON = 1970
# Exponent x in the Pythagorean win expectation R^x / (R^x + RA^x)
PYTHAG_EXPONENT = 1.81

modern_teams = teams_df[teams_df['yearID'] >= FIRST_SEASON]

# Batting counts with missing values treated as 0. This filled copy is used
# only inside the OBP and TB formulas; the source columns carried into the
# result frame keep whatever NaNs they had.
batting = modern_teams[['AB', 'H', '2B', '3B', 'HR', 'BB', 'HBP', 'SF']].fillna(0)

# `assign` returns a new frame (no mutation of a slice of teams_df) and
# evaluates keyword arguments in order, so later columns can use earlier ones.
obp_teams_df = modern_teams.assign(
    # On-base percentage: (H + BB + HBP) / (AB + SF + BB + HBP)
    OBP=(batting['H'] + batting['BB'] + batting['HBP'])
        / (batting['AB'] + batting['SF'] + batting['BB'] + batting['HBP']),
    # Total bases: H counts every hit once; 2B/3B/HR add their extra bases
    TB=batting['H'] + batting['2B'] + 2 * batting['3B'] + 3 * batting['HR'],
    # Estimated runs: OBP * TB
    EST_R=lambda d: d['OBP'] * d['TB'],
    # Win percentage over decided games (W + L)
    WIN_PERC=lambda d: d['W'] / (d['W'] + d['L']),
    # Pythagorean win expectation from runs scored and runs allowed
    PYTHAG=lambda d: (d['R'] ** PYTHAG_EXPONENT)
        / ((d['R'] ** PYTHAG_EXPONENT) + (d['RA'] ** PYTHAG_EXPONENT)),
    # Per-game rates, so seasons with different game counts are comparable
    RG=lambda d: d['R'] / d['G'],
    EST_RG=lambda d: d['EST_R'] / d['G'],
    RAG=lambda d: d['RA'] / d['G'],
    HG=lambda d: d['H'] / d['G'],
    TBG=lambda d: d['TB'] / d['G'],
)

obp_teams_df

Unnamed: 0,yearID,teamID,G,W,L,R,AB,H,2B,3B,...,OBP,TB,EST_R,WIN_PERC,PYTHAG,RG,EST_RG,RAG,HG,TBG
1541,1970,ATL,162,76,86,736,5546,1495,215,24,...,0.334255,2238,748.062785,0.469136,0.478405,4.543210,4.617672,4.765432,9.228395,13.814815
1542,1970,BAL,162,108,54,792,5545,1424,213,25,...,0.343986,2224,765.025189,0.666667,0.641688,4.888889,4.722378,3.543210,8.790123,13.728395
1543,1970,BOS,162,87,75,786,5535,1450,252,28,...,0.335264,2367,793.569498,0.537037,0.538356,4.851852,4.898577,4.456790,8.950617,14.611111
1544,1970,CAL,162,86,76,631,5532,1391,197,40,...,0.308850,2010,620.789082,0.530864,0.500718,3.895062,3.832031,3.888889,8.586420,12.407407
1545,1970,CHA,162,56,106,633,5514,1394,192,20,...,0.314586,1995,627.599901,0.345679,0.383930,3.907407,3.874073,5.074074,8.604938,12.314815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2890,2018,SLN,162,88,74,759,5498,1369,248,9,...,0.320923,2250,722.077711,0.543210,0.542371,4.685185,4.457270,4.265432,8.450617,13.888889
2891,2018,TBA,162,90,72,716,5475,1415,274,43,...,0.333441,2225,741.907233,0.555556,0.546419,4.419753,4.579674,3.987654,8.734568,13.734568
2892,2018,TEX,162,67,95,737,5453,1308,266,24,...,0.318271,2204,701.468842,0.413580,0.436856,4.549383,4.330055,5.234568,8.074074,13.604938
2893,2018,TOR,162,73,89,709,5477,1336,320,16,...,0.311810,2339,729.324164,0.450617,0.428112,4.376543,4.502001,5.135802,8.246914,14.438272


In [40]:
# Pairwise Pearson correlation between the win measures and the per-game rates.
obp_teams_df.loc[
    :, ['WIN_PERC', 'PYTHAG', 'RG', 'EST_RG', 'RAG', 'HG', 'TBG']
].corr(method='pearson')

Unnamed: 0,WIN_PERC,PYTHAG,RG,EST_RG,RAG,HG,TBG
WIN_PERC,1.0,0.932484,0.515876,0.460171,-0.558458,0.376652,0.396446
PYTHAG,0.932484,1.0,0.556235,0.489206,-0.592197,0.411587,0.426163
RG,0.515876,0.556235,1.0,0.957134,0.334004,0.797693,0.910406
EST_RG,0.460171,0.489206,0.957134,1.0,0.366902,0.846296,0.970472
RAG,-0.558458,-0.592197,0.334004,0.366902,1.0,0.302597,0.397054
HG,0.376652,0.411587,0.797693,0.846296,0.302597,1.0,0.788866
TBG,0.396446,0.426163,0.910406,0.970472,0.397054,0.788866,1.0
