In [1]:
import pandas as pd
from pathlib import Path

In [2]:
filename = Path("baseball_project_data.csv")

In [3]:
baseball_hitting_df = pd.read_csv(filename)
baseball_hitting_df.head()

Unnamed: 0,Year,Tms,#Bat,BatAge,R/G,G,PA,AB,R,H,...,OBP,SLG,OPS,TB,GDP,HBP,SH,SF,IBB,BIP
0,2023,30,662,28.2,4.59,3046,115238,102905,13973,25518,...,0.32,0.411,0.731,42328,2151.0,1299.0,267.0,802.0,272.0,73903
1,2022,30,790,28.2,4.28,4860,182052,163465,20817,39675,...,0.312,0.395,0.706,64546,3397.0,2046.0,390.0,1224.0,475.0,118662
2,2021,30,1373,28.4,4.53,4858,181821,161941,22010,39484,...,0.317,0.411,0.728,66521,3328.0,2112.0,766.0,1143.0,703.0,114995
3,2020,30,618,28.0,4.65,1796,66506,59030,8344,14439,...,0.322,0.418,0.74,24656,1237.0,821.0,126.0,402.0,202.0,41542
4,2019,30,1287,27.9,4.83,4858,186517,166651,23467,42039,...,0.323,0.435,0.758,72468,3463.0,1984.0,776.0,1150.0,753.0,118202


In [4]:
baseball_hitting_df.columns

Index(['Year', 'Tms', '#Bat', 'BatAge', 'R/G', 'G', 'PA', 'AB', 'R', 'H', '1B',
       '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG',
       'OPS', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'BIP'],
      dtype='object')

In [5]:
# Renaming columns

baseball_hitting_df.rename(columns={
    'Year': 'year',
    'R/G': 'runs_per_game',
    'G': 'number_of_games',
    'R': 'runs',
    'H': 'hits',
    'HR': 'homeruns',
    'SB': 'stolen_bases',
    'CS': 'caught_stealing',
    'BB': 'walks',
    'SO':'strikeouts',
    'BA': 'batting_average',
    'OBP': 'on_base_percentage',
    'SLG': 'slugging_percentage',
    'OPS': 'on_base_plus_slugging_percentage',
    'TB': 'total_bases',
    'SH': 'sacrifice_hits',
    'BIP': 'balls_in_play',
    'Tms': 'teams'                 
                  }, inplace=True)
columns = baseball_hitting_df.columns
for column in columns:
    print (column)

year
teams
#Bat
BatAge
runs_per_game
number_of_games
PA
AB
runs
hits
1B
2B
3B
homeruns
RBI
stolen_bases
caught_stealing
walks
strikeouts
batting_average
on_base_percentage
slugging_percentage
on_base_plus_slugging_percentage
total_bases
GDP
HBP
sacrifice_hits
SF
IBB
balls_in_play


In [6]:
# Slim it down!

# Reset index to year
#baseball_hitting_df.set_index('Year', inplace=True)

# Get the years we want
cleaned_hitting_df = baseball_hitting_df.iloc[4:104]

cleaned_hitting_df

Unnamed: 0,year,teams,#Bat,BatAge,runs_per_game,number_of_games,PA,AB,runs,hits,...,on_base_percentage,slugging_percentage,on_base_plus_slugging_percentage,total_bases,GDP,HBP,sacrifice_hits,SF,IBB,balls_in_play
4,2019,30,1287,27.9,4.83,4858,186517,166651,23467,42039,...,0.323,0.435,0.758,72468,3463.0,1984.0,776.0,1150.0,753.0,118202
5,2018,30,1271,28.1,4.45,4862,185139,165432,21630,41018,...,0.318,0.409,0.728,67731,3457.0,1922.0,823.0,1235.0,929.0,119875
6,2017,30,1229,28.3,4.65,4860,185295,165567,22582,42215,...,0.324,0.426,0.750,70517,3804.0,1763.0,925.0,1168.0,970.0,120526
7,2016,30,1247,28.4,4.48,4856,184580,165561,21744,42276,...,0.322,0.417,0.739,69106,3719.0,1651.0,1025.0,1214.0,932.0,122183
8,2015,30,1252,28.4,4.25,4858,183628,165488,20647,42106,...,0.317,0.405,0.721,66953,3739.0,1602.0,1200.0,1232.0,951.0,124365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,1924,33,867,27.5,4.91,3426,131245,117531,16835,33264,...,0.343,0.388,0.732,45606,,759.0,2704.0,,320.0,109652
100,1923,30,800,27.7,5.02,3299,128234,113501,16559,32200,...,0.346,0.392,0.737,44444,,817.0,3819.0,,,105124
101,1922,24,654,28.0,5.00,3037,117930,104403,15198,29892,...,0.347,0.399,0.746,41623,,767.0,3695.0,,,96154
102,1921,24,668,27.8,4.89,3131,121172,107615,15296,30758,...,0.344,0.394,0.737,42355,,830.0,3859.0,,,99526


In [7]:
cleaned_hitting_df = cleaned_hitting_df[['year','teams','runs_per_game','number_of_games','runs','hits',
                                         'homeruns','stolen_bases','caught_stealing','batting_average',
                                        'on_base_percentage','slugging_percentage','on_base_plus_slugging_percentage',
                                        'total_bases','sacrifice_hits','balls_in_play']]
cleaned_hitting_df.head()

Unnamed: 0,year,teams,runs_per_game,number_of_games,runs,hits,homeruns,stolen_bases,caught_stealing,batting_average,on_base_percentage,slugging_percentage,on_base_plus_slugging_percentage,total_bases,sacrifice_hits,balls_in_play
4,2019,30,4.83,4858,23467,42039,6776,2280.0,832.0,0.252,0.323,0.435,0.758,72468,776.0,118202
5,2018,30,4.45,4862,21630,41018,5585,2474.0,958.0,0.248,0.318,0.409,0.728,67731,823.0,119875
6,2017,30,4.65,4860,22582,42215,6105,2527.0,934.0,0.255,0.324,0.426,0.75,70517,925.0,120526
7,2016,30,4.48,4856,21744,42276,5610,2537.0,1001.0,0.255,0.322,0.417,0.739,69106,1025.0,122183
8,2015,30,4.25,4858,20647,42106,4909,2505.0,1064.0,0.254,0.317,0.405,0.721,66953,1200.0,124365


In [11]:
cleaned_hitting_df['hits_per_game'] = cleaned_hitting_df['hits']/cleaned_hitting_df['number_of_games']
cleaned_hitting_df['stolen_bases_per_game'] = cleaned_hitting_df['stolen_bases']/cleaned_hitting_df['number_of_games']
cleaned_hitting_df['caught_stealing_per_game'] = cleaned_hitting_df['caught_stealing']/cleaned_hitting_df['number_of_games']
cleaned_hitting_df['homeruns_per_game'] = cleaned_hitting_df['homeruns']/cleaned_hitting_df['number_of_games']
cleaned_hitting_df['total_bases_per_game'] = cleaned_hitting_df['total_bases']/cleaned_hitting_df['number_of_games']
cleaned_hitting_df['sacrifice_hits_per_game'] = cleaned_hitting_df['sacrifice_hits']/cleaned_hitting_df['number_of_games']
cleaned_hitting_df['balls_in_play_per_game'] = cleaned_hitting_df['balls_in_play']/cleaned_hitting_df['number_of_games']

cleaned_hitting_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_hitting_df['hits_per_game'] = cleaned_hitting_df['hits']/cleaned_hitting_df['number_of_games']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_hitting_df['stolen_bases_per_game'] = cleaned_hitting_df['stolen_bases']/cleaned_hitting_df['number_of_games']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

Unnamed: 0,year,teams,runs_per_game,number_of_games,runs,hits,homeruns,stolen_bases,caught_stealing,batting_average,...,total_bases,sacrifice_hits,balls_in_play,hits_per_game,stolen_bases_per_game,caught_stealing_per_game,homeruns_per_game,total_bases_per_game,sacrifice_hits_per_game,balls_in_play_per_game
4,2019,30,4.83,4858,23467,42039,6776,2280.0,832.0,0.252,...,72468,776.0,118202,8.653561,0.469329,0.171264,1.394813,14.91725,0.159737,24.331412
5,2018,30,4.45,4862,21630,41018,5585,2474.0,958.0,0.248,...,67731,823.0,119875,8.436446,0.508844,0.197038,1.148704,13.930687,0.169272,24.655492
6,2017,30,4.65,4860,22582,42215,6105,2527.0,934.0,0.255,...,70517,925.0,120526,8.686214,0.519959,0.192181,1.256173,14.509671,0.190329,24.799588
7,2016,30,4.48,4856,21744,42276,5610,2537.0,1001.0,0.255,...,69106,1025.0,122183,8.705931,0.522446,0.206137,1.155272,14.231054,0.211079,25.161244
8,2015,30,4.25,4858,20647,42106,4909,2505.0,1064.0,0.254,...,66953,1200.0,124365,8.667353,0.515644,0.21902,1.010498,13.782009,0.247015,25.600041


In [26]:
correlation_df = cleaned_hitting_df.corr()
threshold = .5

correlation_df = correlation_df[((correlation_df > threshold) & (correlation_df != 1.0)) |(correlation_df < -threshold)]

correlation_df

Unnamed: 0,year,teams,runs_per_game,number_of_games,runs,hits,homeruns,stolen_bases,caught_stealing,batting_average,...,total_bases,sacrifice_hits,balls_in_play,hits_per_game,stolen_bases_per_game,caught_stealing_per_game,homeruns_per_game,total_bases_per_game,sacrifice_hits_per_game,balls_in_play_per_game
year,,,,0.84431,0.724373,0.761357,0.930076,0.658886,,-0.521094,...,0.827743,-0.598213,0.694541,,,,0.860425,,-0.867945,-0.935558
teams,,,,0.624202,0.69811,0.692387,,0.592219,,,...,0.627882,,0.727491,,0.526498,,,,,
runs_per_game,,,,,,,,,,0.87884,...,,,,0.900084,,,,0.774845,,
number_of_games,0.84431,0.624202,,,0.941673,0.980248,0.885699,0.874909,0.679667,,...,0.978495,,0.9659,,0.62427,,0.637405,,-0.584815,-0.711284
runs,0.724373,0.69811,,0.941673,,0.982494,0.861255,0.831885,0.61681,,...,0.982785,,0.94346,,0.602268,,0.618084,0.691955,,-0.557781
hits,0.761357,0.692387,,0.980248,0.982494,,0.849085,0.885404,0.691911,,...,0.986711,,0.982669,,0.660309,,0.581628,0.569904,,-0.592371
homeruns,0.930076,,,0.885699,0.861255,0.849085,,0.662271,,,...,0.92111,,0.761054,,,,0.913081,0.613765,-0.773134,-0.875339
stolen_bases,0.658886,0.592219,,0.874909,0.831885,0.885404,0.662271,,0.912144,,...,0.838187,,0.912507,,0.915622,0.514008,,,,
caught_stealing,,,,0.679667,0.61681,0.691911,,0.912144,,,...,0.614769,,0.760866,,0.934188,0.788114,,,,
batting_average,-0.521094,,0.87884,,,,,,,,...,,0.671917,,0.990594,,,,,0.646213,0.68121
