In [1]:
import pandas as pd
import statsmodels.api as sm

#The goal is to run a multiple linear regression with Wins as the dependent variable and the various ways of reaching base (walk, single, double, triple, home run) as independent variables
#The correlations will be investigated
#Note: there is reason to suspect multicollinearity (e.g., for teams with more power hitters, the extra-base-hit categories -- doubles, triples, home runs -- may be strongly linearly related to one another)

In [2]:
# Read the team-level statistics file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='2019Teams.csv')
# Preview the first five rows to confirm the file parsed as expected.
data.head(n=5)

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
0,2019,NL,ARI,ARI,W,2,162,81,85,77,...,136,0.986,Arizona Diamondbacks,Chase Field,2135510,101,101,ARI,ARI,ARI
1,2019,NL,ATL,ATL,E,1,162,81,97,65,...,154,0.987,Atlanta Braves,SunTrust Park,2655100,105,103,ATL,ATL,ATL
2,2019,AL,BAL,BAL,E,5,162,81,54,108,...,155,0.982,Baltimore Orioles,Oriole Park at Camden Yards,1307807,99,102,BAL,BAL,BAL
3,2019,AL,BOS,BOS,E,3,162,81,84,78,...,115,0.985,Boston Red Sox,Fenway Park II,2924627,105,104,BOS,BOS,BOS
4,2019,AL,CHA,CHW,C,3,161,80,72,89,...,171,0.98,Chicago White Sox,Guaranteed Rate Field,1649775,97,99,CHW,CHA,CHA


In [3]:
# The CSV has no column for singles, so derive them here:
# total hits minus the extra-base hits (doubles, triples, home runs).
Singles = data['H'].sub(data['2B']).sub(data['3B']).sub(data['HR'])
Wins = data['W']

# Store the derived singles alongside the other hit types as column '1B'.
data['1B'] = Singles
data.head()

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro,1B
0,2019,NL,ARI,ARI,W,2,162,81,85,77,...,0.986,Arizona Diamondbacks,Chase Field,2135510,101,101,ARI,ARI,ARI,871
1,2019,NL,ATL,ATL,E,1,162,81,97,65,...,0.987,Atlanta Braves,SunTrust Park,2655100,105,103,ATL,ATL,ATL,877
2,2019,AL,BAL,BAL,E,5,162,81,54,108,...,0.982,Baltimore Orioles,Oriole Park at Camden Yards,1307807,99,102,BAL,BAL,BAL,889
3,2019,AL,BOS,BOS,E,3,162,81,84,78,...,0.985,Boston Red Sox,Fenway Park II,2924627,105,104,BOS,BOS,BOS,937
4,2019,AL,CHA,CHW,C,3,161,80,72,89,...,0.98,Chicago White Sox,Guaranteed Rate Field,1649775,97,99,CHW,CHA,CHA,981


In [4]:
# Independent variables: singles, doubles, triples, home runs and walks.
x1 = data.loc[:, ['1B', '2B', '3B', 'HR', 'BB']]
# Dependent variable: team wins.
y = Wins

In [5]:
# Add an intercept column to the predictors so the model includes a constant term.
x = sm.add_constant(x1)
# Fit ordinary least squares with wins as the response, then show the full summary table.
regression = sm.OLS(endog=y, exog=x).fit()
regression.summary()

0,1,2,3
Dep. Variable:,W,R-squared:,0.743
Model:,OLS,Adj. R-squared:,0.69
Method:,Least Squares,F-statistic:,13.89
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,1.99e-06
Time:,21:17:11,Log-Likelihood:,-104.66
No. Observations:,30,AIC:,221.3
Df Residuals:,24,BIC:,229.7
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-83.1750,36.990,-2.249,0.034,-159.519,-6.831
1B,0.0721,0.039,1.860,0.075,-0.008,0.152
2B,0.0835,0.074,1.124,0.272,-0.070,0.237
3B,-0.3854,0.268,-1.439,0.163,-0.938,0.168
HR,0.1271,0.064,1.973,0.060,-0.006,0.260
BB,0.1121,0.038,2.944,0.007,0.034,0.191

0,1,2,3
Omnibus:,1.494,Durbin-Watson:,1.885
Prob(Omnibus):,0.474,Jarque-Bera (JB):,1.291
Skew:,-0.48,Prob(JB):,0.524
Kurtosis:,2.669,Cond. No.,24700.0


Our linear regression model is:
Wins = -83.1750 + 0.0721(1B) + 0.0835(2B) - 0.3854(3B) + 0.1271(HR) + 0.1121(BB)

As mentioned above, there are likely some multicollinearity issues with the dataset. The negative coefficient on triples is also implausible: it seems unlikely that hitting triples makes a team **less** likely to win games, and the coefficient is not statistically significant (p = 0.163). This result may be due to the multicollinearity as well as the rarity of triples compared to the other offensive results.