In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
# Load the combined 2010-2018 batting data; the raw frame is kept under its own name
raw_batting = pd.read_excel("batting_data_2010-2018.xlsx")

# Keep only batters with at least one hit, and only the variables we are using.
# The explicit .copy() gives df its own data, so cells that later add or drop
# columns are not writing into a selection taken from the raw frame.
model_columns = ['BB', 'HBP', 'H', 'R', '2B', '3B', 'HR', 'SB', 'CS']
df = raw_batting.loc[raw_batting['H'] > 0, model_columns].copy()
df

Unnamed: 0,BB,HBP,H,R,2B,3B,HR,SB,CS
2,87,2,146,88,41,1,20,24,10
3,4,0,45,16,11,1,1,2,1
9,0,0,1,0,0,1,0,0,0
10,0,0,9,4,1,0,1,0,0
11,10,0,12,5,3,0,1,0,0
...,...,...,...,...,...,...,...,...,...
10918,1,0,2,0,1,0,0,0,0
10923,7,1,24,14,5,0,2,4,1
10924,30,3,76,33,21,2,13,1,1
10926,55,2,139,67,28,3,9,3,4


In [4]:
# Combine walks and hit-by-pitches into one "free base" count, and derive
# singles as total hits minus doubles, triples and home runs
walks_and_hbp = df['BB'] + df['HBP']
singles = df['H'] - df['2B'] - df['3B'] - df['HR']

# Build a new frame instead of assigning/deleting columns in place, so the frame
# object shown by the earlier cell is not silently modified. BB, HBP and H are
# dropped because they are now redundant with the derived columns.
# NOTE: the label 'BB_HPB' (letters transposed from HBP) is kept unchanged so the
# feature names seen by the model stay the same.
df = (
    df.drop(columns=['BB', 'HBP', 'H'])
      .assign(**{'BB_HPB': walks_and_hbp, '1B': singles})
)
df

Unnamed: 0,R,2B,3B,HR,SB,CS,BB_HPB,1B
2,88,41,1,20,24,10,89,84
3,16,11,1,1,2,1,4,32
9,0,0,1,0,0,0,0,0
10,4,1,0,1,0,0,0,7
11,5,3,0,1,0,0,10,8
...,...,...,...,...,...,...,...,...
10918,0,1,0,0,0,0,1,1
10923,14,5,0,2,4,1,8,17
10924,33,21,2,13,1,1,33,40
10926,67,28,3,9,3,4,57,99


In [5]:
# Y is the dependent variable (runs scored); X is every remaining column
Y = df["R"]
X = df.drop(columns='R')

# Hold out 15% of the rows for evaluation; random_state is fixed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=1,
)
print(X_train)
print(X_test)

# Fit the linear model on the training rows only; the fitted estimator is the cell's output
regression_model = LinearRegression()
regression_model.fit(X_train, Y_train)

       2B  3B  HR  SB  CS  BB_HPB   1B
10640   3   1   6   3   1       5   17
7033    0   0   0   0   0       2    5
4103    0   0   0   0   0       0    3
3336   32   4   7   9   4      44  147
4226    1   0   1   0   0       7    6
...    ..  ..  ..  ..  ..     ...  ...
1387    2   0   0   0   0       1    2
8119    0   0   0   0   0       0    1
6162    5   0   0   3   2      11   26
361     1   0   0   0   0       0    1
8065    6   0   1   0   0      19   19

[5883 rows x 7 columns]
       2B  3B  HR  SB  CS  BB_HPB  1B
10300   1   0   0   4   0       3   1
2860    7   2   2   1   1       3  29
6669    7   3   4   2   2      24  12
8314   30   1  31   2   4      32  82
8972   33   1   6   7   4      22  78
...    ..  ..  ..  ..  ..     ...  ..
1548    1   0   1   0   1      10  10
6346   10   2   2   4   1      10  28
5111   25   8  17  21   5      62  92
3495    3   0   1   0   0       0   6
3577   24   0  28   7   2      72  65

[1039 rows x 7 columns]


LinearRegression()

In [6]:
# Report the fitted intercept followed by one coefficient per predictor
intercept = regression_model.intercept_

# Note: '{:.4}' / '{:.2}' format to that many significant digits, not decimal places
print("The intercept for our model is {:.4}".format(intercept))
print('-'*100)

# coef_ holds one weight per predictor column, in column order, so pair them by position
for feature_name, weight in zip(X.columns, regression_model.coef_):
    print("The Coefficient for {} is {:.2}".format(feature_name, weight))

The intercept for our model is -0.1568
----------------------------------------------------------------------------------------------------
The Coefficient for 2B is 0.43
The Coefficient for 3B is 1.2
The Coefficient for HR is 0.93
The Coefficient for SB is 0.38
The Coefficient for CS is 0.16
The Coefficient for BB_HPB is 0.23
The Coefficient for 1B is 0.25


In [7]:
# Predict runs for the held-out rows and report R^2 against the actual values
Y_predict = regression_model.predict(X_test)
model_r2 = r2_score(y_true=Y_test, y_pred=Y_predict)
print("R2: {:.2}".format(model_r2))

R2: 0.97


In [9]:
import statsmodels.api as sm

# Add an explicit constant column so the OLS fit includes an intercept term.
# X and Y here are the full predictors/target (all rows), not the held-out test rows.
X2 = sm.add_constant(X)

# Fit the same linear specification with statsmodels OLS to get inferential
# statistics, then show the p-value of each term
model = sm.OLS(endog=Y, exog=X2)
est = model.fit()
est.pvalues

const      1.095734e-01
2B        4.199805e-189
3B        4.844378e-153
HR         0.000000e+00
SB        7.173345e-153
CS         2.629573e-05
BB_HPB     0.000000e+00
1B         0.000000e+00
dtype: float64

In [10]:
# Print the plain-text summary table of the fitted statsmodels OLS results held in `est`
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:                      R   R-squared:                       0.973
Model:                            OLS   Adj. R-squared:                  0.973
Method:                 Least Squares   F-statistic:                 3.543e+04
Date:                Fri, 18 Sep 2020   Prob (F-statistic):               0.00
Time:                        11:58:34   Log-Likelihood:                -20692.
No. Observations:                6922   AIC:                         4.140e+04
Df Residuals:                    6914   BIC:                         4.145e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1310      0.082     -1.600      0.1

In [None]:
 # Predict runs for a single stat line. Replace the example values below with the
 # batter to score; the frame takes its columns from X so names and order match
 # what the model was fit on (a bare placeholder string would make predict() raise).
 example_stat_line = {'2B': 41, '3B': 1, 'HR': 20, 'SB': 24, 'CS': 10, 'BB_HPB': 89, '1B': 84}
 single_batter = pd.DataFrame([example_stat_line], columns=X.columns)
 prediction = regression_model.predict(single_batter)
 prediction