In [88]:
#imports
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

In [34]:
# Load the raw Hitters data set.
data = pd.read_csv("Hitters.csv")

# Salary is the variable to be predicted, so any row where it is NaN is dropped.
Data_drop_NA = data.dropna(subset=["Salary"])

# If one of the other numeric columns is NaN, replace it with the mean of that
# column so the data from the remaining columns can still be used.
# is_numeric_dtype covers every numeric dtype (not only float64/int64);
# non-numeric columns are returned unchanged.
data_clean = Data_drop_NA.apply(
    lambda col: col.fillna(col.mean()) if pd.api.types.is_numeric_dtype(col) else col
)
data_clean

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700.0,N
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875.0,A
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385.0,A
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960.0,A


In [48]:
# data_clean["Division"].unique()

**Pipeline 1**
All columns as predictors of salary, using basic linear regression


In [57]:
# Separate the target (Salary) from the predictors.
# League, Division, and NewLeague are categorical and need to be dummified.
y = data_clean["Salary"]
X = data_clean.drop(columns="Salary")

In [65]:
# Preprocessing step:
#   - object (categorical) columns are one-hot encoded, dropping the first
#     level of each column;
#   - numeric columns are standardized;
#   - any column matched by neither selector is passed through unchanged.
# set_output(transform="pandas") makes the transformer return a DataFrame.
ct_1 = ColumnTransformer(
    [
        (
            "dummify",
            OneHotEncoder(sparse_output=False, drop="first"),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
).set_output(transform="pandas")

# Preview of the transformed predictors. Fit on X rather than data_clean so the
# target column (Salary) is not scaled and emitted as if it were a feature.
variables = ct_1.fit_transform(X)

In [54]:
# Pipeline 1: run the ct_1 preprocessing, then fit a plain linear regression.
pipeline_1_steps = [
    ("preprocessing", ct_1),
    ("Regression", LinearRegression()),
]
pipeline_1 = Pipeline(steps=pipeline_1_steps).set_output(transform="pandas")

Interpreting some important coefficients of model 1:

In [83]:
# Fit pipeline 1 on the full dataset so its coefficients can be inspected.
pipeline_1_fitted = pipeline_1.fit(X, y)
coeff = pipeline_1_fitted.named_steps["Regression"].coef_

# Read the feature names from the fitted preprocessing step itself, so each name
# is guaranteed to line up with its coefficient (no hard-coded column count).
variable = pd.Index(
    pipeline_1_fitted.named_steps["preprocessing"].get_feature_names_out()
)

# One row per feature; the absolute value is used to rank coefficient magnitude.
important_variables = pd.DataFrame(
    {"Variables": variable, "Coeffecients": coeff, "Absolute Value": np.abs(coeff)}
)
important_variables.sort_values("Absolute Value", ascending=False)

Unnamed: 0,Variables,Coeffecients,Absolute Value
13,standardize__CRuns,480.747135,480.747135
10,standardize__CAtBat,-391.038655,391.038655
4,standardize__Hits,337.830479,337.830479
3,standardize__AtBat,-291.094556,291.094556
14,standardize__CRBI,260.689886,260.689886
15,standardize__CWalks,-213.892259,213.892259
8,standardize__Walks,135.073897,135.073897
1,dummify__Division_W,-116.849246,116.849246
11,standardize__CHits,86.687617,86.687617
16,standardize__PutOuts,78.761296,78.761296


When sorting the coefficients by absolute value, the five most important are: the number of career runs, the number of career at bats, the number of hits in 1986, the number of at bats in 1986, and the number of career RBIs. Career runs, 1986 hits, and career RBIs have the largest positive coefficients. This means that a player with lots of runs and RBIs throughout their career who also hit well in 1986 will likely have a higher salary. 

On the other hand, career at bats and 1986 at bats have negative coefficients, meaning that in this model they are associated with a lower salary in 1989. This seems odd at first glance, and I am curious to see how it changes as some other variables are dropped; however, in this model, a higher number of at bats seems to mean a lower salary for the 1989 season. I notice that career years also has a negative coefficient, so potentially older players who have more at bats are outside of the "prime" age to play, and these variables are also showing that. 

**Cross-validated R² score for pipeline 1:**

In [86]:
# 5-fold cross-validation of pipeline 1; scoring='r2' returns one R^2 per fold.
scores = cross_val_score(pipeline_1, X, y, cv=5, scoring='r2')

# Mean R^2 across the folds.
pipeline_1_r2 = scores.mean()

# NOTE: despite its name, pipeline_1_MSE holds the mean R^2 (not a mean squared
# error). The name is kept as an alias in case later cells refer to it.
pipeline_1_MSE = pipeline_1_r2
print(pipeline_1_r2)

0.34349502178816743
0.34349502178816743


**Pipeline 2**
All columns as predictors of salary, using ridge regression