In [1]:
import pandas as pd
import numpy as np

data_2022 = pd.read_csv("2022stats.csv")

# First drop the columns that the rest of the analysis does not need.
# drop() returns a new frame, so data_2022 stays the untouched raw load.
# The earlier `copy_2022.sort_values("Player")` call was removed: its return
# value was discarded, so it never sorted anything, and the groupby below
# already returns the players in sorted order.
copy_2022 = data_2022.drop(
    columns=['G', 'Team', 'R', '2B', '3B', 'RBI', 'SB', 'CS', 'SH', 'SF', 'HBP']
)

# Now delete rows that have fewer than 100 At Bats (AB).
# NOTE(review): this filter runs per row, *before* rows of the same player are
# combined, so a player whose individual rows are each under 100 AB is dropped
# even if those rows add up to 100 or more -- confirm this is intended.
above_100_2022 = copy_2022['AB'] >= 100
copy_2022 = copy_2022[above_100_2022]

# Now combine the players that have multiple rows: counting stats are summed,
# rate stats are averaged. The result is indexed by Player.
# NOTE(review): the rate stats use an unweighted mean across a player's rows,
# which is not the same as recomputing them from the summed totals (for
# example AVG from H / AB) -- confirm this approximation is acceptable.
copy_2022 = copy_2022.groupby('Player').agg({
    'AB': 'sum',
    'H': 'sum',
    'SO': 'sum',
    'BB': 'sum',
    'HR': 'sum',
    'AVG': 'mean',
    'OBP': 'mean',
    'SLG': 'mean',
    'OPS': 'mean',
})
def avg_group(x):
    """Map a batting average to a bucket number from 1 to 10.

    Buckets are .025 wide between .150 and .350:
    bucket 1 is anything below .150, bucket 2 is [.150, .175), ...,
    bucket 9 is [.325, .350) and bucket 10 is .350 and above.

    A value that compares false against every edge (such as NaN) matches
    no bucket and yields None.
    """
    # Exclusive upper edge of buckets 1..9, in ascending order.
    upper_edges = (.150, .175, .200, .225, .250, .275, .300, .325, .350)

    # The first edge that x is strictly below determines the bucket.
    for bucket, edge in enumerate(upper_edges, start=1):
        if x < edge:
            return bucket

    # Everything at or above the last edge lands in the top bucket.
    if x >= upper_edges[-1]:
        return len(upper_edges) + 1

    return None

# Label every player with the batting-average bucket computed by avg_group;
# this column is the classification target used further down.
copy_2022['AVG_group'] = copy_2022['AVG'].map(avg_group)

print(copy_2022)

from sklearn.model_selection import train_test_split

# Hold out 20% of the players for testing; the fixed seed makes the split
# the same on every run.
train_2022, test_2022 = train_test_split(
    copy_2022,
    random_state=42,
    test_size=0.2,
)

                 AB    H   SO   BB  HR    AVG    OBP    SLG    OPS  AVG_group
Player                                                                       
AJ Pollock      489  120   98   32  14  0.245  0.292  0.389  0.681          5
Aaron Hicks     384   83  109   62   8  0.216  0.330  0.313  0.643          4
Aaron Judge     570  177  175  111  62  0.311  0.425  0.686  1.111          8
Abraham Toro    324   60   65   22  10  0.185  0.239  0.324  0.563          3
Adam Duvall     287   61  101   21  12  0.213  0.276  0.401  0.677          4
...             ...  ...  ...  ...  ..    ...    ...    ...    ...        ...
Yonathan Daza   372  112   58   26   2  0.301  0.349  0.384  0.733          8
Yordan Alvarez  470  144  106   78  37  0.306  0.406  0.613  1.019          8
Yoshi Tsutsugo  170   29   50   19   2  0.171  0.249  0.229  0.478          2
Yuli Gurriel    545  132   73   30   8  0.242  0.288  0.360  0.648          5
Zach McKinstry  155   32   48   13   4  0.206  0.272  0.361  0.6

In [2]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Base classifiers for the voting ensemble.
model_svm = SVC(kernel='poly')
# Seed the tree so repeated runs give the same model; 42 matches the seed
# already used for train_test_split.
model_dt = DecisionTreeClassifier(random_state=42)

# Hard voting: each base model casts one vote for a class label.
# NOTE(review): with only two voters a disagreement is a tie; per the
# scikit-learn docs ties are resolved by ascending class label -- confirm
# that tie-break is acceptable or add a third estimator.
ensemble = VotingClassifier(
    voting='hard',
    estimators=[
        ('svm', model_svm),
        ('dt', model_dt),
    ],
)
# Fit the ensemble on one feature at a time and report how well that single
# feature predicts AVG_group on the held-out test split.
# The 'AVG' run is a sanity check only: AVG_group is derived directly from
# AVG, so a near-perfect score there is expected and is not a real result.
for feature in ['OPS', 'OBP', 'AVG', 'H', 'SLG', 'SO', 'BB']:
    # Train on the training split only.
    ensemble.fit(train_2022[[feature]], train_2022['AVG_group'])

    # X / Y always refer to the test split, so after the loop they (and
    # y_pred) hold the values for the last feature, 'BB'.
    X = test_2022[[feature]]
    Y = test_2022['AVG_group']
    y_pred = ensemble.predict(X)
    print("Accuracy score for", feature, "is ", accuracy_score(Y, y_pred))

Accuracy score for OPS is  0.25555555555555554
Accuracy score for OBP is  0.2777777777777778
Accuracy score for AVG is  0.9888888888888889
Accuracy score for H is  0.4
Accuracy score for SLG is 0.28888888888888886
Accuracy score for SO is  0.3111111111111111
Accuracy score for BB is  0.3111111111111111


In [5]:
# Feature combinations to evaluate: (feature columns, name shown in the
# printed message). The second message used to say "BB" even though the
# model was trained on H, BB and SO together.
feature_sets = [
    (['SO', 'H'], "H & SO"),
    (['H', 'BB', 'SO'], "H, BB & SO"),
]
for features, label in feature_sets:
    # Train on the training split only.
    ensemble.fit(train_2022[features], train_2022['AVG_group'])

    # X / Y always refer to the test split; after the loop they (and y_pred)
    # hold the values for the last feature set.
    X = test_2022[features]
    Y = test_2022['AVG_group']
    y_pred = ensemble.predict(X)
    print("Accuracy score for", label, "is ", accuracy_score(Y, y_pred))

from sklearn.metrics import confusion_matrix

# Confusion matrix for the last feature set above (H, BB, SO): rows are the
# true AVG_group, columns the predicted one. Only groups that occur in Y or
# y_pred get a row/column, so print the labels to make the matrix readable.
class_labels = sorted({int(c) for c in Y} | {int(c) for c in y_pred})
matrix = confusion_matrix(Y, y_pred, labels=class_labels)
print("AVG_group labels (row/column order):", class_labels)
print(matrix)

Accuracy score for H & SO is  0.4
Accuracy score for BB is  0.4777777777777778
[[ 0  1  2  0  0  0  0  0]
 [ 0  2  9  0  0  0  0  0]
 [ 0  1 19  2  0  0  0  0]
 [ 1  0 15 10  2  0  0  0]
 [ 0  0  4  3 10  0  0  0]
 [ 0  0  0  1  2  2  0  0]
 [ 0  0  0  0  2  0  0  1]
 [ 0  0  0  0  0  1  0  0]]
