In [121]:
import pandas as pd

data_2022 = pd.read_csv("2022stats.csv")

##First we will drop the columns that we do not need.
copy_2022 = data_2022.copy()
copy_2022.sort_values("Player")
copy_2022.drop(labels = ['G', 'Team', 'R', '2B', '3B', 'RBI', 'SB', 'CS', 'SH', 'SF', 'HBP'],axis='columns', inplace=True) 

#Now I will delete people that have lower than 100 At Bats(AB)
above_100_2022 = copy_2022['AB'] >= 100

copy_2022 = copy_2022[above_100_2022]


#Now we will combine the names that have multiple instances
copy_2022 = copy_2022.groupby('Player').agg({'AB' : 'sum', 'H' : 'sum','SO' : 'sum',  'BB' : 'sum', 'HR' : 'sum', 'AVG' : 'mean', 'OBP' : 'mean', 'SLG' : 'mean', 'OPS' : 'mean'})
def avg_group(x):
    if x < .150: return 1
    if x >= .150 and x < .175: return 2
    if x >= .175 and x < .200: return 3
    if x >= .200 and x < .225: return 4
    if x >= .225 and x < .250: return 5
    if x >= .250 and x < .275: return 6
    if x >= .275 and x < .300: return 7
    if x >= .300 and x < .325: return 8
    if x >= .325 and x < .350: return 9
    if x >= .350: return 10


    

copy_2022['AVG_group'] = copy_2022['AVG'].apply(avg_group)

print(copy_2022)

from sklearn.model_selection import train_test_split

train_2022, test_2022 = train_test_split(copy_2022, test_size=0.2, random_state=42)

                 AB    H   SO   BB  HR    AVG    OBP    SLG    OPS  AVG_group
Player                                                                       
AJ Pollock      489  120   98   32  14  0.245  0.292  0.389  0.681          5
Aaron Hicks     384   83  109   62   8  0.216  0.330  0.313  0.643          4
Aaron Judge     570  177  175  111  62  0.311  0.425  0.686  1.111          8
Abraham Toro    324   60   65   22  10  0.185  0.239  0.324  0.563          3
Adam Duvall     287   61  101   21  12  0.213  0.276  0.401  0.677          4
...             ...  ...  ...  ...  ..    ...    ...    ...    ...        ...
Yonathan Daza   372  112   58   26   2  0.301  0.349  0.384  0.733          8
Yordan Alvarez  470  144  106   78  37  0.306  0.406  0.613  1.019          8
Yoshi Tsutsugo  170   29   50   19   2  0.171  0.249  0.229  0.478          2
Yuli Gurriel    545  132   73   30   8  0.242  0.288  0.360  0.648          5
Zach McKinstry  155   32   48   13   4  0.206  0.272  0.361  0.6

# Markdown 1
I plan on using the average from 2022 and the groups I placed them in for this decision tree. I believe this one should have near perfect results.

In [122]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression


X = train_2022[['AVG']]
Y = train_2022['AVG_group']
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X,Y)

X = test_2022[['AVG']]
Y = test_2022['AVG_group']
from sklearn.metrics import confusion_matrix
y_pred = tree_classifier.predict(X)
matrix = confusion_matrix(Y, y_pred)
print(matrix)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy is ", accuracy_score(Y, y_pred))
print("Precision is ", precision_score(Y, y_pred, average='weighted'))
print("Sensitivity is ", recall_score(Y, y_pred, average='weighted'))
print("F1 is ", f1_score(Y, y_pred, average='weighted'))


[[ 3  0  0  0  0  0  0  0]
 [ 0 11  0  0  0  0  0  0]
 [ 0  0 22  0  0  0  0  0]
 [ 0  0  0 28  0  0  0  0]
 [ 0  0  0  0 17  0  0  0]
 [ 0  0  0  0  0  5  0  0]
 [ 0  0  0  0  0  0  3  0]
 [ 0  0  0  0  0  0  1  0]]
Accuracy is  0.9888888888888889
Precision is  0.9805555555555555
Sensitivity is  0.9888888888888889
F1 is  0.9841269841269841


  _warn_prf(average, modifier, msg_start, len(result))


# Markdown 2
The first analysis did very well it looks like based on the average it is a reasonable that you would be able to place the player in the correct category. Now I am going to take a look at the strike outs compared to average.

In [132]:
from sklearn.tree import DecisionTreeClassifier


X = train_2022[['SO']]
Y = train_2022['AVG_group']
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X,Y)

X = test_2022[['SO']]
Y = test_2022['AVG_group']

from sklearn.metrics import confusion_matrix
y_pred = tree_classifier.predict(X)
matrix = confusion_matrix(Y, y_pred)
print(matrix)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy is ", accuracy_score(Y, y_pred))
print("Precision is ", precision_score(Y, y_pred, average='weighted'))
print("Sensitivity is ", recall_score(Y, y_pred, average='weighted'))
print("F1 is ", f1_score(Y, y_pred, average='weighted'))


[[ 0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  2  1  0  0  0]
 [ 0  1  1  5  2  2  0  0  0]
 [ 1  0  2  7  6  6  0  0  0]
 [ 1  2  0  7 12  6  0  0  0]
 [ 1  1  0  4  3  4  4  0  0]
 [ 0  0  0  2  1  2  0  0  0]
 [ 0  0  0  0  2  1  0  0  0]
 [ 0  0  0  0  1  0  0  0  0]]
Accuracy is  0.26666666666666666
Precision is  0.2722642517125276
Sensitivity is  0.26666666666666666
F1 is  0.26001414702422543


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


This did not do well. There was only a 26% accuracy. It makes sense because the correlation between SO and AVG was very low. Next we will look at Hits vs Average.

In [133]:

X = train_2022[['H']]
Y = train_2022['AVG_group']
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X,Y)

X = test_2022[['H']]
Y = test_2022['AVG_group']

from sklearn.metrics import confusion_matrix
y_pred = tree_classifier.predict(X)
matrix = confusion_matrix(Y, y_pred)
print(matrix)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy is ", accuracy_score(Y, y_pred))
print("Precision is ", precision_score(Y, y_pred, average='weighted'))
print("Sensitivity is ", recall_score(Y, y_pred, average='weighted'))
print("F1 is ", f1_score(Y, y_pred, average='weighted'))

[[ 0  0  0  2  0  1  0  0]
 [ 2  4  2  3  0  0  0  0]
 [ 2  1  7  7  3  1  1  0]
 [ 3  2  6 11  4  2  0  0]
 [ 0  0  5  3  6  1  2  0]
 [ 0  0  0  1  3  1  0  0]
 [ 0  0  1  1  0  1  0  0]
 [ 0  0  0  0  0  1  0  0]]
Accuracy is  0.32222222222222224
Precision is  0.3513227513227513
Sensitivity is  0.32222222222222224
F1 is  0.33336365041791394


  _warn_prf(average, modifier, msg_start, len(result))


The hits were able to do better than the SO but I feel like there should still be a stronger correlation between average and the other feature. I am going to test more features and find which one does the best.

In [125]:
X = train_2022[['OBP']]
Y = train_2022['AVG_group']
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X,Y)

X = test_2022[['OBP']]
Y = test_2022['AVG_group']

from sklearn.metrics import confusion_matrix
y_pred = tree_classifier.predict(X)
matrix = confusion_matrix(Y, y_pred)
print(matrix)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy is ", accuracy_score(Y, y_pred))
print("Precision is ", precision_score(Y, y_pred, average='weighted'))
print("Sensitivity is ", recall_score(Y, y_pred, average='weighted'))
print("F1 is ", f1_score(Y, y_pred, average='weighted'))

[[ 1  0  2  0  0  0  0  0]
 [ 0  2  6  2  0  1  0  0]
 [ 0  4 10  3  5  0  0  0]
 [ 0  3 10  8  7  0  0  0]
 [ 0  0  3 10  4  0  0  0]
 [ 0  0  2  0  1  2  0  0]
 [ 0  0  0  1  1  0  1  0]
 [ 0  0  0  0  0  0  1  0]]
Accuracy is  0.3111111111111111
Precision is  0.3339506172839506
Sensitivity is  0.3111111111111111
F1 is  0.31001221001220997


  _warn_prf(average, modifier, msg_start, len(result))


In [134]:
X = train_2022[['SLG']]
Y = train_2022['AVG_group']
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X,Y)

X = test_2022[['SLG']]
Y = test_2022['AVG_group']

from sklearn.metrics import confusion_matrix
y_pred = tree_classifier.predict(X)
matrix = confusion_matrix(Y, y_pred)
print(matrix)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy is ", accuracy_score(Y, y_pred))
print("Precision is ", precision_score(Y, y_pred, average='weighted'))
print("Sensitivity is ", recall_score(Y, y_pred, average='weighted'))
print("F1 is ", f1_score(Y, y_pred, average='weighted'))

[[ 0  0  0  0  0  0  0  0  0]
 [ 0  0  2  1  0  0  0  0  0]
 [ 2  2  3  2  1  1  0  0  0]
 [ 0  1  4  3  9  3  2  0  0]
 [ 2  3  2  6 10  3  2  0  0]
 [ 0  0  0  3  7  6  0  0  1]
 [ 0  0  0  1  2  2  0  0  0]
 [ 0  0  0  2  1  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0]]
Accuracy is  0.24444444444444444
Precision is  0.2533333333333333
Sensitivity is  0.24444444444444444
F1 is  0.24811302681992337


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [135]:
X = train_2022[['BB']]
Y = train_2022['AVG_group']
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X,Y)

X = test_2022[['BB']]
Y = test_2022['AVG_group']

from sklearn.metrics import confusion_matrix
y_pred = tree_classifier.predict(X)
matrix = confusion_matrix(Y, y_pred)
print(matrix)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy is ", accuracy_score(Y, y_pred))
print("Precision is ", precision_score(Y, y_pred, average='weighted'))
print("Sensitivity is ", recall_score(Y, y_pred, average='weighted'))
print("F1 is ", f1_score(Y, y_pred, average='weighted'))

[[ 0  0  1  0  1  1  0  0]
 [ 0  0  2  6  3  0  0  0]
 [ 0  0  8  9  5  0  0  0]
 [ 0  0  7 14  7  0  0  0]
 [ 0  0  9  3  4  1  0  0]
 [ 0  0  1  2  2  0  0  0]
 [ 0  0  1  1  0  1  0  0]
 [ 0  0  1  0  0  0  0  0]]
Accuracy is  0.28888888888888886
Precision is  0.22397306397306396
Sensitivity is  0.28888888888888886
F1 is  0.2522317188983855


  _warn_prf(average, modifier, msg_start, len(result))


# The Test Set
In each decision tree classifier I trained my data using the training set and then tested it on the test set. If I were to use the training set in finding my metrics it would be overfit because the decision tree would know the what the data that I'm trying to guess is.

# Try out SVC

I want to use SVC and see if it is able to get the same results that my DecisionTreeClassifier was able to get.

In [129]:
from sklearn.svm import SVC
X = train_2022[['AVG']]
Y = train_2022['AVG_group']

svm_classifier = SVC(kernel='poly')
svm_classifier.fit(X, Y)

X = test_2022[['AVG']]
Y = test_2022['AVG_group']


from sklearn.metrics import confusion_matrix
y_predicted = svm_classifier.predict(X)
matrix = confusion_matrix(Y, y_predicted)
print(matrix)
print ("Accuracy is ", accuracy_score(Y, y_predicted))
print ("Precision is ", precision_score(Y, y_predicted, average="weighted"))
print ("Sensitivity is ", recall_score(Y, y_predicted, average="weighted"))
print ("F1 is ", f1_score(Y, y_predicted, average="weighted"))


[[ 3  0  0  0  0  0  0  0]
 [ 0 11  0  0  0  0  0  0]
 [ 0  0 22  0  0  0  0  0]
 [ 0  0  0 28  0  0  0  0]
 [ 0  0  0  0 17  0  0  0]
 [ 0  0  0  0  0  5  0  0]
 [ 0  0  0  0  0  0  3  0]
 [ 0  0  0  0  0  0  1  0]]
Accuracy is  0.9888888888888889
Precision is  0.9805555555555555
Sensitivity is  0.9888888888888889
F1 is  0.9841269841269841


  _warn_prf(average, modifier, msg_start, len(result))


In [130]:
from sklearn.svm import SVC
X = train_2022[['H']]
Y = train_2022['AVG_group']

svm_classifier = SVC(kernel='poly')
svm_classifier.fit(X, Y)

X = test_2022[['H']]
Y = test_2022['AVG_group']


from sklearn.metrics import confusion_matrix
y_predicted = svm_classifier.predict(X)
matrix = confusion_matrix(Y, y_predicted)
print(matrix)
print ("Accuracy is ", accuracy_score(Y, y_predicted))
print ("Precision is ", precision_score(Y, y_predicted, average="weighted"))
print ("Sensitivity is ", recall_score(Y, y_predicted, average="weighted"))
print ("F1 is ", f1_score(Y, y_predicted, average="weighted"))



[[ 0  0  3  0  0  0  0  0]
 [ 0  0 10  1  0  0  0  0]
 [ 0  0 19  3  0  0  0  0]
 [ 0  0 16  4  8  0  0  0]
 [ 0  0  4  2 11  0  0  0]
 [ 0  0  0  0  5  0  0  0]
 [ 0  0  0  1  2  0  0  0]
 [ 0  0  0  0  1  0  0  0]]
Accuracy is  0.37777777777777777
Precision is  0.2794022849578405
Sensitivity is  0.37777777777777777
F1 is  0.2837876337876338


  _warn_prf(average, modifier, msg_start, len(result))


In [131]:
from sklearn.svm import SVC
X = train_2022[['SO']]
Y = train_2022['AVG_group']

svm_classifier = SVC(kernel='poly')
svm_classifier.fit(X, Y)

X = test_2022[['SO']]
Y = test_2022['AVG_group']


from sklearn.metrics import confusion_matrix
y_predicted = svm_classifier.predict(X)
matrix = confusion_matrix(Y, y_predicted)
print(matrix)
print ("Accuracy is ", accuracy_score(Y, y_predicted))
print ("Precision is ", precision_score(Y, y_predicted, average="weighted"))
print ("Sensitivity is ", recall_score(Y, y_predicted, average="weighted"))
print ("F1 is ", f1_score(Y, y_predicted, average="weighted"))


[[ 0  0  0  0  3  0  0  0]
 [ 0  0  0  0 11  0  0  0]
 [ 0  0  0  0 22  0  0  0]
 [ 0  0  0  0 28  0  0  0]
 [ 0  0  0  0 17  0  0  0]
 [ 0  0  0  0  5  0  0  0]
 [ 0  0  0  0  3  0  0  0]
 [ 0  0  0  0  1  0  0  0]]
Accuracy is  0.18888888888888888
Precision is  0.035679012345679016
Sensitivity is  0.18888888888888888
F1 is  0.06002076843198338


  _warn_prf(average, modifier, msg_start, len(result))


SVC did much worse that decision trees did. I believe because the decision tree is much better at dealing with categorical data. I changed my data and made it categorical so the decision tree was going to work much better. I tried all of the different kernels in SVC but it still did not help the performance of the model.