# Golf Predictor 

## (0) Load Data

In [1]:
import pandas as pd
import numpy
# dfRaw is raw dataset read from csv
dfRaw = pd.read_csv("../data/combined.csv")
distance_dict = {"A" : 1, "B" : 2, "C" : 3, "D" : 4, "E" : 5, "F" : 6}

In [2]:
def convert_rank_to_class(raw_ans):
    ans = ""
    if raw_ans < 21:
        ans = "A"
    elif raw_ans < 41:
        ans = "B"
    elif raw_ans < 61:
        ans = "C"
    elif raw_ans < 101:
        ans = "D"
    elif raw_ans < 201:
        ans = "E"
    else:
        ans = "F"
    return ans

In [3]:
dfConv = dfRaw

# drop label 
dfConv = dfConv.drop(["owgr_rank_year_plus_two"], axis=1)
dfConv["owgr_rank_year_plus_two"] = dfRaw["owgr_rank_year_plus_two"]

# remove all ranks greater than 300
for index, row in dfConv.iterrows():
    
    if row["top_tens"] != row["top_tens"]: # https://stackoverflow.com/questions/944700/how-can-i-check-for-nan-values?rq=1
        dfConv.loc[index, "top_tens"] = 0.0
    
    # remove all ranks greater than 300 and convert ranks to classes
    if row["owgr_rank_year_plus_two"] > 300:
        dfConv.drop(index, inplace=True) #https://stackoverflow.com/questions/28876243/how-to-delete-the-current-row-in-pandas-dataframe-during-df-iterrows
    else:
        dfConv.loc[index, "owgr_rank_year_plus_two"] = convert_rank_to_class(row["owgr_rank_year_plus_two"]) # conver rank to class


### (0.1) K-Fold

In [4]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
kf.get_n_splits()
trainSets = []
testSets = []
for k in kf.split(dfConv):
    trainSets.append(dfConv.iloc[k[0]])
    testSets.append(dfConv.iloc[k[1]])

### (0.2) Find Feature Names

In [5]:
featureNames = dfConv.columns[2:-1] # Skip ID, name, year, rounds, and rank as label
nFeatures = featureNames.shape[0]
print("Number of features:", nFeatures)
featureNames

Number of features: 13


Index(['owgr_points_current_avg', 'driving_dist_avg', 'top_tens', 'gir_pct',
       'sg_p_avg', 'driving_acc_pct', 'scrambling_pct', 'adj_scoring_avg',
       'sg_ott_avg', 'sg_apr_avg', 'sg_arg_avg', 'rounds', 'year'],
      dtype='object')

### (0.3) Select Training Instances (features and values) from the Training Data

In [6]:
trainingInstancesList = []
for train in trainSets:
    trainingInstances_x = []
    trainingInstances_y = []
    for instance in train.to_numpy():
        featureValues = list(instance[2:-1])
        label = instance[-1]
        trainingInstances_x.append(featureValues)
        trainingInstances_y.append(label)
    nTrainingInstances = len(trainingInstances_x)
    trainingInstancesList.append((trainingInstances_x, trainingInstances_y, nTrainingInstances))
    print("Number of training instances:", nTrainingInstances)

Number of training instances: 1346
Number of training instances: 1346
Number of training instances: 1346
Number of training instances: 1347
Number of training instances: 1347


### (0.4) Select Test Instances from the Data

In [7]:
testInstancesList = []
for test in testSets:    
    testInstances_x = []
    testInstances_y = []
    for instance in test.to_numpy():
        featureValues = list(instance[2:-1])
        label = instance[-1]
        testInstances_x.append(featureValues)
        testInstances_y.append(label)
    nTrainingInstances = len(trainingInstances_x)
    print("Number of training instances:", nTrainingInstances)
    testInstancesList.append((testInstances_x, testInstances_y, nTrainingInstances))

Number of training instances: 1347
Number of training instances: 1347
Number of training instances: 1347
Number of training instances: 1347
Number of training instances: 1347


## 1. Support Vector Classification

### 1.1 Linear SVM

In [8]:
from sklearn.svm import LinearSVC
from sklearn import metrics
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = LinearSVC()
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(testInstances_y, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(testInstances_y, y_pred, average=None))
    
    #print("Predicted:", y_pred)
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    y_pred_svm = y_pred
    
    
#print("Overall Precision: ", numpy.divide(prec / [len(testSets), len(testSets), len(testSets), len(testSets), len(testSets), len(testSets)]))
#print("Overall Recall: ", numpy.divide(recall / [len(testSets), len(testSets), len(testSets), len(testSets), len(testSets), len(testSets)]))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.32047477744807124
F1 Micro: 0.32047477744807124
F1 Macro: 0.1645311978645312


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.19584569732937684
F1 Micro: 0.19584569732937684
F1 Macro: 0.06828181565023671


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.21364985163204747
F1 Micro: 0.21364985163204747
F1 Macro: 0.11500268384326356


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.30952380952380953
F1 Micro: 0.30952380952380953
F1 Macro: 0.10097001763668433
Accuracy: 0.25595238095238093
F1 Micro: 0.25595238095238093
F1 Macro: 0.1360506234878094
Overall MAE:  1.669588102303236
Overall Accuracy:  0.2590893033771372
Overall F1 Micro:  0.2590893033771372
Overall F1 Macro:  0.11696726769650505


  _warn_prf(average, modifier, msg_start, len(result))


### 1.2 Kernel SVM  

Kernel is Radius Basis Function: Gaussian:  

In [9]:
from sklearn.svm import SVC
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
acc_stdev = []
f1micro_stdev = []
f1macro_stdev = []
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = SVC()
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(testInstances_y, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(testInstances_y, y_pred, average=None))
    
    
    #print("Predicted:", y_pred)
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    acc_stdev.append(metrics.accuracy_score(testInstances_y, y_pred))
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    f1micro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='micro'))
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    f1macro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='macro'))
    y_pred_svm = y_pred

print("Overall Precision: ", numpy.divide(prec, len(testSets)))
print("Overall Recall: ", numpy.divide(recall, len(testSets)))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

print("Accuracy Stdev: ", numpy.std(acc_stdev, ddof=1))
print("F1 Micro Stdev: ", numpy.std(f1micro_stdev, ddof=1))
print("F1 Macro Stdev: ", numpy.std(f1macro_stdev, ddof=1))

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.2878338278931751
F1 Micro: 0.2878338278931751
F1 Macro: 0.07450076804915515


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.27596439169139464
F1 Micro: 0.27596439169139464
F1 Macro: 0.07209302325581396


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.2878338278931751
F1 Micro: 0.2878338278931751
F1 Macro: 0.07450076804915515


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.2976190476190476
F1 Micro: 0.2976190476190476
F1 Macro: 0.0764525993883792
Accuracy: 0.2767857142857143
F1 Micro: 0.2767857142857143
F1 Macro: 0.07226107226107226
Overall Precision:  [0.         0.         0.         0.         0.28520736 0.        ]
Overall Recall:  [0. 0. 0. 0. 1. 0.]
Overall MAE:  1.4949095662003673
Overall Accuracy:  0.2852073618765013
Overall F1 Micro:  0.2852073618765013
Overall F1 Macro:  0.07396164620071514
Accuracy Stdev:  0.00900282228569729
F1 Micro Stdev:  0.00900282228569729
F1 Macro Stdev:  0.0018145148079078991


  _warn_prf(average, modifier, msg_start, len(result))


Kernel is Quadratic kernel ("Degree-2 polynomial kernel"):

In [10]:
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
acc_stdev = []
f1micro_stdev = []
f1macro_stdev = []
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = SVC(kernel='poly', degree=2)
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(testInstances_y, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(testInstances_y, y_pred, average=None))
    
    #print("Predicted:", y_pred)
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    acc_stdev.append(metrics.accuracy_score(testInstances_y, y_pred))
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    f1micro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='micro'))
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    f1macro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='macro'))
    y_pred_svm = y_pred
print("Overall Precision: ", numpy.divide(prec, len(testSets)))
print("Overall Recall: ", numpy.divide(recall, len(testSets)))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

print("Accuracy Stdev: ", numpy.std(acc_stdev, ddof=1))
print("F1 Micro Stdev: ", numpy.std(f1micro_stdev, ddof=1))
print("F1 Macro Stdev: ", numpy.std(f1macro_stdev, ddof=1))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.2878338278931751
F1 Micro: 0.2878338278931751
F1 Macro: 0.07450076804915515
Accuracy: 0.27596439169139464
F1 Micro: 0.27596439169139464
F1 Macro: 0.07209302325581396


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.2878338278931751
F1 Micro: 0.2878338278931751
F1 Macro: 0.07450076804915515
Accuracy: 0.2976190476190476
F1 Micro: 0.2976190476190476
F1 Macro: 0.0764525993883792
Accuracy: 0.2767857142857143
F1 Micro: 0.2767857142857143
F1 Macro: 0.07226107226107226
Overall Precision:  [0.         0.         0.         0.         0.28520736 0.        ]
Overall Recall:  [0. 0. 0. 0. 1. 0.]
Overall MAE:  1.4949095662003673
Overall Accuracy:  0.2852073618765013
Overall F1 Micro:  0.2852073618765013
Overall F1 Macro:  0.07396164620071514
Accuracy Stdev:  0.00900282228569729
F1 Micro Stdev:  0.00900282228569729
F1 Macro Stdev:  0.0018145148079078991


  _warn_prf(average, modifier, msg_start, len(result))


## 2. KNN

### Training KNN Classifiers

In [11]:
# KNN WITH n_neighbors=5
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
acc_stdev = []
f1micro_stdev = []
f1macro_stdev = []
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = KNeighborsClassifier() # n_neighbors=5
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    #print("Predicted:", y_pred)
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(testInstances_y, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(testInstances_y, y_pred, average=None))
    
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    acc_stdev.append(metrics.accuracy_score(testInstances_y, y_pred))
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    f1micro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='micro'))
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    f1macro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='macro'))
    y_pred_svm = y_pred
print("Overall Precision: ", numpy.divide(prec, len(testSets)))
print("Overall Recall: ", numpy.divide(recall, len(testSets)))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

print("Accuracy Stdev: ", numpy.std(acc_stdev, ddof=1))
print("F1 Micro Stdev: ", numpy.std(f1micro_stdev, ddof=1))
print("F1 Macro Stdev: ", numpy.std(f1macro_stdev, ddof=1))

Accuracy: 0.29376854599406527
F1 Micro: 0.29376854599406527
F1 Macro: 0.23218674672180958
Accuracy: 0.27299703264094954
F1 Micro: 0.27299703264094954
F1 Macro: 0.23231607365473636
Accuracy: 0.28486646884273
F1 Micro: 0.28486646884273
F1 Macro: 0.25493801344420725
Accuracy: 0.27976190476190477
F1 Micro: 0.27976190476190477
F1 Macro: 0.22758595671639148
Accuracy: 0.2857142857142857
F1 Micro: 0.2857142857142857
F1 Macro: 0.2562070031003582
Overall Precision:  [0.38454445 0.17476101 0.0774359  0.17815657 0.35116887 0.26940716]
Overall Recall:  [0.48426001 0.16098638 0.05464926 0.18418819 0.42102649 0.19784876]
Overall MAE:  1.4433004804295606
Overall Accuracy:  0.28342164759078703
Overall F1 Micro:  0.28342164759078703
Overall F1 Macro:  0.24064675872750058
Accuracy Stdev:  0.007688763213434875
F1 Micro Stdev:  0.007688763213434875
F1 Macro Stdev:  0.013765153762419819


In [12]:
# KNN WITH n_neighbors=3
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
acc_stdev = []
f1micro_stdev = []
f1macro_stdev = []
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = KNeighborsClassifier(n_neighbors=3) # n_neighbors = 3
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    #print("Predicted:", y_pred)

    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(testInstances_y, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(testInstances_y, y_pred, average=None))
    
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    acc_stdev.append(metrics.accuracy_score(testInstances_y, y_pred))
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    f1micro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='micro'))
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    f1macro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='macro'))
    y_pred_svm = y_pred
    
print("Overall Precision: ", numpy.divide(prec, len(testSets)))
print("Overall Recall: ", numpy.divide(recall, len(testSets)))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

print("Accuracy Stdev: ", numpy.std(acc_stdev, ddof=1))
print("F1 Micro Stdev: ", numpy.std(f1micro_stdev, ddof=1))
print("F1 Macro Stdev: ", numpy.std(f1macro_stdev, ddof=1))

Accuracy: 0.26112759643916916
F1 Micro: 0.26112759643916916
F1 Macro: 0.23112623308875543
Accuracy: 0.22255192878338279
F1 Micro: 0.22255192878338276
F1 Macro: 0.20359842982944684
Accuracy: 0.2344213649851632
F1 Micro: 0.2344213649851632
F1 Macro: 0.21010058122876377
Accuracy: 0.26785714285714285
F1 Micro: 0.26785714285714285
F1 Macro: 0.23080318601055314
Accuracy: 0.25892857142857145
F1 Micro: 0.25892857142857145
F1 Macro: 0.2418149164806831
Overall Precision:  [0.3349094  0.14213983 0.08022887 0.20400299 0.35101328 0.23722462]
Overall Recall:  [0.54721486 0.20422648 0.08292627 0.1887061  0.26838865 0.15498073]
Overall MAE:  1.608398685883849
Overall Accuracy:  0.2489773208986859
Overall F1 Micro:  0.24897732089868585
Overall F1 Macro:  0.22348866932764047
Accuracy Stdev:  0.01945045925553679
F1 Micro Stdev:  0.0194504592555368
F1 Macro Stdev:  0.015988655348894206


## 4. Gaussian Naive Bayes

In [13]:
from sklearn.naive_bayes import GaussianNB
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
acc_stdev = []
f1micro_stdev = []
f1macro_stdev = []
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = GaussianNB()
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    #print("Predicted:"print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(testInstances_y, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(testInstances_y, y_pred, average=None))
    
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    acc_stdev.append(metrics.accuracy_score(testInstances_y, y_pred))
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    f1micro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='micro'))
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    f1macro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='macro'))
    y_pred_svm = y_pred

print("Overall Precision: ", numpy.divide(prec, len(testSets)))
print("Overall Recall: ", numpy.divide(recall, len(testSets)))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

print("Accuracy Stdev: ", numpy.std(acc_stdev, ddof=1))
print("F1 Micro Stdev: ", numpy.std(f1micro_stdev, ddof=1))
print("F1 Macro Stdev: ", numpy.std(f1macro_stdev, ddof=1))

Accuracy: 0.42136498516320475
F1 Micro: 0.42136498516320475
F1 Macro: 0.3368244106552645
Accuracy: 0.36795252225519287
F1 Micro: 0.36795252225519287
F1 Macro: 0.31810963133788234
Accuracy: 0.3264094955489614
F1 Micro: 0.3264094955489614
F1 Macro: 0.288230457980347
Accuracy: 0.34226190476190477
F1 Micro: 0.34226190476190477
F1 Macro: 0.2914765751554731
Accuracy: 0.39880952380952384
F1 Micro: 0.39880952380952384
F1 Macro: 0.32952812769192874
Overall Precision:  [0.59338227 0.27223535 0.125      0.21027489 0.38193731 0.35335912]
Overall Recall:  [0.56552414 0.29344927 0.01311828 0.15282949 0.51117747 0.42160046]
Overall MAE:  1.0986134661579765
Overall Accuracy:  0.37135968630775745
Overall F1 Micro:  0.37135968630775745
Overall F1 Macro:  0.3128338405641792
Accuracy Stdev:  0.03915620257225982
F1 Micro Stdev:  0.03915620257225982
F1 Macro Stdev:  0.02204279850989457


# 5. Naive Solution - Random Guess

In [14]:
import random
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
acc_stdev = []
f1micro_stdev = []
f1macro_stdev = []
representative_sample = ["A", "A", "B", "B", "C", "C", "D", "D", "D", "D", "E", "E", "E", "E", "E", "E", "E", "E", "E",
                        "E", "F","F", "F", "F", "F", "F", "F", "F", "F", "F"]

for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    y_pred = [random.sample(representative_sample, 1)[0] for y in testInstances_y]
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    

    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(testInstances_y, y_pred, average=None)) # https://www.educative.io/edpresso/how-to-add-one-array-to-another-array-in-python
    recall = numpy.add(recall, metrics.recall_score(testInstances_y, y_pred, average=None))
    
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    acc_stdev.append(metrics.accuracy_score(testInstances_y, y_pred))
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    f1micro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='micro'))
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    f1macro_stdev.append(metrics.f1_score(testInstances_y, y_pred, average='macro'))
    y_pred_svm = y_pred
print("Overall Precision: ", numpy.divide(prec, len(testSets)))
print("Overall Recall: ", numpy.divide(recall, len(testSets)))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

print("Accuracy Stdev: ", numpy.std(acc_stdev, ddof=1))
print("F1 Micro Stdev: ", numpy.std(f1micro_stdev, ddof=1))
print("F1 Macro Stdev: ", numpy.std(f1macro_stdev, ddof=1))

Accuracy: 0.17210682492581603
F1 Micro: 0.17210682492581603
F1 Macro: 0.1455795701199647
Accuracy: 0.2195845697329377
F1 Micro: 0.2195845697329377
F1 Macro: 0.15909187810580952
Accuracy: 0.18397626112759644
F1 Micro: 0.18397626112759644
F1 Macro: 0.13629870295548277
Accuracy: 0.17261904761904762
F1 Micro: 0.17261904761904762
F1 Macro: 0.1293027758607469
Accuracy: 0.18154761904761904
F1 Micro: 0.18154761904761904
F1 Macro: 0.13965868784559932
Overall Precision:  [0.21500733 0.08969714 0.05781513 0.12033726 0.24852813 0.18453173]
Overall Recall:  [0.08680292 0.050265   0.03401434 0.11890283 0.2941951  0.32215033]
Overall MAE:  1.864483538222411
Overall Accuracy:  0.18596686449060335
Overall F1 Micro:  0.18596686449060335
Overall F1 Macro:  0.14198632297752062
Accuracy Stdev:  0.01951862731961884
F1 Micro Stdev:  0.01951862731961884
F1 Macro Stdev:  0.011226650386017554


# 6. Naive Solution - Use Last Year's Rank

In [16]:
import random
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]

main_data = pd.read_csv("../data/combined.csv")
df_continuity = pd.read_csv("../owgr_2004-2019.csv")

y_pred = []
y_ans = []

# run predictions from 2004 to 2019 (using players in main_data)
for index, row in main_data.iterrows():
            
    if row["year"] < 2004 or row["year"] > 2019:
        continue
        
    player = (df_continuity['name'] == row['name']) & (df_continuity['year'] == row['year'])
    player_ranking_data = df_continuity.loc[player]

    current_rank = player_ranking_data['current_rank'].values[0]
    rank_year_plus_two = player_ranking_data['rank_year_plus_two'].values[0]

    if rank_year_plus_two > 300:
        continue

    guess_instance = convert_rank_to_class(current_rank)
    ans_instance = convert_rank_to_class(rank_year_plus_two)

    y_pred.append(guess_instance)
    y_ans.append(ans_instance)

print(len(y_pred))
print(len(y_ans))

# CALCULATIONS FOR MAE
y_pred_as_distance = [distance_dict[a] for a in y_pred]
y_ans_as_distance = [distance_dict[a] for a in y_ans]
MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)

# Calculations for Precision and Recall
prec = numpy.add(prec, metrics.precision_score(y_ans, y_pred, average=None)) # from https://stackoverflow.com/questions/45890328/sklearn-metrics-for-multiclass-classification
recall = numpy.add(recall, metrics.recall_score(y_ans, y_pred, average=None))

print("Accuracy:", metrics.accuracy_score(y_ans, y_pred))
acc_sum += metrics.accuracy_score(y_ans, y_pred)
print("F1 Micro:", metrics.f1_score(y_ans, y_pred, average='micro'))
f1micro_sum += metrics.f1_score(y_ans, y_pred, average='micro')
print("F1 Macro:", metrics.f1_score(y_ans, y_pred, average='macro'))
f1macro_sum += metrics.f1_score(y_ans, y_pred, average='macro')

print("Overall Precision: ", prec/ 1)
print("Overall Recall: ", recall/ 1)
print("Overall MAE: ", MAE_sum / 1)
print("Overall Accuracy: ", acc_sum / 1)
print("Overall F1 Micro: ", f1micro_sum / 1)
print("Overall F1 Macro: ", f1macro_sum / 1)

1683
1683
Accuracy: 0.3196672608437314
F1 Micro: 0.3196672608437314
F1 Macro: 0.307310147545276
Overall Precision:  [0.54612546 0.22943723 0.14754098 0.20437956 0.35159817 0.34965035]
Overall Recall:  [0.59919028 0.265      0.15697674 0.20895522 0.32083333 0.3164557 ]
Overall MAE:  1.1354723707664884
Overall Accuracy:  0.3196672608437314
Overall F1 Micro:  0.3196672608437314
Overall F1 Macro:  0.307310147545276
