# Golf Predictor 

## (0) Load Data

In [1]:
import pandas as pd
# dfRaw is raw dataset read from csv
dfRaw = pd.read_csv("../data/combined.csv")
distance_dict = {"A" : 1, "B" : 2, "C" : 3, "D" : 4, "E" : 5, "F" : 6}

In [2]:
def convert_rank_to_class(raw_ans):
    ans = ""
    if raw_ans < 21:
        ans = "A"
    elif raw_ans < 41:
        ans = "B"
    elif raw_ans < 61:
        ans = "C"
    elif raw_ans < 101:
        ans = "D"
    elif raw_ans < 201:
        ans = "E"
    else:
        ans = "F"
    return ans

In [3]:
dfConv = dfRaw

# remove all ranks greater than 300
for index, row in dfConv.iterrows():
    if row["owgr_rank_year_plus_two"] > 300:
        dfConv.drop(index, inplace=True) #https://stackoverflow.com/questions/28876243/how-to-delete-the-current-row-in-pandas-dataframe-during-df-iterrows
    else:
        dfConv.loc[index, "owgr_rank_year_plus_two"] = convert_rank_to_class(row["owgr_rank_year_plus_two"]) # conver rank to class

# drop label 
dfConv = dfConv.drop(["owgr_rank_year_plus_two"], axis=1)
print(dfConv)

      Unnamed: 0            name  owgr_points_current_avg  driving_dist_avg  \
0              0     Tiger Woods                    13.22             293.2   
1              1      Adam Scott                     9.25             297.8   
2              2  Phil Mickelson                     8.52             287.9   
3              3  Henrik Stenson                     8.23             290.9   
4              4     Justin Rose                     7.78             296.6   
...          ...             ...                      ...               ...   
2364        2364     Matt Kuchar                     0.45             287.3   
2370        2370    Jason Dufner                     0.33             290.7   
2373        2373     Chris Couch                     0.27             302.1   
2374        2374     Omar Uresti                     0.27             272.2   
2375        2375        Ken Duke                     0.17             284.3   

      top_tens  gir_pct  sg_p_avg  driving_acc_pct 

### (0.1) K-Fold

In [4]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
kf.get_n_splits()
trainSets = []
testSets = []
for k in kf.split(dfRaw):
    trainSets.append(dfConv.iloc[k[0]])
    testSets.append(dfConv.iloc[k[1]])

### (0.2) Find Feature Names

In [5]:
featureNames = dfConv.columns[3:-1] # Skip ID, name, year, rounds, and rank as label
nFeatures = featureNames.shape[0]
print("Number of features:", nFeatures)
featureNames

Number of features: 11


Index(['driving_dist_avg', 'top_tens', 'gir_pct', 'sg_p_avg',
       'driving_acc_pct', 'scrambling_pct', 'adj_scoring_avg', 'sg_ott_avg',
       'sg_apr_avg', 'sg_arg_avg', 'rounds'],
      dtype='object')

### (0.3) Select Training Instances (features and values) from the Training Data

In [6]:
trainingInstancesList = []
for train in trainSets:
    trainingInstances_x = []
    trainingInstances_y = []
    for instance in train.to_numpy():
        featureValues = list(instance[3:-1])
        label = instance[-1]
        trainingInstances_x.append(featureValues)
        trainingInstances_y.append(label)
    nTrainingInstances = len(trainingInstances_x)
    trainingInstancesList.append((trainingInstances_x, trainingInstances_y, nTrainingInstances))
    print("Number of training instances:", nTrainingInstances)

Number of training instances: 1346
Number of training instances: 1346
Number of training instances: 1346
Number of training instances: 1347
Number of training instances: 1347


### (0.4) Select Test Instances from the Data

In [7]:
testInstancesList = []
for test in testSets:    
    testInstances_x = []
    testInstances_y = []
    for instance in test.to_numpy():
        featureValues = list(instance[3:-1])
        label = instance[-1]
        testInstances_x.append(featureValues)
        testInstances_y.append(label)
    nTrainingInstances = len(trainingInstances_x)
    print("Number of training instances:", nTrainingInstances)
    testInstancesList.append((testInstances_x, testInstances_y, nTrainingInstances))

Number of training instances: 1347
Number of training instances: 1347
Number of training instances: 1347
Number of training instances: 1347
Number of training instances: 1347


## 1. Support Vector Classification

### 1.1 Linear SVM

In [8]:
from sklearn.svm import LinearSVC
from sklearn import metrics
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = LinearSVC()
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(y_ans, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(y_ans, y_pred, average=None))
    
    #print("Predicted:", y_pred)
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    y_pred_svm = y_pred
print("Overall Precision: ", prec / len(testSets))
print("Overall Recall: ", recall / len(testSets))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### 1.2 Kernel SVM  

Kernel is Radius Basis Function: Gaussian:  

In [None]:
from sklearn.svm import SVC
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = SVC()
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(y_ans, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(y_ans, y_pred, average=None))
    
    
    #print("Predicted:", y_pred)
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    y_pred_svm = y_pred
    
print("Overall Precision: ", prec / len(testSets))
print("Overall Recall: ", recall / len(testSets))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

Kernel is Quadratic kernel ("Degree-2 polynomial kernel"):

In [None]:
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = SVC(kernel='poly', degree=2)
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(y_ans, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(y_ans, y_pred, average=None))
    
    #print("Predicted:", y_pred)
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    y_pred_svm = y_pred
print("Overall Precision: ", prec / len(testSets))
print("Overall Recall: ", recall / len(testSets))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

## 2. KNN

### Training KNN Classifiers

In [None]:
# KNN WITH n_neighbors=5
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = KNeighborsClassifier() # n_neighbors=5
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    #print("Predicted:", y_pred)
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(y_ans, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(y_ans, y_pred, average=None))
    
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    y_pred_svm = y_pred
print("Overall Precision: ", prec / len(testSets))
print("Overall Recall: ", recall / len(testSets))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

In [None]:
# KNN WITH n_neighbors=3
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = KNeighborsClassifier(n_neighbors=3) # n_neighbors = 3
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    #print("Predicted:", y_pred)

    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(y_ans, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(y_ans, y_pred, average=None))
    
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    y_pred_svm = y_pred
    
print("Overall Precision: ", prec / len(testSets))
print("Overall Recall: ", recall / len(testSets))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

## 4. Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    clf = GaussianNB()
    clf.fit(trainingInstances_x, trainingInstances_y)
    y_pred = clf.predict(testInstances_x)
    #print("Predicted:"print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(y_ans, y_pred, average=None))
    recall = numpy.add(recall, metrics.recall_score(y_ans, y_pred, average=None))
    
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    y_pred_svm = y_pred
    
print("Overall Precision: ", prec/ len(testSets))
print("Overall Recall: ", recall/ len(testSets))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

# 5. Naive Solution - Random Guess

In [None]:
import random
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0
prec = [0,0,0,0,0,0]
recall = [0,0,0,0,0,0]
representative_sample = ["A", "A", "B", "B", "C", "C", "D", "D", "D", "D", "E", "E", "E", "E", "E", "E", "E", "E", "E",
                        "E", "F","F", "F", "F", "F", "F", "F", "F", "F", "F"]

for trainingInstances, testInstances in zip(trainingInstancesList, testInstancesList):
    trainingInstances_x = trainingInstances[0]
    trainingInstances_y = trainingInstances[1]
    testInstances_x = testInstances[0]
    testInstances_y = testInstances[1]
    y_pred = [random.sample(representative_sample, 1)[0] for y in testInstances_y]
    
    # CALCULATIONS FOR MAE
    y_pred_as_distance = [distance_dict[a] for a in y_pred]
    y_ans_as_distance = [distance_dict[a] for a in testInstances_y]
    MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)
    
    # Calculations for Precision and Recall
    prec = numpy.add(prec, metrics.precision_score(y_ans, y_pred, average=None)) # https://www.educative.io/edpresso/how-to-add-one-array-to-another-array-in-python
    recall = numpy.add(recall, metrics.recall_score(y_ans, y_pred, average=None))
    
    print("Accuracy:", metrics.accuracy_score(testInstances_y, y_pred))
    acc_sum += metrics.accuracy_score(testInstances_y, y_pred)
    print("F1 Micro:", metrics.f1_score(testInstances_y, y_pred, average='micro'))
    f1micro_sum += metrics.f1_score(testInstances_y, y_pred, average='micro')
    print("F1 Macro:", metrics.f1_score(testInstances_y, y_pred, average='macro'))
    f1macro_sum += metrics.f1_score(testInstances_y, y_pred, average='macro')
    y_pred_svm = y_pred
print("Overall Precision: ", prec/ len(testSets))
print("Overall Recall: ", recall/ len(testSets))
print("Overall MAE: ", MAE_sum / len(testSets))
print("Overall Accuracy: ", acc_sum / len(testSets))
print("Overall F1 Micro: ", f1micro_sum / len(testSets))
print("Overall F1 Macro: ", f1macro_sum / len(testSets))

# 6. Naive Solution - Use Last Year's Rank

In [None]:
import random
acc_sum = 0
f1micro_sum = 0
f1macro_sum = 0
MAE_sum = 0


main_data = pd.read_csv("../data/combined.csv")
df_continuity = pd.read_csv("../owgr_2004-2019.csv")

y_pred = []
y_ans = []

# run predictions from 2004 to 2019 (using players in main_data)
for index, row in main_data[main_data['year'] >= 2004 & main_data['year'] <= 2019].iterrows():
    player = (df_continuity['name'] == row['name']) & (df_continuity['year'] == row['year'])
    player_ranking_data = df_continuity.loc[player]

    current_rank = player_ranking_data['current_rank'].values[0]
    rank_year_plus_two = player_ranking_data['rank_year_plus_two'].values[0]

    if rank_year_plus_two > 300:
        continue

    guess_instance = convert_rank_to_class(current_rank)
    ans_instance = convert_rank_to_class(rank_year_plus_two)

    y_pred.append(guess_instance)
    y_ans.append(ans_instance)

print(len(y_pred))
print(len(y_ans))

# CALCULATIONS FOR MAE
y_pred_as_distance = [distance_dict[a] for a in y_pred]
y_ans_as_distance = [distance_dict[a] for a in y_ans]
MAE_sum += metrics.mean_absolute_error(y_ans_as_distance, y_pred_as_distance)

# Calculations for Precision and Recall
prec = numpy.add(prec, metrics.precision_score(y_ans, y_pred, average=None)) # from https://stackoverflow.com/questions/45890328/sklearn-metrics-for-multiclass-classification
recall = numpy.add(recall, metrics.recall_score(y_ans, y_pred, average=None))

print("Accuracy:", metrics.accuracy_score(y_ans, y_pred))
acc_sum += metrics.accuracy_score(y_ans, y_pred)
print("F1 Micro:", metrics.f1_score(y_ans, y_pred, average='micro'))
f1micro_sum += metrics.f1_score(y_ans, y_pred, average='micro')
print("F1 Macro:", metrics.f1_score(y_ans, y_pred, average='macro'))
f1macro_sum += metrics.f1_score(y_ans, y_pred, average='macro')

print("Overall Precision: ", prec/ 1)
print("Overall Recall: ", recall/ 1
print("Overall MAE: ", MAE_sum / 1)
print("Overall Accuracy: ", acc_sum / 1)
print("Overall F1 Micro: ", f1micro_sum / 1)
print("Overall F1 Macro: ", f1macro_sum / 1)