<h1 style="text-align: center;">Tennis Analyzer ML Model V2</h1>
<h3 style="text-align: center;">Dan Warnick</h3>

<p>To start we will begin by selecting the data points we want to analyze with existing known results. For each data entry we will have two players each with the following data entries.</p>
<table style="font-size: .8em;">
    <tr>
        <th>Player Name</th>
    </tr>
</table>
<p>Along with match facts like Clay/Hard/Grass Court or Indoor/Outdoor. In the future may want to add weather and adjust certain parameters for more accuracy and more data points to train from, however for now this seems a good start.</p>

<h2>1.) Collect Data Efficiently</h2>

In [6]:
import django
from django.http import HttpResponse
from django.template import loader
import os
from django.db import models
import torch
from math import ceil, floor
import math
import numpy as np
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.utils as utils
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from datetime import datetime
from asgiref.sync import sync_to_async # type: ignore
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import roc_curve, roc_auc_score
import copy
import joblib

# os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'breakpoint.settings')
# django.setup()

# from render.models import *

In [7]:
start = '20120101'
end = '20231231'
match_type = 'm'

In [8]:
start_date = datetime.strptime(start, '%Y%m%d').date()
end_date = datetime.strptime(end, '%Y%m%d').date()

# if match_type == 'm':   
#     typer = MensTennisMatch
#     insert_db = MensTennisMatchStats
# else:
#     typer = WomensTennisMatch
#     insert_db = WomensTennisMatchStats

# query = insert_db.objects.filter(
#         tourney_date__range=(start_date, end_date)
#     ).order_by('tourney_date')
    
# games = await sync_to_async(list)(query.all().values())

RD_CUTOFF = 125

df = pd.read_csv('../../testcsvs/GlickoTempTau6.0.csv')
print(df.columns)
df['tourney_date'] = pd.to_datetime(df['tourney_date']).dt.date
df = df[(df['tourney_date'] >= start_date) & (df['tourney_date'] <= end_date)]
df = df[(df['a_glicko_rd'] <= RD_CUTOFF) & (df['b_glicko_rd'] <= RD_CUTOFF)]

df = df.drop(['tourney_id', 'tourney_name', 'match_num', 'tourney_date', 'a_player_name', 'b_player_name', 'a_player_id', 'a_player_slug', 'b_player_id', 'b_player_slug', 'sets', 'games', 'tiebreaks'], axis=1)
# print(df.duplicated().sum())

print(len(df))

one_hot_encoder = OneHotEncoder(sparse_output=False)

# One-Hot Encode the 'category_text' column
category_encoded = one_hot_encoder.fit_transform(df[['surface']])

# Convert to DataFrame
category_encoded_df = pd.DataFrame(category_encoded, columns=one_hot_encoder.get_feature_names_out(['surface']))

# Concatenate the one-hot encoded columns back to the original DataFrame
df = pd.concat([df.reset_index(drop=True), category_encoded_df], axis=1)

# Drop the original 'category_text' column
df.drop('surface', axis=1, inplace=True)
# print(df.duplicated().sum())

print(len(df))

Index(['tourney_id', 'tourney_name', 'tourney_date', 'surface', 'best_of',
       'match_num', 'tourney_level', 'tourney_round', 'a_player_id',
       'a_player_name',
       ...
       'a_surface_return_second_won_glicko_rating',
       'b_surface_second_won_glicko_rating',
       'a_surface_return_second_won_glicko_rd',
       'b_surface_second_won_glicko_rd', 'sets', 'games', 'tiebreaks',
       'a_odds', 'b_odds', 'a_b_win'],
      dtype='object', length=174)
26748
26748


In [9]:
df.head()

Unnamed: 0,best_of,tourney_level,tourney_round,a_player_age,a_player_hand,a_player_ht,a_player_rank,a_player_rank_points,b_player_age,b_player_hand,...,a_surface_return_second_won_glicko_rating,b_surface_second_won_glicko_rating,a_surface_return_second_won_glicko_rd,b_surface_second_won_glicko_rd,a_odds,b_odds,a_b_win,surface_Clay,surface_Grass,surface_Hard
0,3.0,2.0,0.8,28.0,0.0,185.0,74.0,671.0,23.0,1.0,...,1511.785013,1528.17828,65.035496,61.424189,3.59,1.28,0.0,0.0,0.0,1.0
1,3.0,2.0,0.8,28.0,1.0,188.0,65.0,755.0,25.0,1.0,...,1522.769999,1495.331465,62.134827,70.640554,,,1.0,0.0,0.0,1.0
2,3.0,2.0,0.8,23.0,1.0,183.0,89.0,612.0,22.0,1.0,...,1492.043641,1527.915304,78.973379,83.154575,2.29,1.59,0.0,0.0,0.0,1.0
3,3.0,2.0,0.8,24.0,1.0,178.0,48.0,915.0,26.0,1.0,...,1510.305024,1508.599843,63.986136,65.213573,2.4,1.54,0.0,0.0,0.0,1.0
4,3.0,2.0,0.8,25.0,1.0,193.0,22.0,1655.0,26.0,1.0,...,1510.529509,1475.691709,61.507367,80.902188,1.19,4.44,1.0,0.0,0.0,1.0


In [10]:
# Data preprocessing
df = df[df['tourney_level'] == 6.0]
df = df.dropna()

margin = df['a_b_win'].values.astype(int)
df = df.drop(columns=['a_b_win', 'a_odds', 'b_odds', 'a_b_win'])

data = df.values.astype(float)

# Normalize features
scaler = StandardScaler()
data = scaler.fit_transform(data)

# Train-test split
x_train, x_val, y_train, y_val = train_test_split(data, margin, test_size=0.1, random_state=42)

# Convert to PyTorch tensors
x_train = torch.tensor(x_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
x_val = torch.tensor(x_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)

# Define the PyTorch model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(x_train.shape[1], 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 2)
        self.relu = nn.SiLU() #hehe
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)  # Softmax is applied in the loss function
        return x

# Model, loss, and optimizer
model = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 100
batch_size = 32
train_size = x_train.shape[0]

for epoch in range(epochs):
    model.train()
    for i in range(0, train_size, batch_size):
        # Batch data
        x_batch = x_train[i:i + batch_size]
        y_batch = y_train[i:i + batch_size]

        # Forward pass
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on validation data
    model.eval()
    with torch.no_grad():
        val_outputs = model(x_val)
        val_loss = criterion(val_outputs, y_val)
        val_preds = torch.argmax(val_outputs, dim=1)
        val_accuracy = (val_preds == y_val).float().mean().item()

    # Print progress
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, Val Accuracy: {val_accuracy:.4f}")

print("Training complete")

Epoch 1/100, Loss: 0.8666, Val Loss: 0.5404, Val Accuracy: 0.7275
Epoch 2/100, Loss: 0.7772, Val Loss: 0.5365, Val Accuracy: 0.7302
Epoch 3/100, Loss: 0.7195, Val Loss: 0.5384, Val Accuracy: 0.7357
Epoch 4/100, Loss: 0.6483, Val Loss: 0.5442, Val Accuracy: 0.7357
Epoch 5/100, Loss: 0.5653, Val Loss: 0.5520, Val Accuracy: 0.7084
Epoch 6/100, Loss: 0.5426, Val Loss: 0.5580, Val Accuracy: 0.7030
Epoch 7/100, Loss: 0.5633, Val Loss: 0.5712, Val Accuracy: 0.6894
Epoch 8/100, Loss: 0.5357, Val Loss: 0.5619, Val Accuracy: 0.6975
Epoch 9/100, Loss: 0.4059, Val Loss: 0.5695, Val Accuracy: 0.6921
Epoch 10/100, Loss: 0.3654, Val Loss: 0.5769, Val Accuracy: 0.7057
Epoch 11/100, Loss: 0.2100, Val Loss: 0.5866, Val Accuracy: 0.7057
Epoch 12/100, Loss: 0.1434, Val Loss: 0.6129, Val Accuracy: 0.6948
Epoch 13/100, Loss: 0.0975, Val Loss: 0.6419, Val Accuracy: 0.6921
Epoch 14/100, Loss: 0.0786, Val Loss: 0.6823, Val Accuracy: 0.6839
Epoch 15/100, Loss: 0.0672, Val Loss: 0.7331, Val Accuracy: 0.7084
Epoc