# Import

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
import torch, torch.nn as nn, torch.optim as optim, torch.nn.functional as F
import math

### Using Google Colab

In [2]:
# Load every raw table from the local `data/` directory into module-level
# DataFrames. The lookup helpers and preprocessing cells below read these
# globals directly.
races = pd.read_csv('data/races.csv')
circuits = pd.read_csv('data/circuits.csv')
drivers = pd.read_csv('data/drivers.csv')
# NOTE: this table is bound to the singular name `constructor`.
constructor = pd.read_csv('data/constructors.csv')
status = pd.read_csv('data/status.csv')
d_standings = pd.read_csv('data/driver_standings.csv')
c_standings = pd.read_csv('data/constructor_standings.csv')
quali = pd.read_csv('data/qualifying.csv')
pit_stops = pd.read_csv('data/pit_stops.csv')
lap_times = pd.read_csv('data/lap_times.csv')
results = pd.read_csv('data/results.csv')

In [3]:
# Convert a time such as "87.5", "1:27.500" or "1:02:03.500" to seconds
def time_to_int(time):
  """Return `time` expressed in seconds as a float, or None if it is missing.

  Despite the historical name, the result is a float, not an integer.

  Args:
    time: a float (returned unchanged), or any value whose `str()` form is
      a colon-separated time with a fractional seconds field.

  Returns:
    The number of seconds as a float. None is returned for NaN, for values
    containing the `\\N` null marker, and for values without a '.' (no
    fractional seconds field).
  """
  if isinstance(time, float):
    # NaN (pandas' missing value) keeps mapping to None; real floats are
    # already in seconds and pass through untouched.
    return None if time != time else time

  time_str = str(time)
  if '\\N' in time_str or '.' not in time_str:
    return None

  # Every field before the last is a whole base-60 unit (hours, minutes);
  # only the final field carries the fractional seconds.
  *leading, seconds = time_str.split(':')
  whole = 0
  for part in leading:
    whole = whole * 60 + int(part)
  return whole * 60 + float(seconds)

In [4]:
time_to_int(22.)

22.0

In [6]:
# Collect the driverId of every result row that belongs to a race held in
# 2001 or later. `df` ends up with a single `driverId` column, one row per
# (race, driver) result, duplicates included.
races = pd.read_csv('data/races.csv')
results = pd.read_csv('data/results.csv')

races_newer = races.query('year >= 2001')
rids = races_newer['raceId']

# One inner merge replaces the former row-by-row DataFrame.append loop, which
# was quadratic and no longer exists in pandas >= 2.0. An inner merge keeps
# the order of the left keys, so rows still come out race by race in the
# order of `rids`.
df = (
    rids.to_frame()
    .merge(results[['raceId', 'driverId']], on='raceId', how='inner')
    [['driverId']]
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=416.0), HTML(value='')))




In [7]:
# Reduce the collected ids to a sorted, de-duplicated single-column table
# with a fresh 0..n-1 index, then display it.
unique_ids = df['driverId'].unique()
drivers = (
    pd.DataFrame({'driverId': unique_ids})
    .sort_values(by=['driverId'])
    .reset_index(drop=True)
)
drivers

Unnamed: 0,driverId
0,1
1,2
2,3
3,4
4,5
...,...
111,850
112,851
113,852
114,853


In [8]:
# Persist the shortened driver list. The DataFrame index is written as well
# (pandas' default), so the file gets an unnamed leading column.
drivers.to_csv('data/drivers_short.csv')

In [None]:
len(numlaps)

In [None]:
numlaps.sort()
print(numlaps)

In [None]:
# Reload the lookup tables used by the *_info helpers below.
# `drivers` is rebound here to the full drivers.csv table.
races = pd.read_csv('data/races.csv')
circuits = pd.read_csv('data/circuits.csv')
drivers = pd.read_csv('data/drivers.csv')
# NOTE: bound to the singular name `constructor`.
constructor = pd.read_csv('data/constructors.csv')
status = pd.read_csv('data/status.csv')

def race_info(raceId):
  """Look up one race in the module-level `races` table.

  Returns:
    (year, round, circuitId) for the matching row, or (None, None, None)
    when no row has this raceId.
  """
  match = races.query(f'raceId  == {raceId}')
  if match.empty:
    return None, None, None
  # .item() yields a plain scalar and raises if more than one row matched.
  return tuple(match[col].item() for col in ('year', 'round', 'circuitId'))

def circuit_info(circuitId):
  """Look up one circuit in the module-level `circuits` table.

  Returns:
    (name, location, country) for the matching row, or (None, None, None)
    when no row has this circuitId.
  """
  match = circuits.query(f'circuitId  == {circuitId}')
  if match.empty:
    return None, None, None
  # .item() yields a plain scalar and raises if more than one row matched.
  return tuple(match[col].item() for col in ('name', 'location', 'country'))

def driver_info(id):
  """Look up one driver in the module-level `drivers` table.

  Returns:
    (number, code, forename, surname, dob, nationality) for the matching
    row, or a tuple of six None values when no row has this driverId.
  """
  fields = ('number', 'code', 'forename', 'surname', 'dob', 'nationality')
  match = drivers.query(f'driverId  == {id}')
  if match.empty:
    return (None,) * len(fields)
  # .item() yields a plain scalar and raises if more than one row matched.
  return tuple(match[col].item() for col in fields)

def constructor_info(id):
  """Look up one constructor in the module-level `constructor` table.

  Args:
    id: the constructorId to search for.

  Returns:
    (name, nationality) for the matching row, or (None, None) when no row
    has this constructorId.
  """
  # The table is loaded under the singular name `constructor`.
  _c = constructor.query(f'constructorId  == {id}')
  if _c.empty:
    return None, None
  # Read from the query result itself; .item() raises if several rows match.
  _name = _c['name'].item()
  _nationality = _c['nationality'].item()
  return _name, _nationality

def status_info(id):
  """Return the `status` text for a statusId from the module-level `status`
  table, or None when no row matches."""
  match = status.query(f'statusId == {id}')
  # .item() yields a plain scalar and raises if more than one row matched.
  return None if match.empty else match['status'].item()

In [None]:
race_info(1053)

In [None]:
class RaceDataset(torch.utils.data.Dataset):
  """Lap sequences of one race at a time, read from `<dir><year>/<file>`.

  Exactly one race CSV is held in memory (`cur_race`); `set_year`,
  `set_round` and `next_round` move through the files on disk. Each CSV row
  is used as follows: the first column is dropped and the next 141 values are
  kept. Within those 141 values, offsets 4, 5 and 7 of every 7-wide slot
  (20 slots) form the 60-value label.
  NOTE(review): presumably these are position, pitting and lap time per
  driver — confirm against the code that writes the CSVs.

  Indexing with `i` returns the input rows 0..i together with, for each of
  them, the label taken from the following row.
  """

  # Label columns (after dropping the first CSV column):
  # 4, 5, 7, 11, 12, 14, ..., 137, 138, 140.
  _LABEL_COLS = [7 * slot + off for slot in range(20) for off in (4, 5, 7)]

  def __init__(self, dir):
    """Start at round 1 of 2001.

    Args:
      dir: root directory, including the trailing separator; it is joined
        to the year by plain string concatenation.
    """
    self.dir = dir
    self.year = 2001
    self.round = 1

    self.cur_year = self._list_races()
    self._load_race()

  def _list_races(self):
    """Return the file names of the current year in natural order."""
    import re  # local import: only needed for the natural-sort key

    def natural_key(name):
      # re.split with a capturing group alternates text, digits, text, ...
      # so odd positions are always digit runs and compare as integers.
      return [int(tok) if pos % 2 else tok
              for pos, tok in enumerate(re.split(r'(\d+)', name))]

    # os.listdir returns entries in arbitrary order; sorting makes the
    # round -> file mapping deterministic ("2" sorts before "10").
    return sorted(os.listdir(self.dir + f'{self.year}/'), key=natural_key)

  def _load_race(self):
    """Read the CSV of the current round; rounds past the end use the last file."""
    if (self.round < len(self.cur_year)):
      name = self.cur_year[self.round - 1]
    else:
      name = self.cur_year[-1]
    self.cur_race = pd.read_csv(self.dir + f'{self.year}/{name}')

  def set_year(self, year):
    """Switch to `year` and re-list its files; the loaded race is not changed."""
    self.year = year
    self.cur_year = self._list_races()

  def set_round(self, round):
    """Select `round` (1-based) of the current year and load its CSV."""
    self.round = round
    if (self.year == 2021):
      return # no data yet

    self._load_race()

  def next_round(self):
    """Advance to the next round, rolling over to round 1 of the next year."""
    self.round += 1
    # `<=` so that the last file of a season is visited before rolling over.
    if (self.round <= len(self.cur_year)):
      self.set_round(self.round)
    else:
      self.round = 1
      self.set_year(self.year+1)
      self.set_round(self.round)

  def __len__(self):
    # -1 since the last lap will always only be a label
    return len(self.cur_race) - 1

  def __getitem__(self, i):
    """Return `(inputs, labels)` for the laps 0..i of the current race.

    Args:
      i: index of the last input row, `0 <= i < len(self)`.

    Returns:
      For `i == 0`, two 1-D tensors of 141 and 60 values. Otherwise two 2-D
      tensors of shape `(i + 1, 141)` and `(i + 1, 60)`, where label row `j`
      comes from CSV row `j + 1`.

    Raises:
      IndexError: if `i` is outside `0 <= i < len(self)`.
    """
    if not 0 <= i < len(self):
      raise IndexError(f'lap index {i} out of range for {len(self)} samples')

    # Rows 0..i are the inputs, rows 1..i+1 the labels; convert them in one go
    # instead of concatenating row by row. [:, 1:142] removes the index
    # column and trims the size.
    laps = torch.tensor(self.cur_race.iloc[:i + 2].values)[:, 1:142]

    inputs = laps[:-1].clone()
    # replace NaN with 0 (NaN is the only value that differs from itself)
    inputs[inputs != inputs] = 0
    # Same scaling as before: columns 0, 7, ..., 133 by 1/100 and columns
    # 6, 13, ..., 139 by 1/10.
    inputs[:, 0:140:7] = inputs[:, 0:140:7] / 100
    inputs[:, 6:140:7] = inputs[:, 6:140:7] / 10

    # Indexing with a list copies, so `laps` is not modified below.
    labels = laps[1:][:, self._LABEL_COLS]
    labels[labels != labels] = 0
    # every third label value (offset 7 of each slot) is scaled by 1/10
    labels[:, 2::3] = labels[:, 2::3] / 10

    if i == 0:
      # callers expect un-batched 1-D tensors for the single-lap case
      return (inputs[0], labels[0])
    return (inputs, labels)

In [None]:
ds = RaceDataset('races/')

In [None]:
ds.year

In [None]:
ds[0]

In [None]:
def show_positions(lap_in, out):
  """Print the circuit and, for each of the 20 driver slots, the driver,
  position, lap time and status.

  Both tensors are copied and every value is multiplied by 100 before use.
  `lap_in[0]` is looked up as a circuit id and `lap_in[1], lap_in[8], ...`
  as driver ids. `out` is read in strides of four values per driver
  (offset 0 position, offset 2 status id, offset 3 lap time).
  NOTE(review): other code in this notebook uses three output values per
  driver — confirm which layout this helper is meant for.
  """
  def _times_hundred(x):
    return x*100

  lap_vals = lap_in.detach().clone().apply_(_times_hundred)
  out_vals = out.detach().clone().apply_(_times_hundred)

  circuit_name, circuit_loc, circuit_country = circuit_info(round(lap_vals[0].item()))
  print(f'Circuit: {circuit_name}, {circuit_loc}, {circuit_country}')

  for slot, driver_col in enumerate(range(1, 140, 7)):
    base = 4 * slot  # first output value of this driver
    number, _, forename, surname, _, _ = driver_info(round(lap_vals[driver_col].item()))
    position = out_vals[base].item()
    laptime = out_vals[base + 3].item()
    status_id = out_vals[base + 2].item()
    status_text = status_info(round(status_id))
    print(f'Driver: {number}  {forename} {surname}')
    print(f'Position: {position}')
    print(f'Laptime: {laptime}')
    print(f'Status: {status_text}')

In [None]:
def pos_df(lap_in, out):
  """Turn one input lap and one model output into a sorted results table.

  Args:
    lap_in: 1-D tensor of at least 140 values; it is copied, not modified.
    out: 1-D tensor of at least 60 values, three per driver slot, read as
      (position, pitting, laptime); it is copied, not modified.

  Returns:
    (circuit name, location, country, DataFrame). The frame has the columns
    code, driver, position, pitting, laptime and is sorted by position,
    then laptime.
  """
  _lap = lap_in.detach().clone()
  _o = out.detach().clone()
  # Undo the scaling: x100 on columns 0, 7, ..., 133 and x10 on columns
  # 6, 13, ..., 139 of the input, x10 on every third output value.
  for i in range(0, 140, 7):
    _lap[i] = _lap[i] * 100
    _lap[i+6] = _lap[i+6] * 10
  for i in range(0, 60, 3):
    _o[i+2] = _o[i+2] * 10
  _name, _loc, _country = circuit_info(round(_lap[0].item()))

  # Gather the rows first and build the frame once: DataFrame.append was
  # removed in pandas 2.0 and copied the whole frame on every call.
  rows = []
  for slot, i in enumerate(range(1, 140, 7)):
    j = 3 * slot  # first output value of this driver
    _num, _code, _fn, _ln, _, _ = driver_info(round(_lap[i].item()))
    rows.append({
        'code': f'{_code}',
        'driver': f'{_fn} {_ln}',
        'position': _o[j].item(),
        'pitting': _o[j+1].item(),
        'laptime': _o[j+2].item(),
    })

  df = pd.DataFrame(rows, columns=['code', 'driver', 'position', 'pitting', 'laptime'])
  df = df.sort_values(by=['position', 'laptime'])
  return _name, _loc, _country, df

In [None]:
df

In [None]:
def out_to_in(in_, out_):
  """Write a 60-value model output back into a copy of a 141-value input.

  For each of the 20 driver slots, output values `3*s`, `3*s + 1` and
  `3*s + 2` overwrite input positions `4 + 7*s`, `5 + 7*s` and `7 + 7*s`.

  Args:
    in_: input tensor, either 1-D or 3-D; it is not modified. For 3-D
      input only element `[0][0]` is updated.
    out_: output in the matching layout (1-D for 1-D input, 3-D for 3-D
      input).

  Returns:
    The updated copy. For 3-D arguments all size-1 dimensions are squeezed
    away; otherwise the copy keeps the shape of `in_`.
  """
  _ret = in_.detach().clone()

  # Decide on the layout explicitly instead of trying the 3-D path and
  # catching whatever it raises: a bare except also hid genuine errors.
  nested = _ret.dim() == 3 and torch.is_tensor(out_) and out_.dim() == 3
  if nested:
    # _ret[0][0] is a view, so writing to it updates _ret
    target, source = _ret[0][0], out_[0][0]
  else:
    target, source = _ret, out_

  for slot in range(20):
    target[4 + 7*slot] = source[3*slot]
    target[5 + 7*slot] = source[3*slot + 1]
    target[7 + 7*slot] = source[3*slot + 2]

  return _ret.squeeze() if nested else _ret

In [None]:
class RacePredictionModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to every time step.

    Inputs are batch-first: `(batch, seq_len, input_size)`.
    """

    def __init__(self, input_size, output_size, lstm_hids, lstm_layers, dropout):
        """Build the network.

        Args:
            input_size: number of features per time step.
            output_size: number of values predicted per time step.
            lstm_hids: hidden size of every LSTM layer.
            lstm_layers: number of stacked LSTM layers.
            dropout: dropout value passed to `nn.LSTM`.
        """
        super().__init__()

        self.input_size = input_size
        self.lstm_layers = lstm_layers
        self.lstm_hids = lstm_hids

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=lstm_hids, num_layers=lstm_layers, dropout=dropout, batch_first=True)

        self.fc = nn.Linear(lstm_hids, output_size)

        # Xavier-uniform weights everywhere, zero LSTM biases; the bias of
        # the linear layer keeps its default initialisation.
        nn.init.xavier_uniform_(self.fc.weight.data)
        for name, params in self.lstm.named_parameters():
            if name.startswith('weight'):
                nn.init.xavier_uniform_(params)
            elif name.startswith('bias'):
                nn.init.constant_(params, 0.0)

    def zero_states(self, batch_size=1, device=None):
        """Return all-zero `(hidden_state, cell_state)` for the LSTM.

        Args:
            batch_size: batch dimension of the states (default 1).
            device: device to create the states on; None uses torch's
                default.

        Returns:
            Two tensors of shape `(lstm_layers, batch_size, lstm_hids)`.
        """
        shape = (self.lstm_layers, batch_size, self.lstm_hids)
        hidden_state = torch.zeros(*shape, device=device)
        cell_state = torch.zeros(*shape, device=device)
        return (hidden_state, cell_state)

    def forward(self, ins, prev_states=None):
        """Run the LSTM and project each step's output.

        Args:
            ins: tensor of shape `(batch, seq_len, input_size)`.
            prev_states: optional `(hidden_state, cell_state)`; None lets
                the LSTM start from zeros.

        Returns:
            `(outs, next_states)`, with `outs` of shape
            `(batch, seq_len, output_size)`.
        """
        lstm_outs, next_states = self.lstm(ins, prev_states)
        outs = self.fc(lstm_outs)
        return outs, next_states

In [None]:
def run_train(model, ds, crit, opt, sched):
    """Train for one epoch: one optimizer step per race from 2001 up to 2020.

    For every race the whole lap sequence (`ds[len(ds) - 1]`) is fed through
    the model as a single un-batched sample.

    Args:
        model: network exposing `zero_states()` and returning
            `(outputs, states)` when called.
        ds: dataset exposing `set_year`, `set_round`, `next_round`, `year`,
            `round`, `__len__` and `__getitem__`.
        crit: loss function.
        opt: optimizer.
        sched: learning-rate scheduler, stepped once at the end.

    Returns:
        Mean loss over the races that produced a valid loss, or NaN if
        there were none.
    """
    model.train()
    # Use the device the model lives on instead of a notebook-level global.
    device = next(model.parameters()).device
    total_loss, total_count = 0, 0

    ds.set_year(2001)
    ds.set_round(1)

    while (ds.year != 2020):
        # Fresh zero states for every race; the states returned by the model
        # are not carried over, so they need no detaching.
        hid_state, cell_state = model.zero_states()
        states = hid_state.to(device), cell_state.to(device)
        #for i in range(len(ds)):
        i = len(ds) - 1
        opt.zero_grad()
        lap_in, lap_exp = ds[i]

        # fix shape and type
        # input size should be (batch, seq_len, input_size)
        # we are not using batches
        if (i == 0):
            lap_in = lap_in.unsqueeze(0)
            lap_exp = lap_exp.unsqueeze(0)
        lap_in = lap_in.unsqueeze(0).float()
        lap_exp = lap_exp.unsqueeze(0).float()

        lap_in, lap_exp = lap_in.to(device), lap_exp.to(device)
        lap_out, _ = model(lap_in, states)
        loss = crit(lap_out, lap_exp)

        if (math.isnan(loss.item())):
            print('Loss is nan')
            print(f'Year: {ds.year}')
            print(f'round: {ds.round}')
            print(f'lap: {i}')
            print(lap_in)
            input()  # pause so the diagnostics can be inspected
            # Skip the update: back-propagating a NaN loss would turn the
            # weights into NaN and poison every later race.
            ds.next_round()
            continue

        loss.backward()
        opt.step()
        total_loss += loss.item() # one sample per step, we are not doing batches
        total_count += 1

        ds.next_round()

    sched.step()
    if total_count == 0:
        return float('nan')
    return total_loss/total_count

def run_test(model, ds, crit):
    """Evaluate on the first race of 2020 and return its loss.

    The whole lap sequence of that race (`ds[len(ds) - 1]`) is fed through
    the model as a single un-batched sample.

    Args:
        model: network exposing `zero_states()` and returning
            `(outputs, states)` when called.
        ds: dataset exposing `set_year`, `set_round`, `__len__` and
            `__getitem__`.
        crit: loss function.

    Returns:
        The loss of that single sample as a float.
    """
    model.eval()
    # Use the device the model lives on instead of a notebook-level global.
    device = next(model.parameters()).device

    ds.set_year(2020)
    ds.set_round(1)

    hid_state, cell_state = model.zero_states()
    states = hid_state.to(device), cell_state.to(device)

    #for i in range(len(ds)):

    i = len(ds) - 1
    lap_in, lap_exp = ds[i]
    # input size should be (batch, seq_len, input_size)
    if (i == 0):
        lap_in = lap_in.unsqueeze(0)
        lap_exp = lap_exp.unsqueeze(0)
    lap_in = lap_in.unsqueeze(0).float()
    lap_exp = lap_exp.unsqueeze(0).float()
    lap_in, lap_exp = lap_in.to(device), lap_exp.to(device)

    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time without changing the loss value.
    with torch.no_grad():
        lap_out, _ = model(lap_in, states)
        loss = crit(lap_out, lap_exp)

    return loss.item()

def run_all(model, ds, crit, opt, sched, versionId, n_epochs=10):
    """Train and evaluate for `n_epochs`, checkpointing into `cache/`.

    A checkpoint is written on every fifth epoch (0, 5, 10, ...) and once
    more after the last epoch. File names follow
    `cache/<versionId>-<epoch>-loss-<test loss>.pth`.

    Args:
        model, ds, crit, opt, sched: passed through to `run_train` and
            `run_test`.
        versionId: prefix of the checkpoint file names.
        n_epochs: number of epochs to run.
    """
    # torch.save does not create missing directories.
    os.makedirs('cache', exist_ok=True)

    test_loss = None
    for epoch in tqdm(range(n_epochs), desc='epochs', unit='ep'):
        train_loss = run_train(model, ds, crit, opt, sched)
        test_loss = run_test(model, ds, crit)
        tqdm.write(f'epoch {epoch}   train loss {train_loss:.6f}  test loss {test_loss:.6f}')
        if epoch % 5 == 0:
            torch.save(model.state_dict(), f'cache/{versionId}-{epoch}-loss-{test_loss:.2f}.pth')

    # With zero epochs there is no loss to put in the file name.
    if test_loss is not None:
        torch.save(model.state_dict(), f'cache/{versionId}-{n_epochs}-loss-{test_loss:.2f}.pth')

In [None]:
# Training setup: device, model, loss, optimizer, LR schedule and dataset.
#device = torch.device('cuda:0')
device = torch.device('cpu')

# 141 input features per lap, 60 predicted values, two LSTM layers of 141 units.
model = RacePredictionModel(
    input_size=141,
    output_size=60,
    lstm_hids=141,
    lstm_layers=2,
    dropout=0.2,
)
model.to(device)

crit = nn.MSELoss().to(device)
opt = optim.Adam(model.parameters(), lr=0.001)
# multiply the learning rate by 0.1 every 3 scheduler steps
sched = optim.lr_scheduler.StepLR(opt, step_size=3, gamma=0.1)
ds = RaceDataset('races/')

In [None]:
run_all(model, ds, crit, opt, sched, 25, 10)

In [None]:
# Load a saved checkpoint, predict the lap after the first lap of round 17
# of 2020, and show the prediction as a results table.
cpu = torch.device('cpu')
# map_location keeps the checkpoint loadable when it was saved from another
# device than the one used here.
model.load_state_dict(torch.load('cache/25-5-loss-19.81.pth', map_location=device))
model.eval()
ds.set_year(2020)
ds.set_round(17)
p, n = ds[0]
p = p.to(device)
hid_state, cell_state = model.zero_states()
states = (hid_state.to(device), cell_state.to(device))
# inference only: no autograd graph needed
with torch.no_grad():
  out, s = model(p.unsqueeze(0).unsqueeze(0).float(), states)
out = out.squeeze()
#print(out)
#for i in range(50):
  #out, s = model(out_to_in(p, out).unsqueeze(0).unsqueeze(0).float(), s)
  #out = out.squeeze().squeeze()
  #_, _, _, d = pos_df(p.to(cpu), out)
  #print(d)
#print(out)
#out = out.detach().to(cpu)
# Build the table that is displayed below; `d` was previously only assigned
# inside the commented-out rollout loop.
_, _, _, d = pos_df(p.to(cpu), out.to(cpu))
d