# Import

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
import torch, torch.nn as nn, torch.optim as optim, torch.nn.functional as F
import math

In [None]:
# Mount Google Drive inside the Colab runtime; the data paths used in this
# notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder of the project data on the mounted Drive (note the trailing slash:
# file names are appended to it with plain string concatenation).
directory = '/content/drive/My Drive/formula1_capstone/data/'

In [None]:
# Load every CSV table from the data folder into a module-level DataFrame.
# The *_info lookup helpers in this notebook read these globals directly.
races = pd.read_csv(directory + 'races.csv')
circuits = pd.read_csv(directory + 'circuits.csv')
drivers = pd.read_csv(directory + 'drivers.csv')
constructor = pd.read_csv(directory + 'constructors.csv')
status = pd.read_csv(directory + 'status.csv')
d_standings = pd.read_csv(directory + 'driver_standings.csv')
c_standings = pd.read_csv(directory + 'constructor_standings.csv')
quali = pd.read_csv(directory + 'qualifying.csv')
pit_stops = pd.read_csv(directory + 'pit_stops.csv')
lap_times = pd.read_csv(directory + 'lap_times.csv')
results = pd.read_csv(directory + 'results.csv')

In [None]:
def race_info(raceId):
  """Look up the season, round and circuit of a single race.

  Args:
    raceId: value matched against the ``raceId`` column of the global
      ``races`` table.

  Returns:
    ``(year, round, circuitId)`` of the matching row, or
    ``(None, None, None)`` when no row matches.

  Raises:
    ValueError: from ``Series.item()`` if more than one row matches.
  """
  match = races.query(f'raceId  == {raceId}')
  if match.empty:
    return (None,) * 3
  return tuple(match[col].item() for col in ('year', 'round', 'circuitId'))

def circuit_info(circuitId):
  """Look up the descriptive fields of a circuit.

  Args:
    circuitId: value matched against the ``circuitId`` column of the global
      ``circuits`` table.

  Returns:
    ``(name, location, country)`` of the matching row, or
    ``(None, None, None)`` when no row matches.

  Raises:
    ValueError: from ``Series.item()`` if more than one row matches.
  """
  match = circuits.query(f'circuitId  == {circuitId}')
  if match.empty:
    return (None,) * 3
  return tuple(match[col].item() for col in ('name', 'location', 'country'))

def driver_info(id):
  """Look up the descriptive fields of a driver.

  Args:
    id: value matched against the ``driverId`` column of the global
      ``drivers`` table.

  Returns:
    ``(number, code, forename, surname, dob, nationality)`` of the matching
    row, or a tuple of six ``None`` values when no row matches.

  Raises:
    ValueError: from ``Series.item()`` if more than one row matches.
  """
  wanted = ('number', 'code', 'forename', 'surname', 'dob', 'nationality')
  match = drivers.query(f'driverId  == {id}')
  if match.empty:
    return (None,) * len(wanted)
  return tuple(match[col].item() for col in wanted)

def constructor_info(id):
  """Look up the name and nationality of a constructor.

  Args:
    id: value matched against the ``constructorId`` column of the global
      ``constructor`` table.

  Returns:
    ``(name, nationality)`` of the matching row, or ``(None, None)`` when no
    row matches.

  Raises:
    ValueError: from ``Series.item()`` if more than one row matches.
  """
  constructor_df = constructor.query(f'constructorId  == {id}')
  if constructor_df.empty:
    return None, None
  # Read from the filtered constructor row. The previous code indexed the
  # unrelated `drivers` table here, so it could never return constructor data.
  _name = constructor_df['name'].item()
  _nationality = constructor_df['nationality'].item()
  return _name, _nationality

def status_info(id):
  """Return the status text for a ``statusId``.

  Args:
    id: value matched against the ``statusId`` column of the global
      ``status`` table.

  Returns:
    The ``status`` value of the matching row, or ``None`` when no row matches.

  Raises:
    ValueError: from ``Series.item()`` if more than one row matches.
  """
  hits = status.query(f'statusId == {id}')
  return None if hits.empty else hits['status'].item()

# Feature Engineering

### Changing Laptimes 

Lap times are converted from `M:SS.mmm` strings (minutes:seconds.milliseconds) to a single number of seconds, which is easier to compare and to feed into a model.

In [None]:
# Convert a time such as "1:23.456" (or "1:34:50.616") to a number of seconds
def time_to_int(time):
  """Convert a lap/race time to seconds.

  Despite the name, the result is a float number of seconds.

  Args:
    time: a float (already in seconds) or a value whose string form looks
      like ``SS.mmm``, ``M:SS.mmm`` or ``H:MM:SS.mmm``.

  Returns:
    The time in seconds as a float, or ``None`` when the value is missing:
    NaN, a string containing the ``\\N`` placeholder, or a string with no
    decimal point.

  Raises:
    ValueError: if a ``:``-separated component is not numeric.
  """
  # `time == float` compared the value with the type object and was never
  # true; an isinstance check is what lets float inputs pass through.
  if isinstance(time, float):
    # NaN keeps mapping to None, as it did via the string path ('nan' has no '.').
    return None if math.isnan(time) else time
  time_str = str(time)
  if '\\N' in time_str or '.' not in time_str:
    return None
  # Fold the components left to right so any number of ':' groups works;
  # previously a third component (the seconds of H:MM:SS.mmm) was dropped.
  seconds = 0.0
  for part in time_str.split(':'):
    seconds = seconds * 60 + float(part)
  return seconds

### Embedding for Car Status

Use the same approach as the 2nd modelling notebook: group similar car statuses together and give each group its own one-hot slot.

In [None]:
# statusId values grouped into six coarse categories. The row order defines the
# one-hot slot produced by status_emb(). The table has its own name because the
# function below is also called `status_emb`; sharing the name made the
# function index itself and raise TypeError on every call.
_STATUS_GROUPS = [
  [4.0, 3.0, 130.0],  # Accident/Collision
  [
    22.0, 5.0, 10.0, 23.0, 44.0, 47.0, 30.0, 32.0, 8.0, 38.0, 43.0, 85.0,
    9.0, 86.0, 6.0, 2.0, 7.0, 87.0, 71.0, 41.0, 46.0, 37.0, 65.0, 78.0,
    25.0, 74.0, 75.0, 26.0, 51.0, 40.0, 79.0, 36.0, 83.0, 80.0, 21.0, 69.0,
    72.0, 70.0, 27.0, 60.0, 63.0, 29.0, 64.0, 66.0, 56.0, 59.0, 61.0, 42.0,
    39.0, 48.0, 49.0, 34.0, 35.0, 28.0, 24.0, 33.0, 129.0, 76.0, 91.0,
    131.0, 101.0, 132.0, 135.0, 84.0, 136.0, 105.0, 137.0, 138.0, 139.0,
  ],  # Car issues
  [11.0, 13.0, 12.0, 14.0, 17.0, 15.0, 16.0, 18.0, 55.0, 58.0, 45.0, 88.0],  # Lapped
  [0.0],  # No problem
  [77.0, 73.0, 82.0, 81.0, 62.0, 54.0, 31.0, 96.0],  # Other
  [20.0],  # Spun off
]

def status_emb(id):
  """One-hot encode a statusId into its status group.

  Args:
    id: statusId to classify (int or float; compared by value).

  Returns:
    A length-6 float array with a single 1 at the slot of the first group
    containing ``id``. Ids that appear in no group are encoded as "Other"
    (slot 4).
  """
  _embed = np.zeros(6)
  for slot, group in enumerate(_STATUS_GROUPS):
    if id in group:
      _embed[slot] = 1
      return _embed
  _embed[4] = 1  # unknown ids fall back to "Other"
  return _embed

def status_unbed(array, retired=False):
  """Turn a status score vector back into a status group label.

  Args:
    array: scores per status group; the position of the largest score
      selects the label. The input is copied, never modified.
    retired: when truthy, the score at index 3 ("No Problem") is zeroed
      before the largest score is picked.

  Returns:
    One of 'Accident/Collision', 'Car Issues', 'Lapped', 'No Problem',
    'Other', 'Spun off', or 'something is wrong' when the largest score sits
    at an index past the six known groups.
  """
  labels = (
    'Accident/Collision',
    'Car Issues',
    'Lapped',
    'No Problem',
    'Other',
    'Spun off',
  )
  scores = np.copy(array)
  if retired:
    scores[3] = 0
  winner = np.argmax(scores)
  if winner < len(labels):
    return labels[winner]
  return 'something is wrong'

## Embedding for Lap Times

In [None]:
def laptime_embed(laptime):
  """Encode a lap time in seconds as a 32-element vector.

  Only the hundreds, tens, ones and tenths digits are kept.

  Layout of the result:
    [0:2]   hundreds digit as two flags: 1 -> [1, 0], 2 -> [0, 1],
            3 or more -> [1, 1], 0 -> [0, 0]
    [2:12]  one-hot of the tens digit
    [12:22] one-hot of the ones digit
    [22:32] one-hot of the tenths digit

  Args:
    laptime: lap time in seconds.

  Returns:
    A float numpy array of length 32.
  """
  remaining = math.floor(laptime * 10)
  digits = []  # least significant first: tenths, ones, tens, hundreds
  for _ in range(4):
    digits.append(int(remaining % 10))
    remaining = math.floor(remaining / 10)
  tenths, ones, tens, hundreds = digits

  encoded = np.zeros(32)
  if hundreds == 1 or hundreds > 2:
    encoded[0] = 1
  if hundreds >= 2:
    encoded[1] = 1
  for block, digit in enumerate((tens, ones, tenths)):
    encoded[2 + 10 * block + digit] = 1
  return encoded

def laptime_unbed(l_array):
  """Decode a 32-element lap time vector back to seconds.

  The first two entries are thresholded at 0.5 and select the hundreds
  (both -> 300, only the first -> 100, only the second -> 200, neither -> 0).
  The three following groups of ten entries are decoded with argmax as the
  tens, ones and tenths digits.

  Args:
    l_array: sequence of at least 32 scores laid out as described above.

  Returns:
    The decoded lap time in seconds, with a resolution of 0.1 s.
  """
  first_on = l_array[0] >= 0.5
  second_on = l_array[1] >= 0.5
  if first_on and second_on:
    seconds = 300
  elif first_on:
    seconds = 100
  elif second_on:
    seconds = 200
  else:
    seconds = 0
  for start, weight in ((2, 10), (12, 1), (22, 0.1)):
    seconds += np.argmax(l_array[start:start + 10]) * weight
  return seconds

In [None]:
# Sanity check: the embedding has 2 hundreds flags + 3 one-hot digits of 10
# slots each, so the printed length should be 32.
laptime_testing = laptime_embed(123.456)
print(len(laptime_testing))

In [None]:
# Round trip of the embedding above. Only tenths are kept, so 123.456 should
# come back as roughly 123.4.
undo_laptime_embed = laptime_unbed(laptime_testing)
print(undo_laptime_embed)

In [None]:
# Reduced driver table; a driver's row position in it (plus one) is the compact
# id used by the driver_* embedding helpers.
drivers_short = pd.read_csv(directory + 'drivers_short.csv')
# driverId -> compact 1-based id (0 means "not in drivers_short")
def driver_embed_id(driverId):
  """Map a dataset ``driverId`` to the compact id used for embeddings.

  Args:
    driverId: value matched against the ``driverId`` column of the global
      ``drivers_short`` table.

  Returns:
    The index label of the matching row plus one, or 0 when no row matches.

  Raises:
    ValueError: from ``Index.item()`` if more than one row matches.
  """
  positions = drivers_short.query(f'driverId == {driverId}').index
  return 0 if positions.empty else positions.item() + 1

# compact 1-based id -> driverId
def driver_unbed_id(id):
  """Map a compact id back to the dataset ``driverId``.

  Args:
    id: compact id; row ``id - 1`` (by position) of the global
      ``drivers_short`` table is read.

  Returns:
    The ``driverId`` value stored in that row.
  """
  return drivers_short.iloc[id - 1]['driverId']

# compact id -> one-hot array
def driver_embed(id):
  """One-hot encode a compact driver id.

  Args:
    id: compact driver id; slot ``id - 1`` of the result is set.

  Returns:
    A float numpy array of length 130 containing a single 1.
  """
  one_hot = np.zeros(130)
  slot = id - 1
  one_hot[slot] = 1
  return one_hot

# one-hot (or score) array -> compact id
def driver_unbed(d_array):
  """Decode a driver vector to its compact id.

  Args:
    d_array: per-driver scores; the position of the largest one is used.

  Returns:
    The position of the largest entry plus one.
  """
  strongest_slot = np.argmax(d_array)
  return strongest_slot + 1

In [None]:
class RaceDataset(torch.utils.data.Dataset):
  """One race at a time, served as (input, expected) tensor pairs.

  ``dir`` must contain one sub-folder per season. Round ``r`` of a season is
  read from ``{r-1}_in.npy`` and ``{r-1}_exp.npy`` in that folder, and the
  number of races in a season is taken to be half the number of files in its
  folder. Loaded races are kept in ``self.cache`` under the key
  ``'{year}_{round}'``.

  Indexing and ``len()`` always refer to the currently selected race. Select
  another one with ``set_year`` + ``set_round`` or step with ``next_round``.
  Year 2021 is treated as "no data yet": selecting it changes ``year`` and
  ``round`` but leaves the previously loaded race in place.
  """

  def __init__(self, dir):
    """Start at round 1 of the 2001 season.

    Args:
      dir: root folder of the per-season sub-folders, with a trailing slash
        (paths are built by string concatenation).
    """
    self.dir = dir
    self.year = 2001
    self.round = 1
    self.cache = {}

    # number of races this year
    self.cur_year = self._count_races(self.year)
    self.cur_race = self._load_race()

  def _count_races(self, year):
    """Return the number of races stored for ``year`` (two files per race)."""
    return int(len(os.listdir(self.dir + f'{year}/')) / 2)

  def _load_race(self):
    """Return the arrays for the current year/round, loading them on a cache miss."""
    key = f'{self.year}_{self.round}'
    if key not in self.cache:
      # Rounds past the end of the season are clamped to the last race.
      file_idx = min(self.round, self.cur_year) - 1
      folder = self.dir + f'{self.year}/'
      self.cache[key] = (
        np.load(folder + f'{file_idx}_in.npy'),
        np.load(folder + f'{file_idx}_exp.npy'),
      )
    return self.cache[key]

  def set_year(self, year):
    """Select a season; call ``set_round`` afterwards to load a race."""
    self.year = year
    if year == 2021:
      return  # no data yet, keep the previous race count
    self.cur_year = self._count_races(year)

  def set_round(self, round):
    """Select a round of the current season and load it (cached)."""
    self.round = round
    if self.year == 2021:
      return  # no data yet
    self.cur_race = self._load_race()

  def next_round(self):
    """Advance to the next round, rolling over to round 1 of the next season.

    The last round of a season is visited before rolling over. The previous
    `round + 1 < count` test skipped it every season.
    """
    if self.round < self.cur_year:
      self.set_round(self.round + 1)
    else:
      self.set_year(self.year + 1)
      self.set_round(1)

  def __len__(self):
    """Number of samples (first-axis length of the input array) in the current race."""
    return len(self.cur_race[0])

  def __getitem__(self, i):
    """Return sample ``i`` of the current race as ``(input, expected)`` tensors."""
    cur_ret, next_ret = self.cur_race[0][i], self.cur_race[1][i]
    return (torch.from_numpy(cur_ret), torch.from_numpy(next_ret))

In [None]:
# Open the pre-computed per-race arrays; the constructor selects 2001, round 1.
ds = RaceDataset(directory + 'races_npy/')

In [None]:
# Inspect the first (input, expected) tensor pair of the current race.
ds[0]

In [None]:
# Length of the input tensor of the first sample.
len(ds[0][0])

In [None]:
# Length of the expected-output tensor of the first sample.
len(ds[0][1])

In [None]:
def pos_df(lap_in, out, num_of_laps=1):
  """Decode one input/output vector pair into a readable results table.

  Layout read from the vectors:
    * ``lap_in[:130]``: argmax is passed to ``circuit_info`` as the circuit id.
    * for each of 20 driver slots ``i``, ``lap_in[131 + i*196 :][:130]`` is
      the driver one-hot.
    * for each driver slot, ``out[i*60 :][:60]`` holds 21 position scores
      (argmax + 1; position 21 is treated as retired), one pit value at
      index 21 (not reported here), 6 status scores at 22..27 and the
      32-element lap time embedding at 28..59.

  Args:
    lap_in: 1-D CPU tensor with the model input for one lap.
    out: 1-D CPU tensor with the model output (or expected output).
    num_of_laps: accepted for backward compatibility; not used by the table.

  Returns:
    ``(circuit_name, location, country, df)`` where ``df`` has the columns
    code, driver, position, status and laptime, sorted by position and then
    lap time.
  """
  columns = ['code', 'driver', 'position', 'status', 'laptime']
  lapin = lap_in.detach().clone().numpy()
  output = out.detach().clone().numpy()
  _name, _loc, _country = circuit_info(np.argmax(lapin[:130]))

  rows = []
  for i in range(20):
    in_start = 131 + i * 196
    out_start = i * 60
    _d_idx = driver_unbed_id(driver_unbed(lapin[in_start : in_start + 130]))
    _num, _code, _fn, _ln, _, _ = driver_info(_d_idx)
    position = np.argmax(output[out_start : out_start + 21]) + 1
    _retired = bool(position == 21)
    _status = status_unbed(output[out_start + 22 : out_start + 28], _retired)
    # The remaining 32 of the 60 per-driver values are the lap time embedding.
    # This was commented out with a stale stride, which left `_time` undefined.
    _time = laptime_unbed(output[out_start + 28 : out_start + 60])
    rows.append({
        'code': f'{_code}',
        'driver': f'{_fn} {_ln}',
        'position': position,
        'status': _status,
        'laptime': _time
    })

  # Build the frame in one go; DataFrame.append no longer exists in pandas 2.
  df = pd.DataFrame(rows, columns=columns)
  # A single bool applies to both keys; a one-element list raised ValueError.
  df = df.sort_values(by=['position', 'laptime'], ascending=True)
  return _name, _loc, _country, df

In [None]:
# Select round 1 of the 2001 season for the inspection cells that follow.
ds.set_year(2001)
ds.set_round(1)

In [None]:
# Decode the sample at index 1, using its expected output as the "prediction";
# only the table is kept.
_, _, _, df = pos_df(ds[1][0], ds[1][1])

In [None]:
# Show the decoded table.
df

In [None]:
# Shape of one input tensor.
ds[0][0].shape

In [None]:
def out_to_in(in_, out_, random=False, num_of_laps=50):
  """Build the next input vector by writing a model output into an input.

  For each of the 20 driver slots (196 input values each, starting at index
  131) the 60 output values of that driver are copied to offsets 135..194 of
  the slot. Offset 156 (the copied pit value) is then replaced by a flag:
  1 when ``round(pit_value * num_of_laps) <= 1``, otherwise 0. With
  ``random=True`` offset 195 of every slot is set to a fresh uniform draw
  from [0, 20).

  Args:
    in_: CPU tensor with 4051 elements (any shape); it is not modified.
    out_: CPU tensor with 1200 elements (any shape).
    random: whether to randomise offset 195 of each driver slot.
    num_of_laps: scale applied to the pit value before rounding.

  Returns:
    A new 1-D float32 tensor of length 4051.
  """
  next_in = in_.detach().clone().numpy().reshape([4051])
  prev_out = out_.detach().clone().numpy().reshape([1200])
  for i in range(20):
    slot = 131 + i * 196
    src = i * 60
    next_in[slot + 135 : slot + 195] = prev_out[src : src + 60]
    if round(prev_out[src + 21] * num_of_laps) <= 1:
      next_in[slot + 130 + 26] = 1
    else:
      next_in[slot + 130 + 26] = 0
    if random:
      # Draw a scalar: assigning a size-1 array to a single element is
      # deprecated since NumPy 1.25.
      next_in[slot + 195] = np.random.uniform(0, 20)

  return torch.from_numpy(next_in).float()

In [None]:
# Feed the expected output of the first sample back into its own input.
outfromin = out_to_in(ds[0][0], ds[0][1])

In [None]:
# Print the rebuilt input tensor.
print(outfromin)

In [None]:
# Decode the rebuilt input together with the same expected output.
_, _, _, df2 = pos_df(outfromin, ds[0][1])

In [None]:
# Show the table decoded from the rebuilt input.
df2

In [None]:
class RacePredictionModel(nn.Module):
    """Stacked LSTM followed by a linear read-out applied at every time step.

    Inputs are batch-first: ``(batch, seq, input_size)``.
    """

    def __init__(self, input_size, output_size, lstm_hids, lstm_layers, dropout):
        """Create the layers and initialise their parameters.

        The read-out weight and all LSTM weights get Xavier-uniform
        initialisation; all LSTM biases are set to zero.

        Args:
            input_size: number of features per time step.
            output_size: number of values produced per time step.
            lstm_hids: hidden size of every LSTM layer.
            lstm_layers: number of stacked LSTM layers.
            dropout: dropout passed to ``nn.LSTM``.
        """
        super().__init__()

        self.input_size = input_size
        self.lstm_layers = lstm_layers
        self.lstm_hids = lstm_hids

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=lstm_hids,
            num_layers=lstm_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(lstm_hids, output_size)

        nn.init.xavier_uniform_(self.fc.weight.data)
        for param_name, tensor in self.lstm.named_parameters():
            if param_name.startswith('weight'):
                nn.init.xavier_uniform_(tensor)
            elif param_name.startswith('bias'):
                nn.init.constant_(tensor, 0.0)

    def zero_states(self, batchsize=1):
        """Return fresh all-zero ``(hidden, cell)`` states, created on the CPU."""
        state_shape = (self.lstm_layers, batchsize, self.lstm_hids)
        return (torch.zeros(*state_shape), torch.zeros(*state_shape))

    def forward(self, ins, prev_states=None):
        """Run the LSTM and the read-out.

        Args:
            ins: batch-first input tensor.
            prev_states: optional ``(hidden, cell)`` tuple; ``None`` lets the
                LSTM start from its default state.

        Returns:
            ``(outs, next_states)``: the per-step outputs and the LSTM states
            after the last step.
        """
        hidden_seq, next_states = self.lstm(ins, prev_states)
        return self.fc(hidden_seq), next_states

In [None]:
def run_train(model, ds, crit, opt, sched, start_year=2001, end_year=2021):
    """Train for one pass over the races of ``start_year`` .. ``end_year - 1``.

    Every sample of a race is fed as its own batch-of-one, length-one
    sequence; the recurrent state is carried from sample to sample within a
    race and reset to zeros at the start of each race. The optimiser steps
    after every sample and the scheduler once per call.

    Relies on the module-level ``device``.

    NOTE(review): with the default bounds the pass includes the 2020 season,
    which ``run_test`` evaluates on by default. Pass ``end_year=2020`` to keep
    that season out of training.

    Args:
        model: network exposing ``zero_states()`` and returning
            ``(outputs, states)`` from its forward call.
        ds: dataset exposing ``set_year``, ``set_round``, ``next_round``,
            ``year`` and ``round``.
        crit: loss function.
        opt: optimiser.
        sched: learning-rate scheduler.
        start_year: first season to train on.
        end_year: season at which to stop (exclusive).

    Returns:
        Mean loss over all processed samples.

    Raises:
        ZeroDivisionError: if no sample was processed.
    """
    model.train()
    total_loss, total_count = 0, 0

    ds.set_year(start_year)
    ds.set_round(1)

    while ds.year < end_year:
        hid_state, cell_state = model.zero_states()
        states = hid_state.to(device), cell_state.to(device)
        for i in range(len(ds)):
            opt.zero_grad()
            lap_in, lap_exp = ds[i]

            # add the batch and sequence axes expected by the batch-first LSTM
            lap_in = lap_in.unsqueeze(0).unsqueeze(0).float().to(device)
            lap_exp = lap_exp.unsqueeze(0).unsqueeze(0).float().to(device)

            lap_out, states = model(lap_in, states)
            loss = crit(lap_out, lap_exp)
            # cut the carried states out of the graph so back-propagation
            # never reaches into earlier samples
            for s in states:
                s.detach_()
            loss.backward()
            if math.isnan(loss):
                # debugging aid: report where the loss blew up and wait for Enter
                print('Loss is nan')
                print(f'Year: {ds.year}')
                print(f'round: {ds.round}')
                print(f'lap: {i}')
                print(lap_in)
                input()
            opt.step()
            total_loss += loss.item()
            total_count += 1
        ds.next_round()

    sched.step()
    return total_loss / total_count

def run_test(model, ds, crit, year=2020, race_round=1):
    """Evaluate the model on one race, feeding its own outputs back in.

    Only the first sample's input is taken from the dataset. Every later
    input is built with ``out_to_in`` from the previous input and the previous
    model output, while the expected output still comes from the dataset.

    Relies on the module-level ``device`` and ``out_to_in``.

    Args:
        model: network exposing ``zero_states()`` and returning
            ``(outputs, states)`` from its forward call.
        ds: dataset exposing ``set_year`` and ``set_round``.
        crit: loss function.
        year: season of the race to evaluate.
        race_round: round of the race to evaluate.

    Returns:
        Mean loss over the samples of the race.

    Raises:
        ZeroDivisionError: if the race has no samples.
    """
    model.eval()
    total_loss, total_count = 0, 0

    ds.set_year(year)
    ds.set_round(race_round)

    hid_state, cell_state = model.zero_states()
    states = hid_state.to(device), cell_state.to(device)

    # No gradients are needed for evaluation. Without this the graph keeps
    # growing through the carried states for the whole race.
    with torch.no_grad():
        for i in range(len(ds)):
            if i == 0:
                lap_in, lap_exp = ds[i]
            else:
                _, lap_exp = ds[i]
                lap_in = out_to_in(lap_in.cpu(), lap_out.cpu())
            # add the batch and sequence axes expected by the batch-first LSTM
            lap_in = lap_in.unsqueeze(0).unsqueeze(0).float().to(device)
            lap_exp = lap_exp.unsqueeze(0).unsqueeze(0).float().to(device)
            lap_out, states = model(lap_in, states)
            loss = crit(lap_out, lap_exp)
            total_loss += loss.item()
            total_count += 1

    return total_loss / total_count

def run_all(model, ds, crit, opt, sched, versionId, n_epochs=10):
    """Alternate training and evaluation for ``n_epochs`` epochs.

    After each epoch the train and test losses are written next to the
    progress bar. On every fifth epoch (0, 5, 10, ...) the model weights are
    saved to the Drive ``loss_folder``, with ``versionId``, the epoch and the
    test loss in the file name.

    Args:
        model, ds, crit, opt, sched: forwarded to ``run_train`` / ``run_test``.
        versionId: tag placed at the start of the checkpoint file name.
        n_epochs: number of epochs to run.
    """
    progress = tqdm(range(n_epochs), desc='epochs', unit='ep')
    for epoch in progress:
        train_loss = run_train(model, ds, crit, opt, sched)
        test_loss = run_test(model, ds, crit)
        tqdm.write(f'epoch {epoch}   train loss {train_loss:.6f}  test loss {test_loss:.6f}')
        if epoch % 5 != 0:
            continue
        checkpoint_path = f'/content/drive/My Drive/formula1_capstone/data/loss_folder/{versionId}-{epoch}-loss-{test_loss:.2f}.pth'
        torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Fresh dataset (and an empty cache) for training.
ds = RaceDataset(directory + 'races_npy/')

In [None]:
# Use the first GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# 4051 input features -> 2-layer LSTM with 1200 hidden units -> 1200 outputs
model = RacePredictionModel(4051, 1200, 1200, 2, 0.2)
model.to(device)
crit = nn.MSELoss().to(device)
opt = optim.Adam(model.parameters(), lr=0.001)
# step size 1 with gamma 0.2: the learning rate is multiplied by 0.2 on every
# sched.step(), i.e. once per run_train call
sched = optim.lr_scheduler.StepLR(opt, 1, gamma=0.2)

In [None]:
# Train and evaluate for 10 epochs; checkpoints are tagged with version id 3.
run_all(model, ds, crit, opt, sched, 3, 10)

In [None]:
# One more training pass; note this also updates the weights and steps the scheduler.
train_mse = run_train(model, ds, crit, opt, sched)

In [None]:
# Mean loss on the evaluation race (crit is MSELoss in this notebook).
test_mse = run_test(model, ds, crit)

In [None]:
# Show the mean training loss.
train_mse

In [None]:
# Show the mean test loss.
test_mse

In [None]:
# Square root of the mean test loss, i.e. an RMSE when crit is MSELoss.
test_rmse = test_mse ** 0.5
test_rmse

In [None]:
cpu = torch.device('cpu')
# map_location lets a checkpoint that was saved from a GPU model be loaded on
# a runtime without CUDA.
model.load_state_dict(torch.load('/content/drive/My Drive/formula1_capstone/data/loss_folder/3-5-loss-0.06.pth', map_location=cpu))
model.to(cpu)

In [None]:
# Move the model to the default CUDA device (requires a GPU runtime).
gpu = torch.device('cuda')
model.to(gpu)

In [None]:
# Roll the trained model forward from the first sample of 2020 round 16: one
# step on the real input, then 50 steps in which each output is written back
# into that input (with the random slot enabled) to form the next input.
cpu = torch.device('cpu')
model.load_state_dict(torch.load('/content/drive/My Drive/formula1_capstone/data/loss_folder/3-5-loss-0.06.pth', map_location=device))
model.eval()
ds.set_year(2020)
ds.set_round(16)
p, n = ds[0]
p = p.to(device)
hid_state, cell_state = model.zero_states()
states = (hid_state.to(device), cell_state.to(device))
# Inference only: no_grad stops the graph from growing through the carried states.
with torch.no_grad():
  out, s = model(p.unsqueeze(0).unsqueeze(0).float(), states)
  out = out.squeeze()
  for i in range(50):
    out, s = model(out_to_in(p.to(cpu), out.to(cpu), True).to(device).unsqueeze(0).unsqueeze(0).float(), s)
    out = out.squeeze()
# Decode only the final prediction; earlier tables were overwritten anyway.
a, b, c, d = pos_df(p.to(cpu), out.to(cpu), len(ds))
out = out.detach().to(cpu)
# Cell output: the table decoded from the expected output of the first sample.
pos_df(p.to(cpu), n.to(cpu))

In [None]:
# Show the table decoded from the last rollout step.
d