# Betting Bot

Building functions that run over the model's predictions and calculate the betting wins and losses.

## Setup

In [None]:
# Clone the soccer_bets repository; the CSV loaded later in this notebook is read from soccer_bets/data/.
!git clone https://github.com/guggio/soccer_bets.git

### Imports

In [2]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dropout, Dense
from sklearn.preprocessing import RobustScaler

In [None]:
import seaborn as sns
from pylab import rcParams
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rc
from  matplotlib.ticker import FuncFormatter

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Notebook-wide plot defaults: seaborn whitegrid theme with enlarged fonts,
# and 22x10 inch figures.
sns.set(
    style='whitegrid',
    palette='muted',
    font_scale=1.5,
)

rcParams['figure.figsize'] = (22, 10)

In [3]:
# Seed every random number generator used in this notebook, following
# https://keras.io/getting_started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development.
# Unfortunately, the results are still not reproducible once the runtime is reset.
# NOTE(review): `os` is imported below but not used in this cell - presumably a
# leftover from the FAQ recipe; TODO confirm whether it is still needed.

import numpy as np
import tensorflow as tf
import random as python_random
import os

# Put NumPy's global random number generator into a well-defined initial state.
np.random.seed(42)

# Put Python's built-in `random` module into a well-defined initial state.
python_random.seed(42)

# Give random number generation in the TensorFlow backend a well-defined
# initial state. For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/random/set_seed
tf.random.set_seed(42)

## Loading the data

In [4]:
# Path of the CSV file that is loaded as the notebook's data set (relative to the working directory).
dataset_18_19 = 'soccer_bets/data/Bundesliga_18_19_complete_B365.csv'

In [5]:
# Load the CSV into the DataFrame that the following cells extend and model.
data = pd.read_csv(dataset_18_19)

In [None]:
# Preview the first rows of the loaded data.
data.head()

In [7]:
# Signed score difference per match: FTHG minus FTAG
# (presumably full-time home goals minus full-time away goals - TODO confirm the column meaning).
data['Score'] = data['FTHG'] - data['FTAG']

In [None]:
# Number every match with its round. This relies on the rows being ordered by
# round, with every round made up of exactly MATCHES_PER_ROUND consecutive rows.
# The column is computed in one vectorised step: the former row-by-row chained
# assignment (data['Round'][i] = ...) raises SettingWithCopyWarning and has no
# effect at all once pandas copy-on-write is active.
MATCHES_PER_ROUND = 9

# Rows 0-8 -> round 1, rows 9-17 -> round 2, ...; kept as float to match the
# dtype the column had before.
data['Round'] = (np.arange(len(data)) // MATCHES_PER_ROUND + 1).astype(float)


## Formatting the data

In [9]:
# Team dictionary
# Team dictionary: map every team name to a stable integer id.
# The names are sorted so that the ids are identical on every run (iterating a
# set of strings has no guaranteed order across interpreter sessions, which
# would undermine the seeding done earlier). Away teams are included so that a
# team missing from the HomeTeam column cannot raise a KeyError below.
teams = sorted(set(data['HomeTeam']) | set(data['AwayTeam']))
n_teams = len(teams)
teamToIdx = {team: idx for idx, team in enumerate(teams)}
homeId = [teamToIdx[team] for team in data['HomeTeam']]
data['Home_Id'] = homeId
awayId = [teamToIdx[team] for team in data['AwayTeam']]
data['Away_Id'] = awayId

In [10]:
# Fit a RobustScaler on the score differences and keep the fitted scaler in
# `score_transformer`, then overwrite the Score column with its scaled values.
score_transformer = RobustScaler().fit(data[['Score']])
data['Score'] = score_transformer.transform(data[['Score']])

In [11]:
# Columns that are selected from `data` for the betting simulation.
data_considered = [
    'Round', 'Home_Id', 'Away_Id', 'Score', 'FTR',
    'B365H', 'B365D', 'B365A',
]

In [None]:
# Keep only the columns listed in data_considered and preview the first rows.
used_data = data[data_considered]
used_data.head()

## First Modeling and Evaluation
Let's build an LSTM, train it on the first five rounds, and use the result to determine how many training epochs the real model needs afterwards.

In [13]:
def create_dataset(X, y):
    """Convert pandas features and targets into NumPy arrays.

    Args:
        X: DataFrame holding one row per sample.
        y: Series holding one target value per row of ``X``.

    Returns:
        Tuple ``(Xs, ys)``. ``Xs`` is a 2-D array with one row per sample and
        ``ys`` is a 1-D array with the targets. For empty input ``Xs`` keeps
        its column dimension, i.e. has shape ``(0, n_columns)``.
    """
    # One vectorised conversion instead of a Python loop over ``.iloc`` rows;
    # copy=True keeps the returned arrays independent of the source frame.
    return X.to_numpy(copy=True), y.to_numpy(copy=True)

In [14]:
# Trial training set: every match played before round 6, i.e. rounds 1 to 5.
train_set = data.loc[data['Round'] < 6]
X_train, y_train = create_dataset(
    train_set[['Home_Id', 'Away_Id']],
    train_set.Score,
)

In [15]:
# Trial network: team-id embedding -> LSTM -> dropout -> one output unit,
# compiled with the Adam optimizer and mean squared error as loss.
model = Sequential([
    Embedding(n_teams+1, 9, input_length=2),
    LSTM(units=128, recurrent_dropout=0.2),
    Dropout(rate=0.2),
    Dense(units=1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Train the trial model without shuffling; validation_split=0.1 holds back a
# tenth of the training samples, whose loss ends up as val_loss in `history`.
history = model.fit(
    X_train,
    y_train,
    epochs=25,
    batch_size=9,
    validation_split=0.1,
    shuffle=False,
)

In [None]:
# Learning curves of the trial run: training loss against the loss on the
# samples held back through validation_split.
plt.plot(history.history['loss'], label='train')
# val_loss is measured on the validation split, so it is labelled as such.
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend();

## Putting it all together

In [22]:
def create_dataset(X, y):
    """Convert pandas features and targets into NumPy arrays.

    Args:
        X: DataFrame holding one row per sample.
        y: Series holding one target value per row of ``X``.

    Returns:
        Tuple ``(Xs, ys)``. ``Xs`` is a 2-D array with one row per sample and
        ``ys`` is a 1-D array with the targets. For empty input ``Xs`` keeps
        its column dimension, i.e. has shape ``(0, n_columns)``.
    """
    # One vectorised conversion instead of a Python loop over ``.iloc`` rows;
    # copy=True keeps the returned arrays independent of the source frame.
    return X.to_numpy(copy=True), y.to_numpy(copy=True)

In [23]:
def create_train_test_data(data, round):
  """Split the matches into all earlier rounds (training) and one round (test).

  Args:
    data: DataFrame with the columns Round, Home_Id, Away_Id and Score.
    round: Round used as test set; rows with a smaller Round form the
      training set.

  Returns:
    Dict with the two DataFrame slices ('train_set', 'test_set') and the arrays
    built from them by create_dataset ('X_train', 'y_train', 'X_test',
    'y_test').
  """
  feature_columns = ['Home_Id', 'Away_Id']

  train_set = data[data['Round'] < round]
  test_set = data[data['Round'] == round]

  X_train, y_train = create_dataset(train_set[feature_columns], train_set.Score)
  X_test, y_test = create_dataset(test_set[feature_columns], test_set.Score)

  return dict(
      train_set=train_set,
      test_set=test_set,
      X_train=X_train,
      y_train=y_train,
      X_test=X_test,
      y_test=y_test,
  )

In [24]:
def create_model(n_teams, lstm_units=128, optimizer='adam', loss='mean_squared_error', dropout_rate=0.2, input_length=2, batchsize=9):
  """Build and compile the embedding + LSTM regression network.

  Args:
    n_teams: The embedding layer is created with n_teams + 1 input entries.
    lstm_units: Number of units of the LSTM layer.
    optimizer: Optimizer handed to compile().
    loss: Loss handed to compile().
    dropout_rate: Used both as the LSTM's recurrent dropout and as the rate of
      the Dropout layer.
    input_length: input_length argument of the embedding layer.
    batchsize: Despite its name this is passed as the second positional
      argument of Embedding, i.e. it sets the embedding's output size, not a
      training batch size.

  Returns:
    The compiled Sequential model.
  """
  network = Sequential([
      Embedding(n_teams+1, batchsize, input_length=input_length),
      LSTM(units=lstm_units, recurrent_dropout=dropout_rate),
      Dropout(rate=dropout_rate),
      Dense(units=1),
  ])
  network.compile(optimizer=optimizer, loss=loss)
  return network

In [25]:
def train_model(model, X, y, epochs=20, batch_size=9):
  """Fit ``model`` on ``X``/``y`` without shuffling and return fit()'s result.

  Args:
    model: Object exposing a Keras-style ``fit`` method.
    X: Training inputs.
    y: Training targets.
    epochs: Forwarded to ``fit`` as ``epochs``.
    batch_size: Forwarded to ``fit`` as ``batch_size``.

  Returns:
    Whatever ``model.fit`` returns.
  """
  # shuffle=False keeps the samples in the order in which they were passed in.
  return model.fit(X, y, epochs=epochs, batch_size=batch_size, shuffle=False)

In [26]:
def predict_results(model, X):
  """Return the predictions of ``model`` for the inputs ``X``.

  Args:
    model: Object exposing a ``predict`` method.
    X: Inputs handed to ``model.predict``.

  Returns:
    Whatever ``model.predict(X)`` returns.
  """
  return model.predict(X)

In [27]:
def retransform_y_data(y_test, y_pred):
  """Map scaled actual and predicted scores back to the original score scale.

  Args:
    y_test: Array with the scaled actual scores.
    y_pred: Array with the scaled predicted scores.

  Returns:
    Dict with 'y_actual' and 'y_pred', each an array of shape (n_samples, 1)
    on the scale the module-level ``score_transformer`` was fitted on.
  """
  # score_transformer was fitted on a single column, so both inputs are handed
  # over as (n_samples, 1) columns. The former (1, n_samples) row for y_test
  # only worked because the one-element scale/center happened to broadcast.
  y_actual = score_transformer.inverse_transform(np.asarray(y_test).reshape(-1, 1)).reshape(-1,1)
  y_pred = score_transformer.inverse_transform(np.asarray(y_pred).reshape(-1, 1)).reshape(-1,1)
  return {'y_actual':y_actual, 'y_pred':y_pred}

In [28]:
def calculate_bets(prediction_data, threshold=1.1):
  """Decide per match whether to bet and compute the resulting payout.

  A one-unit bet on the home side is placed when the predicted score
  difference is at least ``threshold``, and on the away side when it is at
  most ``-threshold``. A winning home bet pays the match's B365H value, a
  winning away bet its B365A value (presumably the bookmaker's decimal odds -
  TODO confirm); a losing bet pays 0. No bet is placed otherwise.

  Args:
    prediction_data: DataFrame with the columns prediction, Score (actual
      score difference), B365H and B365A.
    threshold: Minimum absolute predicted score difference that triggers a
      bet. Defaults to 1.1, the value that used to be hard-coded.

  Returns:
    Dict with two lists of one entry per row: 'invested' (1 if a bet was
    placed, else 0) and 'won' (payout of the bet, 0 if lost or not placed).
  """
  invested, won = list(), list()

  matches = zip(prediction_data['prediction'], prediction_data['Score'],
                prediction_data['B365H'], prediction_data['B365A'])
  for predicted, actual, home_payout, away_payout in matches:
    # The actual score difference went through a scale/inverse-scale round
    # trip, so a one-goal win may arrive as 0.999...; comparing against +-0.5
    # is equivalent for whole-goal differences and immune to that rounding.
    if predicted >= threshold:
      invested.append(1)
      won.append(home_payout if actual > 0.5 else 0)
    elif predicted <= -threshold:
      invested.append(1)
      won.append(away_payout if actual < -0.5 else 0)
    else:
      invested.append(0)
      won.append(0)
  return {'invested':invested, 'won':won}

In [29]:
def create_prediction_dataframe(data:pd.DataFrame, y_actual_pred):
  """Combine one round's matches with predictions and betting results.

  Args:
    data: DataFrame of the matches that were predicted; must contain the
      columns Round, Home_Id, Away_Id, Score, B365H, B365D and B365A.
    y_actual_pred: Dict with the arrays 'y_actual' and 'y_pred', one value per
      row of ``data``.

  Returns:
    New DataFrame with the selected columns, Score replaced by 'y_actual', and
    the added columns prediction, invested and won.
  """
  columns = ['Round', 'Home_Id', 'Away_Id', 'Score', 'B365H', 'B365D', 'B365A']
  # Work on an explicit copy: assigning columns to a slice of the caller's
  # frame raises SettingWithCopyWarning and may not behave as intended.
  pred_data = data[columns].copy()
  # Flatten the (n, 1) arrays so that each becomes an ordinary 1-D column.
  pred_data['Score'] = np.asarray(y_actual_pred['y_actual']).ravel()
  pred_data['prediction'] = np.asarray(y_actual_pred['y_pred']).ravel()
  bet_data = calculate_bets(pred_data)
  pred_data['invested'] = bet_data['invested']
  pred_data['won'] = bet_data['won']
  return pred_data

In [30]:
def beat_the_bookie(data, start_round=6, n_teams=17):
  """Simulate betting round by round with a freshly trained model per round.

  For every round from ``start_round`` up to ``n_teams * 2`` a new model is
  trained on all earlier rounds, the round itself is predicted, and the bets
  resulting from those predictions are evaluated.

  Args:
    data: DataFrame with the columns Round, Home_Id, Away_Id, Score, B365H,
      B365D and B365A.
    start_round: First round to bet on; earlier rounds only serve as training
      data.
    n_teams: Handed to create_model and doubled to obtain the last round.
      NOTE(review): the default of 17 yields 34 rounds and an embedding with
      18 entries - presumably chosen to fit an 18-team season; TODO confirm.

  Returns:
    DataFrame with the per-match predictions and bets of all simulated rounds.

  Raises:
    ValueError: Raised by pd.concat when no round produced any bets.
  """
  total_rounds_in_season = n_teams * 2
  calculated_bets = list()
  for current_round in range(start_round, total_rounds_in_season + 1):
    current_data = create_train_test_data(data, current_round)
    if len(current_data['X_test']) == 0:
      # No matches for this round (e.g. n_teams * 2 exceeds the rounds that
      # are present in the data): nothing to predict or bet on.
      continue
    model = create_model(n_teams)
    train_model(model, X=current_data['X_train'], y=current_data['y_train'])
    y_pred = predict_results(model, current_data['X_test'])
    retransformed_y_data = retransform_y_data(current_data['y_test'], y_pred)
    calculated_bets.append(create_prediction_dataframe(current_data['test_set'], retransformed_y_data))
  return pd.concat(calculated_bets)

## Let's beat the bookmakers and evaluate our performance

In [None]:
# Run the round-by-round betting simulation with the default start round and n_teams.
bet_data = beat_the_bookie(used_data)

In [None]:
# Preview the first ten simulated matches.
bet_data.head(10)

In [None]:
# Total stake: one unit is recorded for every bet that was placed.
bet_data['invested'].sum()

In [None]:
# Total payout: the B365H/B365A value of every winning bet; lost bets count as 0.
bet_data['won'].sum()

In [None]:
# Show only the matches on which a bet was placed.

# Raise the display limit to 999 rows so that the listing below is not truncated.
pd.options.display.max_rows = 999

bet_data_only_invested = bet_data.loc[bet_data['invested'] != 0]
bet_data_only_invested

## Presenting the Results

In [36]:
import seaborn as sns
from pylab import rcParams
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rc
from  matplotlib.ticker import FuncFormatter

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Plot defaults for the result figures: seaborn whitegrid theme with enlarged
# fonts, and 22x10 inch figures.
sns.set(
    style='whitegrid',
    palette='muted',
    font_scale=1.5,
)

rcParams['figure.figsize'] = (22, 10)

In [37]:
# Keep only the columns needed for the result plots.
df = bet_data[['Round', 'invested', 'won']]

In [38]:
# Units invested and won per round.
df_winnings = df[['Round', 'invested', 'won']]
df_winnings_by_round = df_winnings.groupby('Round', as_index=False).sum()

# Running net result after each round (won minus invested), computed with a
# vectorised cumulative sum instead of a manual accumulation loop.
net_winnings = (
    df_winnings_by_round['won'] - df_winnings_by_round['invested']
).cumsum().tolist()

df_winnings_by_round['net_winnings'] = net_winnings

In [39]:
# Long format for the grouped bar chart: one row per match and variable
# ('invested' / 'won'), then summed per round and variable.
# The frame is melted from bet_data instead of from df itself, so re-running
# this cell does not try to melt an already melted frame.
df = pd.melt(
    bet_data[['Round', 'invested', 'won']],
    id_vars=['Round'],
).sort_values(['variable', 'value'])

df_by_round = df.groupby(['Round', 'variable']).sum()
df_by_round = df_by_round.reset_index()

In [None]:
# Two stacked panels: units invested/won per round on top, running net
# winnings below.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(22, 16))

# Top panel: grouped bars, one bar per variable and round.
sns.barplot(data=df_by_round, x='Round', y='value', hue='variable', ax=ax1)
ax1.set(title='Units invested and won per Round', ylabel='Bet Units')

# Bottom panel: cumulative net winnings over the rounds.
sns.lineplot(data=df_winnings_by_round, x='Round', y='net_winnings', ax=ax2)
ax2.set(title='Net Winnings', ylabel='Bet Units')

# Extra vertical space so the lower title does not collide with the upper axis.
fig.subplots_adjust(hspace=0.5)