In [1]:
from selenium import webdriver
import pandas as pd
# import matplotlib
import seaborn as sns # visualization
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import math
import statistics
import resources # custom libraries
import numpy as np
from tensorflow import keras
import tensorflow as tf
from datetime import datetime

## Scrapping the data
- Entering url and stracting data as a Pandas df
- After scrapping the data, df will be stored as CSV file
- If data has been already scrapped, data is stracted from CSV

In [2]:
training = False

if training:
    today = str(datetime.today().strftime('%d-%m'))
    # Getting data
    df = resources.data_scraper.scrape()
    # Casting columns
    df[df.columns[2]] = df[df.columns[2]].astype(np.int64)
    df[df.columns[3]] = df[df.columns[3]].astype(np.int64)
    # Adding total_score column
    df["TotalScore"] = df[df.columns[2]] + df[df.columns[3]]
    # Store CSV
    df.to_csv('data/{}.csv'.format(today), encoding='utf-8', index=False)
else:
    today = '02-05'
    df = pd.read_csv('data/{}.csv'.format(today))

In [3]:
df.head()

Unnamed: 0,HomeTeam,AwayTeam,ScoreHome,ScoreAway,TotalScore
0,Los Angeles Clippers,Denver Nuggets,104,110,214
1,Utah Jazz,Toronto Raptors,106,102,208
2,Dallas Mavericks,Washington Wizards,125,124,249
3,Atlanta Hawks,Chicago Bulls,108,97,205
4,Cleveland Cavaliers,Miami Heat,107,124,231


#### Heatmap

In [4]:
# resources.graph.heatmap(df, f_size=10, cmap="afmhot_r",vmin=150, vmax=250)

### Frecuency histogram
- To know how many divisions or 'bins' should be done in the histogram, we must use Sturges law

$$ c = 1 + \dfrac{log(M)}{log(2)}$$

In [5]:
# resources.graph.frecuency_histogram(df)


#### Boxplot

In [6]:
# plt.boxplot(df['TotalScore'])
# plt.title("Total score distribution")

#### Normal dfistribution

In [7]:
# resources.graph.normal_distribution(df, 'Atlanta Hawks')

## Data analysis
- Analysing data
- Store results in a new df

In [8]:
stats_df = resources.nba_stats.stats(df)
stats_df.head()

Unnamed: 0,team,mean,P25,P75,factorDown,factorUp,WfactorDown,WfactorUp
0,Los Angeles Clippers,222.132353,210.0,234.0,8.823529,8.823529,61.438762,53.889101
1,Utah Jazz,222.703125,208.0,235.25,7.8125,9.375,24.282383,63.549351
2,Dallas Mavericks,221.859375,209.75,236.0,14.0625,10.9375,97.527389,115.52431
3,Atlanta Hawks,222.323077,209.0,234.0,7.692308,6.153846,24.102487,83.719822
4,Cleveland Cavaliers,213.123077,198.0,226.0,23.076923,3.076923,152.290171,7.896327


Correlation analysis

In [9]:
# x_col = "factorDown"
# y_col = "factorUp"
# z_col = "mean"

# # fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)
# sub_df = stats_df.loc[:, [x_col, y_col, z_col]]
# # sub_df.drop_duplicates(subset=x_col, inplace=True)
# # sub_df.drop_duplicates(subset=y_col, inplace=True)
# pivoted = sub_df.pivot(x_col, y_col, z_col)

# ax = sns.heatmap(pivoted)

# plt.tight_layout()
# plt.show()

## Training model

In [10]:
def replace_teams(df, stats_df):
    merged = pd.merge(left=df.iloc[:, :2], right=stats_df, how='inner',
                     left_on='HomeTeam', right_on='team')

    merged = pd.merge(left=merged, right=stats_df, how='inner',
                     left_on='AwayTeam', right_on='team')

    X = merged.iloc[:, np.r_[6:10, 11:18]]

    X = X.iloc[:, [0,1,2   ,7,8,9]]
    return X

In [11]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe.fit(df[['TotalScore']])


X = replace_teams(df, stats_df)

y = pd.get_dummies(df['TotalScore'], prefix='score')
y = ohe.transform(df[['TotalScore']]).toarray()

In [12]:
from sklearn.preprocessing import MinMaxScaler, normalize
x_scaler = MinMaxScaler().fit(X)

if training:# Normalize data
    X_norm = x_scaler.transform(X)

In [13]:
if training: # Create training and validations sets
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.3, random_state=42)

In [14]:
if training: # Train the model 
    # Create you neural net
    from keras.models import Sequential
    from keras.layers import Dense
    model = Sequential()
    model.add(Dense(200, input_dim=6, activation='relu')) # 14
    # model.add(Dense(16, activation='relu'))
    model.add(Dense(150, activation='relu'))
    model.add(Dense(120, activation='relu'))
    model.add(Dense(98, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Train model
    model.fit(X_norm, y, epochs=1400, batch_size=50, shuffle=True, verbose=2)
    # Store model
    model.save('./models/{}'.format(today))
else: # Get stored model
    model = keras.models.load_model('./models'.format(today))

Analysing model

In [15]:
if training:
    predictions = model.predict(X_test)

    decoded_test = ohe.inverse_transform(y_test)
    decoded_prediction = ohe.inverse_transform(predictions)

    data = np.concatenate((decoded_test, decoded_prediction), axis=1)

    comparation = pd.DataFrame(data=data, columns=["Train", "Prediction"])
    comparation['Error'] = abs(comparation['Train'] - comparation['Prediction'])
    comparation = comparation.astype('int32')

    print('The maximum error:', comparation['Error'].max())
    print('The average error:', comparation['Error'].mean())
    print('The dispersion of data:', comparation['Error'].std())

    comparation.head(3)

## Manually studying data

In [16]:
future_games = [
    ['BKN', 'MIL', 242],
    ['POR', 'BOS', 234],
    ['MIA', 'CHA', 211],
    ['NY', 'HOU', 217],
    ['PHI', 'SA', 180],
    ['PHX', 'OKC', 222],
    ['SAC', 'DAL', 0],
    ['TOR', 'LAL', 217],  
]

new_games = pd.DataFrame(columns = ['HomeTeam', 'AwayTeam', 'BetScore'])
               
for game in future_games:
    home_team = resources.teams.conversion_dict[game[0]]
    away_team = resources.teams.conversion_dict[game[1]]
    game = [home_team, away_team, game[2]]
    resources.df_helper.append_row(new_games, game)
    
today_games = replace_teams(new_games, stats_df)
today_games_norm = x_scaler.transform(today_games)

predictions = model.predict(today_games_norm)
decoded_predictions = ohe.inverse_transform(predictions)

new_games['Predictions'] = decoded_predictions
new_games['Confidence'] = abs(new_games['BetScore'] - new_games['Predictions'])

new_games

Unnamed: 0,HomeTeam,AwayTeam,BetScore,Predictions,Confidence
0,Brooklyn Nets,Milwaukee Bucks,242,206,36
1,Portland Trail Blazers,Boston Celtics,234,183,51
2,Miami Heat,Charlotte Hornets,211,211,0
3,New York Knicks,Houston Rockets,217,220,3
4,Philadelphia 76ers,San Antonio Spurs,180,216,36
5,Phoenix Suns,Oklahoma City Thunder,222,229,7
6,Sacramento Kings,Dallas Mavericks,0,233,233
7,Toronto Raptors,Los Angeles Lakers,217,212,5
