In [3]:
# EPL

from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from scipy.stats import gamma
from sklearn.neighbors import NearestNeighbors
import seaborn as sns


# Code block will analyze EPL data
# Load the data into variables
spi_global_ranking = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_global_rankings.csv')
spi_matches = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_matches.csv')
spi_matches_latest = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_matches_latest.csv')

EPL_match_odds_2122 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/EPL/2021-2022.csv')
EPL_match_odds_2021 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/EPL/2020-2021.csv')
EPL_match_odds_1920 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/EPL/2019-2020.csv')
EPL_match_odds_1819 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/EPL/2018-2019.csv')
EPL_match_odds_1718 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/EPL/2017-2018.csv')
EPL_match_odds_1617 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/EPL/2016-2017.csv')

# This section stores the data I want to use regarding betting markets in a new dataframe
keep_cols = ['HomeTeam', 'AwayTeam', 'B365H', 'B365A', 'B365D', 'FTR', 'HTR']
# Each season is sorted by home/away team on its own, then the seasons are stacked oldest first.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; like append,
# it keeps every season's original index labels.
EPL_match_odds = pd.concat([
    season_odds[keep_cols].sort_values(['HomeTeam', 'AwayTeam'])
    for season_odds in (EPL_match_odds_1617, EPL_match_odds_1718, EPL_match_odds_1819,
                        EPL_match_odds_1920, EPL_match_odds_2021, EPL_match_odds_2122)
])

# This section converts the data into usable forms
# Converting between decimal odds and implied probability is just y = 1/x
for odds_col in ('B365A', 'B365D', 'B365H'):
    EPL_match_odds[odds_col] = 1 / EPL_match_odds[odds_col]

# This changes any necessary team names so they are sorted exactly the same in both dataframes
spi_matches_league = spi_matches[spi_matches['league'] == 'Barclays Premier League']
spi_matches_league = spi_matches_league.replace(to_replace='AFC Bournemouth', value='Bournemouth').sort_values(['season', 'team1', 'team2'])

# Arrays to build the game_data DataFrame
# Every column is converted once and the first 10 rows are skipped.
# NOTE(review): the reason for skipping 10 rows is not visible in this cell - confirm.
# NOTE(review): the match columns and the odds columns are combined purely by row position
# when game_data is built, so both frames must have the same length and ordering - confirm.
season = list(np.array(spi_matches_league['season'])[10:])
date = list(np.array(spi_matches_league['date'])[10:])
hometeam = list(np.array(spi_matches_league['team1'])[10:])
home_goals = list(np.array(spi_matches_league['xg1'])[10:])
away_goals = list(np.array(spi_matches_league['xg2'])[10:])
awayteam = list(np.array(spi_matches_league['team2'])[10:])
result = list(np.array(EPL_match_odds['FTR'])[10:])
home_odds = list(np.array(EPL_match_odds['B365H'])[10:])
away_odds = list(np.array(EPL_match_odds['B365A'])[10:])
draw_odds = list(np.array(EPL_match_odds['B365D'])[10:])
# pythag = xg1^2 / (xg1^2 + xg2^2), computed element-wise
home_sq = np.multiply(home_goals, home_goals)
away_sq = np.multiply(away_goals, away_goals)
pythag = np.divide(home_sq, np.add(home_sq, away_sq))

# Object to handle data manipulation; initialized to create ratings
# Assumes spi_matches_league is the df being passed
class CalcTeamRatings:
  """Rolling xG ratings for one fixture, built from matches before `date`.

  `df` must provide the columns 'date', 'team1', 'team2', 'xg1' and 'xg2'.
  Rows dated on/after `date` and rows containing any missing value are
  discarded. `home_df` keeps the last 30 remaining rows where `home` is
  team1; `away_df` keeps the last 30 remaining rows where `away` is team2.
  """

  def __init__(self, home, away, date, df):
    self.home = home
    self.date = date
    self.away = away
    earlier = df[df['date'] < self.date].sort_values(['date'])
    self.df = earlier.dropna()
    self.home_df = self.df[self.df['team1'] == self.home].tail(30)
    self.away_df = self.df[self.df['team2'] == self.away].tail(30)

  @staticmethod
  def _blend(values):
    """Combine older and most recent form of one xG column.

    20+ rows: average of mean(first 20) and mean(last 10).
    10-19 rows: average of mean(last 10) and mean(all).
    Fewer than 10 rows: plain mean (NaN when there are no rows).
    """
    played = len(values)
    if played >= 20:
      return (values.head(20).mean() + values.tail(10).mean()) / 2
    if played >= 10:
      return (values.tail(10).mean() + values.mean()) / 2
    return values.mean()

  def rollingAvgXG(self):
    """Blended xg1 over the home side's rows in home_df."""
    return self._blend(self.home_df['xg1'])

  def rollingAvgXGA(self):
    """Blended xg2 over the home side's rows in home_df."""
    return self._blend(self.home_df['xg2'])

  def away_rollingAvgXG(self):
    """Blended xg2 over the away side's rows in away_df."""
    return self._blend(self.away_df['xg2'])

  def away_rollingAvgXGA(self):
    """Blended xg1 over the away side's rows in away_df."""
    return self._blend(self.away_df['xg1'])

# League-wide average xG for home (xg1) and away (xg2) sides.
# mean() ignores missing xG values in both the total and the count; the previous
# sum()/len() divided by every row, including rows that have no xG value.
lg_avg_home = spi_matches_league['xg1'].mean()
lg_avg_away = spi_matches_league['xg2'].mean()

# One row per match; match columns and odds columns are paired by row position.
game_data = pd.DataFrame({'Season': np.array(season),
                          'Date': np.array(date),
                          'Home Team': np.array(hometeam),
                          'Away Team': np.array(awayteam),
                          'Home Goals': np.array(home_goals),
                          'Away Goals': np.array(away_goals),
                          'Goal Difference': np.array(home_goals) - np.array(away_goals),
                          'Pythag': np.array(pythag),
                          'Result': np.array(result),
                          'Home Odds': np.array(home_odds),
                          'Away Odds': np.array(away_odds),
                          'Draw Odds': np.array(draw_odds)})

# This slice creates a sample that can be used to generate better atk/def ratings
# Keep in mind that this extra data used to create ratings does not apply to promoted teams
game_data = game_data[game_data['Season'] > 2016].reset_index(drop=True)

# One CalcTeamRatings per remaining match, built from that match's teams and date.
# NOTE(review): the 'Object' column holds the instances themselves and is written to
# the CSV below as-is - confirm that column is wanted in the export.
game_data['Object'] = [
    CalcTeamRatings(home, away, match_date, spi_matches_league)
    for home, away, match_date in zip(game_data['Home Team'], game_data['Away Team'], game_data['Date'])
]
game_data['Home Atk'] = [obj.rollingAvgXG() for obj in game_data['Object']]
game_data['Home Def'] = [obj.rollingAvgXGA() for obj in game_data['Object']]
game_data['Away Atk'] = [obj.away_rollingAvgXG() for obj in game_data['Object']]
game_data['Away Def'] = [obj.away_rollingAvgXGA() for obj in game_data['Object']]

# A rating of exactly 0 is treated as missing so the row is dropped together with
# every other row that contains a NaN.
rating_cols = ['Home Atk', 'Home Def', 'Away Atk', 'Away Def']
game_data[rating_cols] = game_data[rating_cols].replace(to_replace=0, value=np.nan)
game_data = game_data.dropna()
# Any remaining exact 0 anywhere in the frame becomes 0.01.
game_data = game_data.reset_index(drop=True).replace(to_replace=0, value=0.01)

game_data.to_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Processed Data/EPL.csv')
spi_matches_league.to_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Processed Data/spi_matches_EPL.csv')

In [4]:
# Eredivisie

from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from scipy.stats import gamma
from sklearn.neighbors import NearestNeighbors
import seaborn as sns


# Code block will analyze Eredivisie data
# Load the data into variables
spi_global_ranking = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_global_rankings.csv')
spi_matches = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_matches.csv')
spi_matches_latest = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_matches_latest.csv')

Eredivisie_match_odds_2122 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Eredivisie/2021-2022.csv')
Eredivisie_match_odds_2021 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Eredivisie/2020-2021.csv')
Eredivisie_match_odds_1920 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Eredivisie/2019-2020.csv')
Eredivisie_match_odds_1819 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Eredivisie/2018-2019.csv')
Eredivisie_match_odds_1718 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Eredivisie/2017-2018.csv')

# This section stores the data I want to use regarding betting markets in a new dataframe
keep_cols = ['HomeTeam', 'AwayTeam', 'B365H', 'B365A', 'B365D', 'FTR', 'HTR']
# Each season is sorted by home/away team on its own, then the seasons are stacked oldest first.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; like append,
# it keeps every season's original index labels.
Eredivisie_match_odds = pd.concat([
    season_odds[keep_cols].sort_values(['HomeTeam', 'AwayTeam'])
    for season_odds in (Eredivisie_match_odds_1718, Eredivisie_match_odds_1819, Eredivisie_match_odds_1920,
                        Eredivisie_match_odds_2021, Eredivisie_match_odds_2122)
])

# This section converts the data into usable forms
# Converting between decimal odds and implied probability is just y = 1/x
for odds_col in ('B365A', 'B365D', 'B365H'):
    Eredivisie_match_odds[odds_col] = 1 / Eredivisie_match_odds[odds_col]

# This changes any necessary team names so they are sorted exactly the same in both dataframes
spi_matches_league = spi_matches[spi_matches['league'] == 'Dutch Eredivisie'].reset_index(drop=True)
# Keep only rows that have a value in score1
spi_matches_league = spi_matches_league[spi_matches_league['score1'].notna()]
# One whole-frame replace with an {old: new} mapping; only cells equal to a key are changed
spi_matches_league = spi_matches_league.replace({
    'FC Groningen': 'Groningen',
    'FC Twente': 'Twente',
    'FC Utrecht': 'Utrecht',
    'De Graafschap': 'Graafschap',
    'PEC Zwolle': 'Zwolle',
    'Emmen': 'FC Emmen',
    'RKC': 'Waalwijk',
    'NEC': 'Nijmegen',
    'ADO Den Haag': 'Den Haag',
}).sort_values(['season', 'team1', 'team2'])

# Arrays to build the game_data DataFrame
# Every column is converted once and the first 10 rows are skipped.
# NOTE(review): the reason for skipping 10 rows is not visible in this cell - confirm.
# NOTE(review): the match columns and the odds columns are combined purely by row position
# when game_data is built, so both frames must have the same length and ordering - confirm.
season = list(np.array(spi_matches_league['season'])[10:])
date = list(np.array(spi_matches_league['date'])[10:])
hometeam = list(np.array(spi_matches_league['team1'])[10:])
home_goals = list(np.array(spi_matches_league['xg1'])[10:])
away_goals = list(np.array(spi_matches_league['xg2'])[10:])
awayteam = list(np.array(spi_matches_league['team2'])[10:])
result = list(np.array(Eredivisie_match_odds['FTR'])[10:])
home_odds = list(np.array(Eredivisie_match_odds['B365H'])[10:])
away_odds = list(np.array(Eredivisie_match_odds['B365A'])[10:])
draw_odds = list(np.array(Eredivisie_match_odds['B365D'])[10:])
# pythag = xg1^2 / (xg1^2 + xg2^2), computed element-wise
home_sq = np.multiply(home_goals, home_goals)
away_sq = np.multiply(away_goals, away_goals)
pythag = np.divide(home_sq, np.add(home_sq, away_sq))

# Object to handle data manipulation; initialized to create ratings
# Assumes spi_matches_league is the df being passed
class CalcTeamRatings:
  """Rolling xG ratings for one fixture, built from matches before `date`.

  `df` must provide the columns 'date', 'team1', 'team2', 'xg1' and 'xg2'.
  Rows dated on/after `date` and rows containing any missing value are
  discarded. `home_df` keeps the last 30 remaining rows where `home` is
  team1; `away_df` keeps the last 30 remaining rows where `away` is team2.
  """

  def __init__(self, home, away, date, df):
    self.home = home
    self.date = date
    self.away = away
    earlier = df[df['date'] < self.date].sort_values(['date'])
    self.df = earlier.dropna()
    self.home_df = self.df[self.df['team1'] == self.home].tail(30)
    self.away_df = self.df[self.df['team2'] == self.away].tail(30)

  @staticmethod
  def _blend(values):
    """Combine older and most recent form of one xG column.

    20+ rows: average of mean(first 20) and mean(last 10).
    10-19 rows: average of mean(last 10) and mean(all).
    Fewer than 10 rows: plain mean (NaN when there are no rows).
    """
    played = len(values)
    if played >= 20:
      return (values.head(20).mean() + values.tail(10).mean()) / 2
    if played >= 10:
      return (values.tail(10).mean() + values.mean()) / 2
    return values.mean()

  def rollingAvgXG(self):
    """Blended xg1 over the home side's rows in home_df."""
    return self._blend(self.home_df['xg1'])

  def rollingAvgXGA(self):
    """Blended xg2 over the home side's rows in home_df."""
    return self._blend(self.home_df['xg2'])

  def away_rollingAvgXG(self):
    """Blended xg2 over the away side's rows in away_df."""
    return self._blend(self.away_df['xg2'])

  def away_rollingAvgXGA(self):
    """Blended xg1 over the away side's rows in away_df."""
    return self._blend(self.away_df['xg1'])

# League-wide average xG for home (xg1) and away (xg2) sides.
# mean() ignores missing xG values in both the total and the count; the previous
# sum()/len() divided by every row, including rows that have no xG value.
lg_avg_home = spi_matches_league['xg1'].mean()
lg_avg_away = spi_matches_league['xg2'].mean()

# One row per match; match columns and odds columns are paired by row position.
game_data = pd.DataFrame({'Season': np.array(season),
                          'Date': np.array(date),
                          'Home Team': np.array(hometeam),
                          'Away Team': np.array(awayteam),
                          'Home Goals': np.array(home_goals),
                          'Away Goals': np.array(away_goals),
                          'Goal Difference': np.array(home_goals) - np.array(away_goals),
                          'Pythag': np.array(pythag),
                          'Result': np.array(result),
                          'Home Odds': np.array(home_odds),
                          'Away Odds': np.array(away_odds),
                          'Draw Odds': np.array(draw_odds)})

# This slice creates a sample that can be used to generate better atk/def ratings
# Keep in mind that this extra data used to create ratings does not apply to promoted teams
game_data = game_data[game_data['Season'] > 2016].reset_index(drop=True)

# One CalcTeamRatings per remaining match, built from that match's teams and date.
# NOTE(review): the 'Object' column holds the instances themselves and is written to
# the CSV below as-is - confirm that column is wanted in the export.
game_data['Object'] = [
    CalcTeamRatings(home, away, match_date, spi_matches_league)
    for home, away, match_date in zip(game_data['Home Team'], game_data['Away Team'], game_data['Date'])
]
game_data['Home Atk'] = [obj.rollingAvgXG() for obj in game_data['Object']]
game_data['Home Def'] = [obj.rollingAvgXGA() for obj in game_data['Object']]
game_data['Away Atk'] = [obj.away_rollingAvgXG() for obj in game_data['Object']]
game_data['Away Def'] = [obj.away_rollingAvgXGA() for obj in game_data['Object']]

# A rating of exactly 0 is treated as missing so the row is dropped together with
# every other row that contains a NaN.
rating_cols = ['Home Atk', 'Home Def', 'Away Atk', 'Away Def']
game_data[rating_cols] = game_data[rating_cols].replace(to_replace=0, value=np.nan)
game_data = game_data.dropna()
# Any remaining exact 0 anywhere in the frame becomes 0.01.
game_data = game_data.reset_index(drop=True).replace(to_replace=0, value=0.01)

game_data.to_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Processed Data/Eredivisie.csv')
spi_matches_league.to_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Processed Data/spi_matches_Eredivisie.csv')

In [5]:
# Bundesliga

from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from scipy.stats import gamma
from sklearn.neighbors import NearestNeighbors
import seaborn as sns


# Code block will analyze Bundesliga data
# Load the data into variables
spi_global_ranking = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_global_rankings.csv')
spi_matches = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_matches.csv')
spi_matches_latest = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_matches_latest.csv')

Bundesliga_match_odds_2122 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Bundesliga/2021-2022.csv')
Bundesliga_match_odds_2021 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Bundesliga/2020-2021.csv')
Bundesliga_match_odds_1920 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Bundesliga/2019-2020.csv')
Bundesliga_match_odds_1819 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Bundesliga/2018-2019.csv')
Bundesliga_match_odds_1718 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Bundesliga/2017-2018.csv')
Bundesliga_match_odds_1617 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/Bundesliga/2016-2017.csv')

# This section stores the data I want to use regarding betting markets in a new dataframe
keep_cols = ['HomeTeam', 'AwayTeam', 'B365H', 'B365A', 'B365D', 'FTR', 'HTR']
# Each season is sorted by home/away team on its own, then the seasons are stacked oldest first.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; like append,
# it keeps every season's original index labels.
Bundesliga_match_odds = pd.concat([
    season_odds[keep_cols].sort_values(['HomeTeam', 'AwayTeam'])
    for season_odds in (Bundesliga_match_odds_1617, Bundesliga_match_odds_1718, Bundesliga_match_odds_1819,
                        Bundesliga_match_odds_1920, Bundesliga_match_odds_2021, Bundesliga_match_odds_2122)
])

# This section converts the data into usable forms
# Converting between decimal odds and implied probability is just y = 1/x
for odds_col in ('B365A', 'B365D', 'B365H'):
    Bundesliga_match_odds[odds_col] = 1 / Bundesliga_match_odds[odds_col]

# This changes any necessary team names so they are sorted exactly the same in both dataframes
spi_matches_league = spi_matches[spi_matches['league'] == 'German Bundesliga']
# One whole-frame replace with an {old: new} mapping; only cells equal to a key are changed.
# ('VfL Bochum' was previously replaced twice; it only needs to appear once.)
spi_matches_league = spi_matches_league.replace({
    'FC Augsburg': 'Augsburg',
    'Borussia Dortmund': 'Dortmund',
    'Borussia Monchengladbach': "M'gladbach",
    'Bayer Leverkusen': 'Leverkusen',
    'SpVgg Greuther Fürth': 'Greuther Fürth',
    'SV Darmstadt 98': 'Darmstadt',
    'FC Ingolstadt 04': 'Ingolstadt',
    'SC Freiburg': 'Freiburg',
    'Arminia Bielefeld': 'Bielefeld',
    'SC Paderborn': 'Paderborn',
    '1. FC Union Berlin': 'Union Berlin',
    'VfL Bochum': 'Bochum',
    '1. FC Nürnberg': 'Nürnberg',
    'FC Cologne': 'FC Koln',
    'Fortuna Düsseldorf': 'Fortuna Dusseldorf',
    'VfB Stuttgart': 'Stuttgart',
    'VfL Wolfsburg': 'Wolfsburg',
    'TSG Hoffenheim': 'Hoffenheim',
}).sort_values(['season', 'team1', 'team2'])

# Arrays to build the game_data DataFrame
# Every column is converted once and the first 10 rows are skipped.
# NOTE(review): the reason for skipping 10 rows is not visible in this cell - confirm.
# NOTE(review): the match columns and the odds columns are combined purely by row position
# when game_data is built, so both frames must have the same length and ordering - confirm.
season = list(np.array(spi_matches_league['season'])[10:])
date = list(np.array(spi_matches_league['date'])[10:])
hometeam = list(np.array(spi_matches_league['team1'])[10:])
home_goals = list(np.array(spi_matches_league['xg1'])[10:])
away_goals = list(np.array(spi_matches_league['xg2'])[10:])
awayteam = list(np.array(spi_matches_league['team2'])[10:])
result = list(np.array(Bundesliga_match_odds['FTR'])[10:])
home_odds = list(np.array(Bundesliga_match_odds['B365H'])[10:])
away_odds = list(np.array(Bundesliga_match_odds['B365A'])[10:])
draw_odds = list(np.array(Bundesliga_match_odds['B365D'])[10:])
# pythag = xg1^2 / (xg1^2 + xg2^2), computed element-wise
home_sq = np.multiply(home_goals, home_goals)
away_sq = np.multiply(away_goals, away_goals)
pythag = np.divide(home_sq, np.add(home_sq, away_sq))

# Object to handle data manipulation; initialized to create ratings
# Assumes spi_matches_league is the df being passed
class CalcTeamRatings:
  """Rolling xG ratings for one fixture, built from matches before `date`.

  `df` must provide the columns 'date', 'team1', 'team2', 'xg1' and 'xg2'.
  Rows dated on/after `date` and rows containing any missing value are
  discarded. `home_df` keeps the last 30 remaining rows where `home` is
  team1; `away_df` keeps the last 30 remaining rows where `away` is team2.
  """

  def __init__(self, home, away, date, df):
    self.home = home
    self.date = date
    self.away = away
    earlier = df[df['date'] < self.date].sort_values(['date'])
    self.df = earlier.dropna()
    self.home_df = self.df[self.df['team1'] == self.home].tail(30)
    self.away_df = self.df[self.df['team2'] == self.away].tail(30)

  @staticmethod
  def _blend(values):
    """Combine older and most recent form of one xG column.

    20+ rows: average of mean(first 20) and mean(last 10).
    10-19 rows: average of mean(last 10) and mean(all).
    Fewer than 10 rows: plain mean (NaN when there are no rows).
    """
    played = len(values)
    if played >= 20:
      return (values.head(20).mean() + values.tail(10).mean()) / 2
    if played >= 10:
      return (values.tail(10).mean() + values.mean()) / 2
    return values.mean()

  def rollingAvgXG(self):
    """Blended xg1 over the home side's rows in home_df."""
    return self._blend(self.home_df['xg1'])

  def rollingAvgXGA(self):
    """Blended xg2 over the home side's rows in home_df."""
    return self._blend(self.home_df['xg2'])

  def away_rollingAvgXG(self):
    """Blended xg2 over the away side's rows in away_df."""
    return self._blend(self.away_df['xg2'])

  def away_rollingAvgXGA(self):
    """Blended xg1 over the away side's rows in away_df."""
    return self._blend(self.away_df['xg1'])

# League-wide average xG for home (xg1) and away (xg2) sides.
# mean() ignores missing xG values in both the total and the count; the previous
# sum()/len() divided by every row, including rows that have no xG value.
lg_avg_home = spi_matches_league['xg1'].mean()
lg_avg_away = spi_matches_league['xg2'].mean()

# One row per match; match columns and odds columns are paired by row position.
game_data = pd.DataFrame({'Season': np.array(season),
                          'Date': np.array(date),
                          'Home Team': np.array(hometeam),
                          'Away Team': np.array(awayteam),
                          'Home Goals': np.array(home_goals),
                          'Away Goals': np.array(away_goals),
                          'Goal Difference': np.array(home_goals) - np.array(away_goals),
                          'Pythag': np.array(pythag),
                          'Result': np.array(result),
                          'Home Odds': np.array(home_odds),
                          'Away Odds': np.array(away_odds),
                          'Draw Odds': np.array(draw_odds)})

# This slice creates a sample that can be used to generate better atk/def ratings
# Keep in mind that this extra data used to create ratings does not apply to promoted teams
game_data = game_data[game_data['Season'] > 2016].reset_index(drop=True)

# One CalcTeamRatings per remaining match, built from that match's teams and date.
# NOTE(review): the 'Object' column holds the instances themselves and is written to
# the CSV below as-is - confirm that column is wanted in the export.
game_data['Object'] = [
    CalcTeamRatings(home, away, match_date, spi_matches_league)
    for home, away, match_date in zip(game_data['Home Team'], game_data['Away Team'], game_data['Date'])
]
game_data['Home Atk'] = [obj.rollingAvgXG() for obj in game_data['Object']]
game_data['Home Def'] = [obj.rollingAvgXGA() for obj in game_data['Object']]
game_data['Away Atk'] = [obj.away_rollingAvgXG() for obj in game_data['Object']]
game_data['Away Def'] = [obj.away_rollingAvgXGA() for obj in game_data['Object']]

# A rating of exactly 0 is treated as missing so the row is dropped together with
# every other row that contains a NaN.
rating_cols = ['Home Atk', 'Home Def', 'Away Atk', 'Away Def']
game_data[rating_cols] = game_data[rating_cols].replace(to_replace=0, value=np.nan)
game_data = game_data.dropna()
# Any remaining exact 0 anywhere in the frame becomes 0.01.
game_data = game_data.reset_index(drop=True).replace(to_replace=0, value=0.01)

game_data.to_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Processed Data/Bundesliga.csv')
spi_matches_league.to_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Processed Data/spi_matches_Bundesliga.csv')

In [6]:
# La Liga

from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from scipy.stats import gamma
from sklearn.neighbors import NearestNeighbors
import seaborn as sns


# Code block will analyze La Liga data
# Load the data into variables
spi_global_ranking = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_global_rankings.csv')
spi_matches = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_matches.csv')
spi_matches_latest = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/538 SPI Data/spi_matches_latest.csv')

La_Liga_match_odds_2122 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/La Liga/2021-2022.csv')
La_Liga_match_odds_2021 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/La Liga/2020-2021.csv')
La_Liga_match_odds_1920 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/La Liga/2019-2020.csv')
La_Liga_match_odds_1819 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/La Liga/2018-2019.csv')
La_Liga_match_odds_1718 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/La Liga/2017-2018.csv')
La_Liga_match_odds_1617 = pd.read_csv('/content/drive/My Drive/Sports Data Analysis/Soccer Data/Betting Odds/La Liga/2016-2017.csv')

# This section stores the data I want to use regarding betting markets in a new dataframe
keep_cols = ['HomeTeam', 'AwayTeam', 'B365H', 'B365A', 'B365D', 'FTR', 'HTR']
# Each season is sorted by home/away team on its own, then the seasons are stacked oldest first.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; like append,
# it keeps every season's original index labels.
La_Liga_match_odds = pd.concat([
    season_odds[keep_cols].sort_values(['HomeTeam', 'AwayTeam'])
    for season_odds in (La_Liga_match_odds_1617, La_Liga_match_odds_1718, La_Liga_match_odds_1819,
                        La_Liga_match_odds_1920, La_Liga_match_odds_2021, La_Liga_match_odds_2122)
])
# The rename is applied after sorting, so the row order still reflects the 'Espanol' spelling
La_Liga_match_odds = La_Liga_match_odds.replace(to_replace='Espanol', value='Espanyol')

# This section converts the data into usable forms
# Converting between decimal odds and implied probability is just y = 1/x
for odds_col in ('B365A', 'B365D', 'B365H'):
    La_Liga_match_odds[odds_col] = 1 / La_Liga_match_odds[odds_col]

# This changes the team names so they are the same in both dataframes
# This ensures they are sorted exactly the same
spi_matches_league = spi_matches[spi_matches['league'] == 'Spanish Primera Division']
# One whole-frame replace with an {old: new} mapping; only cells equal to a key are changed
spi_matches_league = spi_matches_league.replace({
    'Athletic Bilbao': 'Ath Bilbao',
    'Atletico Madrid': 'Ath Madrid',
    'Deportivo La Coruña': 'La Coruna',
    'Real Betis': 'Betis',
    'Girona FC': 'Girona',
    'Sevilla FC': 'Sevilla',
    'Rayo Vallecano': 'Vallecano',
    'Real Valladolid': 'Valladolid',
    'SD Huesca': 'Huesca',
    'Sporting Gijón': 'Sp Gijon',
    'Real Sociedad': 'Sociedad',
    'Celta Vigo': 'Celta',
}).sort_values(['season', 'team1', 'team2'])

# Arrays to build the game_data DataFrame
# Every column is converted once and the first 10 rows are skipped.
# NOTE(review): the reason for skipping 10 rows is not visible in this cell - confirm.
# NOTE(review): the match columns and the odds columns appear to be paired by row position
# downstream, so both frames would need the same length and ordering - confirm.
season = list(np.array(spi_matches_league['season'])[10:])
date = list(np.array(spi_matches_league['date'])[10:])
hometeam = list(np.array(spi_matches_league['team1'])[10:])
home_goals = list(np.array(spi_matches_league['xg1'])[10:])
away_goals = list(np.array(spi_matches_league['xg2'])[10:])
awayteam = list(np.array(spi_matches_league['team2'])[10:])
result = list(np.array(La_Liga_match_odds['FTR'])[10:])
home_odds = list(np.array(La_Liga_match_odds['B365H'])[10:])
away_odds = list(np.array(La_Liga_match_odds['B365A'])[10:])
draw_odds = list(np.array(La_Liga_match_odds['B365D'])[10:])
# pythag = xg1^2 / (xg1^2 + xg2^2), computed element-wise
home_sq = np.multiply(home_goals, home_goals)
away_sq = np.multiply(away_goals, away_goals)
pythag = np.divide(home_sq, np.add(home_sq, away_sq))

# Object to handle data manipulation; initialized to create ratings
# Assumes spi_matches_league is the df being passed
class CalcTeamRatings:
  """Rolling xG ratings for one fixture, built only from earlier matches.

  The constructor keeps the rows of ``df`` dated strictly before ``date``,
  drops every row containing a NaN, and then stores:

  * ``home_df`` -- the last 30 such matches where ``home`` was ``team1``
  * ``away_df`` -- the last 30 such matches where ``away`` was ``team2``

  so the home side is rated on its home matches only and the away side on
  its away matches only.

  Args:
    home: team name matched against the ``team1`` column.
    away: team name matched against the ``team2`` column.
    date: fixture date; must be comparable with the ``date`` column.
    df: match table with ``date``, ``team1``, ``team2``, ``xg1`` and ``xg2``
      columns.
  """

  def __init__(self, home, away, date, df):
    self.home = home
    self.date = date
    self.away = away
    # Strictly earlier matches only, oldest first, so tail() means "most recent".
    self.df = df[df['date'] < self.date].sort_values(['date'])
    # Drops a row if ANY column is NaN, not just the xG columns.
    self.df = self.df.dropna()
    self.home_df = self.df[self.df['team1'] == self.home]
    self.home_df = self.home_df.tail(30)
    self.away_df = self.df[self.df['team2'] == self.away]
    self.away_df = self.away_df.tail(30)

  @staticmethod
  def _blended_mean(values):
    """Average ``values`` with extra weight on the 10 most recent entries.

    * fewer than 10 entries: plain mean (NaN when there are none)
    * 10 to 19 entries: mean of (last-10 mean, overall mean)
    * 20 or more entries: mean of (first-20 mean, last-10 mean); the two
      windows overlap unless there are exactly 30 entries
    """
    count = len(values)
    if count < 10:
      return values.mean()
    recent = values.tail(10).mean()
    if count < 20:
      return (recent + values.mean()) / 2
    return (values.head(20).mean() + recent) / 2

  def rollingAvgXG(self):
    """Blended ``xg1`` over the home side's home matches."""
    return self._blended_mean(self.home_df['xg1'])

  def rollingAvgXGA(self):
    """Blended ``xg2`` (the opponents' xG) over the home side's home matches."""
    return self._blended_mean(self.home_df['xg2'])

  def away_rollingAvgXG(self):
    """Blended ``xg2`` over the away side's away matches."""
    return self._blended_mean(self.away_df['xg2'])

  def away_rollingAvgXGA(self):
    """Blended ``xg1`` (the opponents' xG) over the away side's away matches."""
    return self._blended_mean(self.away_df['xg1'])

# League-wide average xG per match for team1 (home) and team2 (away).
# Series.mean() ignores NaN in both the total and the match count. The previous
# sum()/len() form skipped NaN in the total but still counted those rows in the
# denominator, so any row without xG dragged the average down.
lg_avg_home = spi_matches_league['xg1'].mean()
lg_avg_away = spi_matches_league['xg2'].mean()

# Assemble the per-match table from the positional lists built above; columns
# are created in the order listed here.
game_data = pd.DataFrame({
    column: np.array(values)
    for column, values in (
        ('Season', season),
        ('Date', date),
        ('Home Team', hometeam),
        ('Away Team', awayteam),
        ('Home Goals', home_goals),
        ('Away Goals', away_goals),
        # xG margin from the home side's point of view
        ('Goal Difference', np.array(home_goals) - np.array(away_goals)),
        ('Pythag', pythag),
        ('Result', result),
        ('Home Odds', home_odds),
        ('Away Odds', away_odds),
        ('Draw Odds', draw_odds),
    )
})

# Keep only seasons after 2016 as rows of game_data. The earlier matches are
# not discarded: they remain in spi_matches_league and so can serve as history
# when the atk/def ratings are computed.
# Keep in mind that this extra history does not help promoted teams.
game_data = game_data.loc[game_data['Season'].gt(2016)].reset_index(drop=True)

# One CalcTeamRatings helper per fixture, built from the league's match history.
# Rows are walked positionally with zip(), so this does not depend on the
# frame having a 0..n-1 index.
game_data['Object'] = [
    CalcTeamRatings(home_team, away_team, match_date, spi_matches_league)
    for home_team, away_team, match_date in zip(
        game_data['Home Team'], game_data['Away Team'], game_data['Date']
    )
]
# Attack = blended xG for, defence = blended xG against, for each side.
game_data['Home Atk'] = [ratings.rollingAvgXG() for ratings in game_data['Object']]
game_data['Home Def'] = [ratings.rollingAvgXGA() for ratings in game_data['Object']]
game_data['Away Atk'] = [ratings.away_rollingAvgXG() for ratings in game_data['Object']]
game_data['Away Def'] = [ratings.away_rollingAvgXGA() for ratings in game_data['Object']]

# A rating of exactly 0 is turned into NaN so the dropna() below removes the
# fixture together with any other row that has a missing value.
for rating_column in ('Home Atk', 'Home Def', 'Away Atk', 'Away Def'):
  game_data[rating_column] = game_data[rating_column].replace(to_replace=0, value=np.nan)

# After dropping incomplete rows and renumbering, every remaining 0 becomes
# 0.01. This applies to the whole frame, not only the rating columns (e.g. a
# 0 in 'Goal Difference' is changed as well).
game_data = game_data.dropna().reset_index(drop=True).replace(to_replace=0, value=0.01)

# Persist the processed fixture table and the renamed/sorted SPI match table.
# NOTE(review): game_data still carries the 'Object' column at this point, so
# the CalcTeamRatings instances are written out as their string representation.
game_data.to_csv(path_or_buf='/content/drive/My Drive/Sports Data Analysis/Soccer Data/Processed Data/La Liga.csv')
spi_matches_league.to_csv(path_or_buf='/content/drive/My Drive/Sports Data Analysis/Soccer Data/Processed Data/spi_matches_La_Liga.csv')