In [2]:
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np
from scipy.optimize import minimize

import numpy as np
from bs4 import BeautifulSoup

def GenerateDataBase(seasons):
    #df contains all matches from all imported seasons

    df = pd.concat(seasons, axis = 0).reset_index(drop = True)

    #Change Result column to numerical value.
    for index in df.index:
        if df['FTR'][index] == 'D':
            df['FTR'][index] = 1
        elif df['FTR'][index] == 'H':
            df['FTR'][index] = 0
        else:
            df['FTR'][index] = 2

    df['FTR'] = df['FTR'].astype(int)

    #We need the following columns to normalize probability of Win, Draw or Away
    df['B365Tot'] = (1/df['B365H']+1/df['B365D']+1/df['B365A'])
    df['WHTot'] = (1/df['WHH']+1/df['WHD']+1/df['WHA'])

    #Estimate probability of H win, D, or A win.
    df['PH'] = 1/2*(1/df['B365H']*(1/df['B365Tot'])+(1/df['WHH'])*(1/df['WHTot']))
    df['PD'] = 1/2*(1/df['B365D']*(1/df['B365Tot'])+1/df['WHD']*(1/df['WHTot']))
    df['PA'] = 1/2*(1/df['B365A']*(1/df['B365Tot'])+1/df['WHA']*(1/df['WHTot']))

    #Can drop the following columns, as we already have computed probability.
    bet_cols = ['B365H', 'B365D', 'B365A', 'WHH', 'WHD', 'WHA', 'B365Tot', 'WHTot'] #'BWH', 'BWD', 'BWA'
    df.drop(bet_cols, axis = 1, inplace = True)

    #Return df containing 'HomeTeam', 'AwayTeam', 'FTR', 'PH', 'PD', and 'PA'
    return df

def GenerateNewSeason(soup):
    new_season = pd.DataFrame(columns = ['HomeTeam', 'AwayTeam', 'HOdds', 'DOdds', 'AOdds'])

    mydivs = soup.findAll("div", {"class": "ssm-SiteSearchNameMarket gl-Market_General gl-Market_General-topborder gl-Market_General-pwidth50"})
    target_html = []
    for div in mydivs:
        #for subdiv in div.findAll("div", {"class":"ssm-SiteSearchLabelOnlyParticipant gl-Market_General-cn1"}):
        for subdiv in div.findAll("div", {"class":"ssm-SiteSearchLabelOnlyParticipant gl-Market_General-cn1"}):
            for span in subdiv.findAll("span"):
                target_html.append(span)

    new_season['HomeTeam']=[x.text.split(' v ')[0] for x in target_html]
    new_season['AwayTeam']=[x.text.split(' v ')[1] for x in target_html]

    mydivs = soup.findAll("div", {"class": "ssm-SiteSearchOddsOnlyParticipant gl-Participant_General gl-Market_General-cn1"})
    odds = []
    for div in mydivs:
        for subdiv in div.findAll("div", {"class":"ssm-SiteSearchOddsOnlyParticipant_Wrapper"}):
            for span in subdiv.findAll("span"):
                odds.append(span)


    n = int(len(odds)/3)
    hodds = [float(x.text) for x in odds[0:n]]
    dodds = [float(x.text) for x in odds[n:2*n]]
    aodds = [float(x.text) for x in odds[2*n:3*n]]

    new_season['HOdds'] = hodds
    new_season['DOdds'] = dodds
    new_season['AOdds'] = aodds

    #We need the following columns to normalize probability of Win, Draw or Away
    new_season['OddsTot'] = (1/new_season['HOdds']+1/new_season['DOdds']+1/new_season['AOdds'])

    #Estimate probability of H win, D, or A win.
    new_season['PH'] = 1/new_season['HOdds']*(1/new_season['OddsTot'])
    new_season['PD'] = 1/new_season['DOdds']*(1/new_season['OddsTot'])
    new_season['PA'] = 1/new_season['AOdds']*(1/new_season['OddsTot'])

    #drop unnecessary columns
    new_season.drop('OddsTot', axis = 1, inplace = True)

    return new_season

def cleanna(df):
    df.dropna(axis = 0, inplace = True)

def DropNotInTeams(new_season, df):
    new_season = new_season[(new_season['HomeTeam'].isin(df['HomeTeam'].unique())) & (new_season['AwayTeam'].isin(df['HomeTeam'].unique()))]
    return new_season


#Import historical data from Primera Division
s2021 = pd.read_csv('SP1.csv')
s2020 = pd.read_csv('SP2.csv')
s2019 = pd.read_csv('SP3.csv')
s2018 = pd.read_csv('SP5.csv')

#Import historical data from Segunda Division
Segunda2021 = pd.read_csv('Segunda2021.csv')
Segunda2020 = pd.read_csv('Segunda2020.csv')
Segunda2019 = pd.read_csv('Segunda2019.csv')

In [26]:
#Choose desired columns
selected_columns = ['HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A', 'WHH', 'WHD', 'WHA']
seasons1 = [s2021[selected_columns], s2020[selected_columns], s2019[selected_columns], s2018[selected_columns]]
seasons2 = [Segunda2021[selected_columns], Segunda2020[selected_columns], Segunda2019[selected_columns]]

f1 = open("FutureMatches.txt", "r")
f2 = open('FutureMatchesSegunda.txt', 'r')
soup = BeautifulSoup(f1)
soup2 = BeautifulSoup(f2)
f1.close()
f2.close()

#Upcoming Matches in Primera and Segunda Division
#Returns dataframes with 'HomeTeam', 'AwayTeam', 'HOdds', 'DOdds', 'AOdds',  'PH', 'PD', 'PA'
new_primera = GenerateNewSeason(soup)
new_segunda = GenerateNewSeason(soup2)

#Historical dataframes of Primera and Segunda division
df_primera = GenerateDataBase(seasons1)
df_segunda = GenerateDataBase(seasons2)

#Correct wrong names
dictio1 = {'Alavés':'Alaves', 'Real Sociedad':'Sociedad', 'Atlético de Madrid':'Ath Madrid', 'Real Betis': 'Betis', 'Cádiz':'Cadiz', 'Athletic de Bilbao':'Ath Bilbao', 'Celta de Vigo':'Celta'}
dictio2 = {'FC Cartagena': 'Cartagena', 'UD Logronés':'Logrones', 'Málaga C.F.':'Malaga', 'CD Castellón':'Castellon', 'Rayo Vallecano':'Vallecano', 'Racing Santander': 'Santander'}
new_primera['HomeTeam'].replace(dictio1, inplace = True)
new_primera['AwayTeam'].replace(dictio1, inplace = True)
new_segunda['HomeTeam'].replace(dictio2, inplace = True)
new_segunda['AwayTeam'].replace(dictio2, inplace = True)
new_primera=DropNotInTeams(new_primera, df_primera)
new_segunda=DropNotInTeams(new_segunda, df_segunda)

In [35]:
predictiondf = pd.DataFrame(columns = new_primera.columns)
predictiondf[['AdjPH', 'AdjPD', 'AdjPA']] = 0
predictiondf[['Outcome']] = 0
for match in new_primera.index:
    try:
        ht, at = new_primera['HomeTeam'].loc[match], new_primera['AwayTeam'].loc[match] #Store HT and AT
        df = df_primera
        historicdf = df[(df['HomeTeam'] == ht) & (df['AwayTeam'] == at)].drop(['HomeTeam', 'AwayTeam'], axis = 1)
        clf = RandomForestClassifier(n_estimators=50, criterion = 'entropy', max_depth=3, random_state=0)
        clf.fit(historicdf.drop(['FTR'], axis = 1), historicdf[['FTR']])clf.predict_proba(new_primera.drop(['HomeTeam', 'AwayTeam', 'HOdds', 'DOdds', 'AOdds'], axis = 1).loc[match])
        print(probabvect)
        predictiondf['AdjPH'][index] = probabvect[0]
        predictiondf['AdjPD'][index] = probabvect[1]
        predictiondf['AdjPA'][index] = probabvect[2]
        predictiondf[['Outcome']] = clf.predict(new_primera.loc[match])[0]
    except:
        print('Error in index %d' % match)

Error in index 0
Error in index 1
Error in index 2
Error in index 3
Error in index 4
Error in index 5
Error in index 6
Error in index 7
Error in index 8
Error in index 9
Error in index 10
Error in index 11


In [None]:


#Encode team names for each season
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
df_primera[['HomeTeam']] = df_primera[['HomeTeam']].apply(le1.fit_transform)
df_primera[['AwayTeam']] = df_primera[['AwayTeam']].apply(le1.transform)
df_segunda[['HomeTeam']] = df_segunda[['HomeTeam']].apply(le2.fit_transform)
df_segunda[['AwayTeam']] = df_segunda[['AwayTeam']].apply(le2.transform)

new_primera[['HomeTeam', 'AwayTeam']]=new_primera[['HomeTeam', 'AwayTeam']].apply(le1.transform)
new_segunda[['HomeTeam', 'AwayTeam']]=new_segunda[['HomeTeam', 'AwayTeam']].apply(le2.transform)

cleanna(df_primera), cleanna(df_segunda), cleanna(new_primera), cleanna(new_segunda)

#Train each classifier
clf_primera = RandomForestClassifier(n_estimators=50, criterion = 'entropy', max_depth=3, random_state=0)
clf_segunda = RandomForestClassifier(n_estimators=50, criterion = 'entropy', max_depth=3, random_state=0)
clf_primera.fit(df_primera.drop('FTR', axis = 1), df_primera['FTR'])
clf_segunda.fit(df_segunda.drop('FTR', axis = 1), df_segunda['FTR'])

#predict primera
predict_columns = ['HomeTeam', 'AwayTeam', 'PH', 'PD', 'PA']
new_primera[['PH', 'PD', 'PA']] = clf_primera.predict_proba(new_primera[predict_columns])
new_primera['PredictedOutcome'] = clf_primera.predict(new_primera[predict_columns])

#predict segunda
new_segunda[['PH', 'PD', 'PA']] = clf_primera.predict_proba(new_segunda[predict_columns])
new_segunda['PredictedOutcome'] = clf_primera.predict(new_segunda[predict_columns])

#unlabel
tocatcols = ['HomeTeam', 'AwayTeam']
new_primera[tocatcols] = new_primera[tocatcols].apply(le1.inverse_transform)
new_segunda[tocatcols] = new_segunda[tocatcols].apply(le2.inverse_transform)

#concatenate primera and segunda predictions
predn = pd.concat([new_primera, new_segunda], axis = 0).reset_index(drop = True)

#create list of upcoming quiniela matches via zip
hometeam_list = ['Levante', 'Huesca', 'Elche', 'Sevilla', 'Sociedad', 'Ath Bilbao', 'Osasuna', 'Betis', 'Logrones', 'Fuenlabrada', 'Girona', 'Malaga', 'Vallecano', 'Las Palmas']
awayteam_list = ['Granada', 'Real Madrid', 'Villarreal', 'Getafe', 'Cadiz', 'Valencia', 'Eibar', 'Barcelona', 'Sporting', 'Almeria', 'Leganes', 'Zaragoza', 'Tenerife', 'Sabadell']
matches = list(zip(hometeam_list, awayteam_list))

#new df where we store the quiniela matches
quiniela = pd.DataFrame(columns = predn.columns)
for match in matches:
    for index in predn.index:
        h, a = match
        if (predn['HomeTeam'].loc[index] == h) and (predn['AwayTeam'].loc[index] == a):
            quiniela.loc[index] = predn.loc[index]