![alt text](https://trello-attachments.s3.amazonaws.com/56b4dc9a5618cc0446578ec6/5c6ffc4673fc7e8ac0a66708/c5f7d8d143f69ffd6683d6ad5b72c58f/Machine-Learning-e-Ciência-de-dados-nas-apostas-esportivas.png)

## Importando bibliotecas

In [87]:
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import requests
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

warnings.filterwarnings('ignore')
%matplotlib inline
matplotlib.style.use('default')
plt.rcParams['figure.figsize'] = [6, 4]

## Manipulando dataset

In [88]:
team_df = pd.read_csv('dataset_with_date.csv', sep=';').drop(['Unnamed: 0'], axis=1).sample(frac=1).reset_index(drop=True)

In [89]:
display(team_df.head())

Unnamed: 0,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,league,Date
0,Bradford,Oldham,A,2.7,3.3,2.88,England - League 1,2014-04-05
1,Schalke 04,Bielefeld,D,1.3,5.25,10.0,Germany - Bundesliga 1,2008-10-25
2,Bournemouth,Leeds,A,4.33,3.4,1.72,England - League 1,2007-11-06
3,Millwall,Southend,A,2.38,3.4,3.2,England - League 1,2015-09-19
4,Shamrock Rovers,Finn Harps,H,1.46,4.07,7.06,Ireland - Premier Division,2011-03-31


In [90]:
display(team_df.shape)

(136660, 8)

In [91]:
team_df = team_df[team_df.league == 'Spain - La Liga Primera Division'].sample(frac=1).reset_index(drop=True)

In [92]:
team_df = team_df.loc[:500,:]
team_df

Unnamed: 0,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,league,Date
0,Almeria,Espanol,A,2.80,3.30,2.50,Spain - La Liga Primera Division,2009-05-23
1,Betis,Sevilla,A,3.80,3.40,2.05,Spain - La Liga Primera Division,2017-02-25
2,Villarreal,Osasuna,A,1.57,3.75,6.00,Spain - La Liga Primera Division,2010-01-31
3,Granada,Sociedad,A,2.40,3.10,3.20,Spain - La Liga Primera Division,2015-09-22
4,Villarreal,Vallecano,H,1.60,4.20,5.50,Spain - La Liga Primera Division,2013-03-17
...,...,...,...,...,...,...,...,...
496,Villarreal,Elche,H,1.36,5.00,8.50,Spain - La Liga Primera Division,2015-05-10
497,Barcelona,Malaga,H,1.17,7.00,15.00,Spain - La Liga Primera Division,2010-02-27
498,Getafe,Levante,H,1.83,3.40,4.50,Spain - La Liga Primera Division,2013-11-29
499,Osasuna,Celta,D,2.90,3.10,2.60,Spain - La Liga Primera Division,2016-09-18


In [93]:
team_df.Date.unique().shape

(412,)

## Usando a API para pegar o Elo Score

In [94]:
from io import StringIO
list_df = []
for data in team_df.Date.unique():
    elo_csv = requests.get(f"http://api.clubelo.com/{data}", allow_redirects=True)
    elo_csv = str(elo_csv.content,'utf-8')
    elo_csv = StringIO(elo_csv) 
    elo_df = pd.read_csv(elo_csv)
    list_df.append(elo_df)

In [95]:
elo_df_final = pd.concat(list_df)

In [96]:
elo_df_final.shape

(254078, 7)

In [97]:
elo_df_final.head()

Unnamed: 0,Rank,Club,Country,Level,Elo,From,To
0,1,Man United,ENG,1,2008.739502,2009-05-17,2009-05-24
1,2,Liverpool,ENG,1,1977.893555,2009-05-18,2009-05-24
2,3,Barcelona,ESP,1,1975.755371,2009-05-18,2009-05-23
3,4,Chelsea,ENG,1,1974.973999,2009-05-18,2009-05-23
4,5,Arsenal,ENG,1,1892.493164,2009-05-17,2009-05-24


In [98]:
elo_df_final.drop_duplicates(inplace=True)

In [99]:
elo_df_final.reset_index(drop=True)

Unnamed: 0,Rank,Club,Country,Level,Elo,From,To
0,1,Man United,ENG,1,2008.739502,2009-05-17,2009-05-24
1,2,Liverpool,ENG,1,1977.893555,2009-05-18,2009-05-24
2,3,Barcelona,ESP,1,1975.755371,2009-05-18,2009-05-23
3,4,Chelsea,ENG,1,1974.973999,2009-05-18,2009-05-23
4,5,Arsenal,ENG,1,1892.493164,2009-05-17,2009-05-24
...,...,...,...,...,...,...,...
146244,,Hapoel Haifa,ISR,1,1222.329346,2014-12-07,2014-12-14
146245,,Ross County,SCO,1,1193.589966,2014-12-12,2014-12-13
146246,,St Mirren,SCO,1,1174.492188,2014-12-12,2014-12-14
146247,,Marek Dupnitza,BUL,1,1086.119141,2014-12-10,2014-12-13


In [100]:
display(team_df.head())

Unnamed: 0,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,league,Date
0,Almeria,Espanol,A,2.8,3.3,2.5,Spain - La Liga Primera Division,2009-05-23
1,Betis,Sevilla,A,3.8,3.4,2.05,Spain - La Liga Primera Division,2017-02-25
2,Villarreal,Osasuna,A,1.57,3.75,6.0,Spain - La Liga Primera Division,2010-01-31
3,Granada,Sociedad,A,2.4,3.1,3.2,Spain - La Liga Primera Division,2015-09-22
4,Villarreal,Vallecano,H,1.6,4.2,5.5,Spain - La Liga Primera Division,2013-03-17


In [101]:
elo_df_esp = elo_df_final[elo_df_final['Country'] == 'ESP'].reset_index(drop=True)

In [102]:
elo_df_esp

Unnamed: 0,Rank,Club,Country,Level,Elo,From,To
0,3,Barcelona,ESP,1,1975.755371,2009-05-18,2009-05-23
1,6,Real Madrid,ESP,1,1876.529053,2009-05-17,2009-05-24
2,11,Sevilla,ESP,1,1820.997559,2009-05-21,2009-05-23
3,16,Atletico,ESP,1,1811.556030,2009-05-18,2009-05-23
4,18,Villarreal,ESP,1,1786.427124,2009-05-17,2009-05-23
...,...,...,...,...,...,...,...
14300,,Sabadell,ESP,2,1502.200073,2014-12-12,2014-12-14
14301,,Tenerife,ESP,2,1501.845459,2014-12-12,2014-12-14
14302,,Leganes,ESP,2,1496.792236,2014-12-12,2014-12-14
14303,,Albacete,ESP,2,1485.289917,2014-12-12,2014-12-14


In [103]:
copy_team_df = team_df.copy()
copy_team_df = copy_team_df.reset_index(drop=True)

In [104]:
copy_team_df

Unnamed: 0,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,league,Date
0,Almeria,Espanol,A,2.80,3.30,2.50,Spain - La Liga Primera Division,2009-05-23
1,Betis,Sevilla,A,3.80,3.40,2.05,Spain - La Liga Primera Division,2017-02-25
2,Villarreal,Osasuna,A,1.57,3.75,6.00,Spain - La Liga Primera Division,2010-01-31
3,Granada,Sociedad,A,2.40,3.10,3.20,Spain - La Liga Primera Division,2015-09-22
4,Villarreal,Vallecano,H,1.60,4.20,5.50,Spain - La Liga Primera Division,2013-03-17
...,...,...,...,...,...,...,...,...
496,Villarreal,Elche,H,1.36,5.00,8.50,Spain - La Liga Primera Division,2015-05-10
497,Barcelona,Malaga,H,1.17,7.00,15.00,Spain - La Liga Primera Division,2010-02-27
498,Getafe,Levante,H,1.83,3.40,4.50,Spain - La Liga Primera Division,2013-11-29
499,Osasuna,Celta,D,2.90,3.10,2.60,Spain - La Liga Primera Division,2016-09-18


In [105]:
copy_team_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 8 columns):
HomeTeam    501 non-null object
AwayTeam    501 non-null object
FTR         501 non-null object
B365H       501 non-null float64
B365D       501 non-null float64
B365A       501 non-null float64
league      501 non-null object
Date        501 non-null object
dtypes: float64(3), object(5)
memory usage: 31.4+ KB


In [106]:
copy_team_df.dropna(inplace=True)

In [107]:
lista_times = ['Atlético Minas Gerais', 'Cruzeiro', 'Palmeiras', 'Atlético Goiás']
process.extract('Atlético-MG', lista_times)

[('Atlético Minas Gerais', 86),
 ('Atlético Goiás', 82),
 ('Palmeiras', 32),
 ('Cruzeiro', 22)]

## Criando colunas com Elo Score do Mandante e do Visitante

In [108]:
for key, row in copy_team_df.iterrows():
    if row['HomeTeam'] == 'Ath Madrid':
        row['HomeTeam'] = 'Atletico'
    if row['HomeTeam'] == 'La Coruna':
        copy_team_df.drop(key,inplace=True)
    choices = list(elo_df_esp.Club.unique())
    team = process.extract(row['HomeTeam'], choices, limit=2)[0][0]
    team_elo = elo_df_esp[(elo_df_esp['Club'] == team) & ((elo_df_esp['To'] == row['Date']) | (elo_df_esp['From'] == row['Date']))]['Elo']
    if not team_elo.empty:
        copy_team_df.loc[key,'home_elo'] = team_elo.values[0]

In [109]:
copy_team_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 492 entries, 0 to 291
Data columns (total 9 columns):
HomeTeam    488 non-null object
AwayTeam    488 non-null object
FTR         488 non-null object
B365H       488 non-null float64
B365D       488 non-null float64
B365A       488 non-null float64
league      488 non-null object
Date        488 non-null object
home_elo    478 non-null float64
dtypes: float64(4), object(5)
memory usage: 58.4+ KB


In [110]:
copy_team_df.dropna(inplace=True)

In [111]:
for key, row in copy_team_df.iterrows():
    if row['AwayTeam'] == 'Ath Madrid':
        row['AwayTeam'] = 'Atletico'
    if row['AwayTeam'] == 'La Coruna':
        copy_team_df.drop(key,inplace=True)
    choices = list(elo_df_esp.Club.unique())
    team = process.extract(row['AwayTeam'], choices, limit=2)[0][0]
    team_elo = elo_df_esp[(elo_df_esp['Club'] == team) & ((elo_df_esp['To'] == row['Date']) | (elo_df_esp['From'] == row['Date']))]['Elo']
    if not team_elo.empty:
        copy_team_df.loc[key,'away_elo'] = team_elo.values[0]

In [112]:
copy_team_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 468 entries, 0 to 398
Data columns (total 10 columns):
HomeTeam    461 non-null object
AwayTeam    461 non-null object
FTR         461 non-null object
B365H       461 non-null float64
B365D       461 non-null float64
B365A       461 non-null float64
league      461 non-null object
Date        461 non-null object
home_elo    461 non-null float64
away_elo    463 non-null float64
dtypes: float64(5), object(5)
memory usage: 60.2+ KB


In [113]:
copy_team_df.dropna(inplace=True)

In [114]:
copy_team_df = copy_team_df.reset_index(drop=True)

In [115]:
copy_team_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456 entries, 0 to 455
Data columns (total 10 columns):
HomeTeam    456 non-null object
AwayTeam    456 non-null object
FTR         456 non-null object
B365H       456 non-null float64
B365D       456 non-null float64
B365A       456 non-null float64
league      456 non-null object
Date        456 non-null object
home_elo    456 non-null float64
away_elo    456 non-null float64
dtypes: float64(5), object(5)
memory usage: 35.8+ KB


In [116]:
copy_team_df

Unnamed: 0,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,league,Date,home_elo,away_elo
0,Almeria,Espanol,A,2.80,3.30,2.50,Spain - La Liga Primera Division,2009-05-23,1710.323853,1706.179688
1,Betis,Sevilla,A,3.80,3.40,2.05,Spain - La Liga Primera Division,2017-02-25,1652.570068,1844.111328
2,Villarreal,Osasuna,A,1.57,3.75,6.00,Spain - La Liga Primera Division,2010-01-31,1757.755615,1703.363647
3,Granada,Sociedad,A,2.40,3.10,3.20,Spain - La Liga Primera Division,2015-09-22,1648.902710,1724.027222
4,Villarreal,Vallecano,H,1.60,4.20,5.50,Spain - La Liga Primera Division,2013-03-17,1662.804810,1700.998535
...,...,...,...,...,...,...,...,...,...,...
451,Villarreal,Elche,H,1.36,5.00,8.50,Spain - La Liga Primera Division,2015-05-10,1778.408081,1692.216431
452,Barcelona,Malaga,H,1.17,7.00,15.00,Spain - La Liga Primera Division,2010-02-27,1978.289673,1703.936646
453,Getafe,Levante,H,1.83,3.40,4.50,Spain - La Liga Primera Division,2013-11-29,1692.529053,1672.404663
454,Osasuna,Celta,D,2.90,3.10,2.60,Spain - La Liga Primera Division,2016-09-18,1616.232056,1745.238892


In [117]:
copy_team_df.to_csv('dataset_elo_spain_v4.csv', sep=';', index=False)