# Aplicação do Modelo de Predição

> Neste notebook iremos aplicar o modelo de predição nos dados da Copa do Mundo de 2022 com duas abordagens: primeiro fazendo o modelo prever os resultados desde o início e depois com ele prevendo apenas os resultados que ainda estão por vir.

In [3]:
# Importando bibliotecas necessárias
from dadosfera import *
import snowflake.connector
import pandas as pd
import numpy as np
import os
from joblib import dump
import matplotlib.pyplot as plt
from operator import itemgetter
import seaborn as sns

### 1. Recuperar dados do passo anterior

In [2]:
# Load the artifacts handed over by the previous step.
data = dadosfera_utils.get_inputs()

# Everything this notebook needs is stored under the 'dict_df' entry.
df, team_stats_raw, model = itemgetter('df', 'team_stats_raw', 'model')(data['dict_df'])

# Preview the first rows of the feature table.
df.head()

Unnamed: 0,home_team,away_team,target,rank_dif,goals_dif,goals_dif_l5,goals_suf_dif,goals_suf_dif_l5,goals_per_ranking_dif,dif_rank_agst,dif_rank_agst_l5,dif_points_rank,dif_points_rank_l5,is_friendly_0,is_friendly_1
2,Burkina Faso,Mali,0,28.0,0.0,0.0,0.0,0.0,-0.007459,16.0,16.0,-0.007459,-0.007459,0,1
3,Gabon,Ghana,1,16.0,0.0,0.0,0.0,0.0,-0.004183,28.0,28.0,-0.004183,-0.004183,0,1
10,Uganda,Tanzania,0,12.0,2.0,2.0,-2.0,-2.0,0.02663,-12.0,-12.0,0.0375,0.0375,0,1
18,Togo,Zimbabwe,1,47.0,-1.0,-1.0,-1.0,-1.0,-0.009804,-23.0,-23.0,0.002854,0.002854,1,0
23,Cameroon,Finland,1,-22.0,0.0,0.0,0.0,0.0,0.0,-39.0,-39.0,0.002622,0.002622,1,0


Salvar o modelo criado

In [4]:
# Persist the model received from the previous step to 'modelo.joblib'
# (relative path, so it lands in the current working directory).
dump(model, 'modelo.joblib')

['modelo.joblib']

### 2. Simulação da Copa do Mundo 2022 desde o início

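Nesta predição, o time com a maior média de probabilidade de vitória definida pelo classificador será considerado o vencedor da partida! Na fase de grupos, quando as duas previsões de uma mesma partida (cada time na posição de mandante) discordam sobre o favorito, o jogo é considerado empate. Essa simulação foi realizada considerando todos os times na disputa, desde o início.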

In [5]:
# Teams of each group, in the order their standings rows are created.
_GROUP_TEAMS = {
    'A': ('Qatar', 'Ecuador', 'Senegal', 'Netherlands'),
    'B': ('England', 'Iran', 'United States', 'Wales'),
    'C': ('Argentina', 'Saudi Arabia', 'Mexico', 'Poland'),
    'D': ('France', 'Australia', 'Denmark', 'Tunisia'),
    'E': ('Spain', 'Costa Rica', 'Germany', 'Japan'),
    'F': ('Belgium', 'Canada', 'Morocco', 'Croatia'),
    'G': ('Brazil', 'Serbia', 'Switzerland', 'Cameroon'),
    'H': ('Portugal', 'Ghana', 'Uruguay', 'South Korea'),
}

# Standings: group letter -> one row per team, laid out as
# [team name, points, per-match win probabilities].
# Every row gets its own fresh (unshared) probability list.
table = {
    group: [[team, 0, []] for team in teams]
    for group, teams in _GROUP_TEAMS.items()
}

# Group-stage fixtures as (first team, second team), listed per group.
_GROUP_FIXTURES = {
    'A': (
        ('Qatar', 'Ecuador'),
        ('Senegal', 'Netherlands'),
        ('Qatar', 'Senegal'),
        ('Netherlands', 'Ecuador'),
        ('Ecuador', 'Senegal'),
        ('Netherlands', 'Qatar'),
    ),
    'B': (
        ('England', 'Iran'),
        ('United States', 'Wales'),
        ('England', 'United States'),
        ('Wales', 'Iran'),
        ('Iran', 'United States'),
        ('Wales', 'England'),
    ),
    'C': (
        ('Argentina', 'Saudi Arabia'),
        ('Mexico', 'Poland'),
        ('Argentina', 'Mexico'),
        ('Poland', 'Saudi Arabia'),
        ('Saudi Arabia', 'Mexico'),
        ('Poland', 'Argentina'),
    ),
    'D': (
        ('France', 'Australia'),
        ('Denmark', 'Tunisia'),
        ('France', 'Denmark'),
        ('Tunisia', 'Australia'),
        ('Australia', 'Denmark'),
        ('Tunisia', 'France'),
    ),
    'E': (
        ('Spain', 'Costa Rica'),
        ('Germany', 'Japan'),
        ('Spain', 'Germany'),
        ('Japan', 'Costa Rica'),
        ('Costa Rica', 'Germany'),
        ('Japan', 'Spain'),
    ),
    'F': (
        ('Belgium', 'Canada'),
        ('Morocco', 'Croatia'),
        ('Belgium', 'Morocco'),
        ('Croatia', 'Canada'),
        ('Canada', 'Morocco'),
        ('Croatia', 'Belgium'),
    ),
    'G': (
        ('Brazil', 'Serbia'),
        ('Switzerland', 'Cameroon'),
        ('Brazil', 'Switzerland'),
        ('Cameroon', 'Serbia'),
        ('Serbia', 'Switzerland'),
        ('Cameroon', 'Brazil'),
    ),
    'H': (
        ('Portugal', 'South Korea'),
        ('Ghana', 'Portugal'),
        ('Uruguay', 'Portugal'),
        ('South Korea', 'Ghana'),
        ('South Korea', 'Uruguay'),
        ('Ghana', 'Uruguay'),
    ),
}

# Flat fixture list of (group, first team, second team) tuples, keeping the
# fixtures of each group contiguous and in the order listed above.
matches = [
    (group, first_team, second_team)
    for group, fixtures in _GROUP_FIXTURES.items()
    for first_team, second_team in fixtures
]

In [6]:
def find_stats(team_1):
    """Summarise a team's match history into the raw statistics behind the model features.

    Reads the module-level ``team_stats_raw`` frame, keeps the rows whose
    ``team`` column equals ``team_1`` and orders them by ``date``.

    Args:
        team_1: Team name exactly as it appears in ``team_stats_raw["team"]``.

    Returns:
        A list of nine values, in this order:
        the ``rank`` of the most recent row, then for each of the columns
        ``score``, ``suf_score``, ``rank_suf`` and ``points_by_rank`` the mean
        over all rows followed by the mean over the five most recent rows.

    Raises:
        IndexError: If ``team_stats_raw`` has no row for ``team_1``.
    """
    # Filter and sort once; the "last five" window is just the tail of it.
    past_games = team_stats_raw[team_stats_raw["team"] == team_1].sort_values("date")
    if past_games.empty:
        # Same exception type the bare `.values[-1]` would raise, but with a
        # message that names the offending team.
        raise IndexError(f"no rows for team {team_1!r} in team_stats_raw")
    last5 = past_games.tail(5)

    stats = [past_games["rank"].values[-1]]
    for column in ("score", "suf_score", "rank_suf", "points_by_rank"):
        stats.append(past_games[column].mean())
        stats.append(last5[column].mean())
    return stats

In [7]:
def find_features(team_1, team_2):
    """Build the model's feature row for a fixture from two teams' statistics.

    Both arguments are indexable sequences laid out like the list returned by
    ``find_stats``: positions 0-8 are read.

    Returns:
        A 12-item list: the element-wise differences ``team_1 - team_2`` for
        positions 0-4, the difference of the ratios ``[1] / [5]`` of each
        team, the differences for positions 5-8, and the constants ``1, 0``.
    """
    def gap(position):
        # Difference of one statistic, always taken as team_1 minus team_2.
        return team_1[position] - team_2[position]

    features = [gap(position) for position in range(5)]

    # Goals scaled by position 5 of each team's stats, compared between teams.
    features.append((team_1[1] / team_1[5]) - (team_2[1] / team_2[5]))

    features.extend(gap(position) for position in range(5, 9))

    # NOTE(review): presumably the is_friendly_0 / is_friendly_1 dummies seen
    # in `df`, i.e. "not a friendly" -- confirm against the training columns.
    features.extend([1, 0])
    return features

In [8]:
# Group-stage simulation.
#
# Every fixture is scored twice by the model, once with each team in the first
# position of the feature row, and the two results are averaged per team.
# Results are printed as they are produced; the top two teams of every group
# are collected in `advanced_group` as [first place, second place].
advanced_group = []
last_group = ""

# Start from clean standings so the cell can be re-run safely.
for group_rows in table.values():
    for row in group_rows:
        row[1] = 0
        row[2] = []


def _add_points(group, team_names, points):
    """Add `points` to each standings row of `group` whose team is in `team_names`."""
    for row in table[group]:
        if row[0] in team_names:
            row[1] += points


def _close_group(group):
    """Print the final standings of `group` and record its top two teams."""
    print("\n")
    print("Do grupo %s avançam: "%(group))

    # Tie-break criterion: replace each team's list of per-match
    # probabilities by its mean so rows sort by (points, mean probability).
    for row in table[group]:
        row[2] = np.mean(row[2])

    final_table = sorted(table[group], key=itemgetter(1, 2), reverse=True)
    advanced_group.append([final_table[0][0], final_table[1][0]])
    for row in final_table:
        print("%s -------- %d"%(row[0], row[1]))


for teams in matches:
    group, home, away = teams

    team_1 = find_stats(home)
    team_2 = find_stats(away)

    probs_g1 = model.predict_proba([find_features(team_1, team_2)])
    probs_g2 = model.predict_proba([find_features(team_2, team_1)])

    # Column 0 is read as the probability of the team listed first in the
    # feature row, column 1 as the probability of the other team.
    team_1_prob_g1 = probs_g1[0][0]
    team_1_prob_g2 = probs_g2[0][1]
    team_2_prob_g1 = probs_g1[0][1]
    team_2_prob_g2 = probs_g2[0][0]

    team_1_prob = (team_1_prob_g1 + team_1_prob_g2)/2
    team_2_prob = (team_2_prob_g2 + team_2_prob_g1)/2

    # The two orientations of the fixture disagree on who is the favourite.
    split_verdict = (
        (team_1_prob_g1 > team_2_prob_g1 and team_2_prob_g2 > team_1_prob_g2)
        or (team_1_prob_g1 < team_2_prob_g1 and team_2_prob_g2 < team_1_prob_g2)
    )

    winner = None
    if not split_verdict:
        if team_1_prob > team_2_prob:
            winner, winner_proba = home, team_1_prob
        elif team_2_prob > team_1_prob:
            winner, winner_proba = away, team_2_prob

    # A split verdict is a draw. So is an exact tie of the averages, which
    # previously matched no branch and left `winner` unset or stale.
    draw = winner is None
    if draw:
        _add_points(group, (home, away), 1)
    else:
        _add_points(group, (winner,), 3)

    # Record the averaged probabilities for the tie-break criterion.
    for row in table[group]:
        if row[0] == home:
            row[2].append(team_1_prob)
        if row[0] == away:
            row[2].append(team_2_prob)

    # First fixture of a new group: close the previous one, print the header.
    if last_group != group:
        if last_group != "":
            _close_group(last_group)
        print("\n")
        print("-"*10+" Simulando o Grupo %s "%(group)+"-"*10)

    if draw:
        print("Grupo %s - %s vs. %s: Empate"%(group, home, away))
    else:
        print("Grupo %s - %s vs. %s: Vencedor %s com %.2f de probabilidade"%(group, home, away, winner, winner_proba))
    last_group = group

# The last group has no following fixture to trigger its summary.
if last_group != "":
    _close_group(last_group)



---------- Simulando o Grupo A ----------
Grupo A - Qatar vs. Ecuador: Vencedor Ecuador com 0.74 de probabilidade
Grupo A - Senegal vs. Netherlands: Vencedor Netherlands com 0.74 de probabilidade
Grupo A - Qatar vs. Senegal: Vencedor Senegal com 0.60 de probabilidade
Grupo A - Netherlands vs. Ecuador: Vencedor Netherlands com 0.72 de probabilidade
Grupo A - Ecuador vs. Senegal: Vencedor Ecuador com 0.62 de probabilidade
Grupo A - Netherlands vs. Qatar: Vencedor Netherlands com 0.88 de probabilidade


Do grupo A avançam: 
Netherlands -------- 9
Ecuador -------- 6
Senegal -------- 3
Qatar -------- 0


---------- Simulando o Grupo B ----------
Grupo B - England vs. Iran: Vencedor England com 0.81 de probabilidade
Grupo B - United States vs. Wales: Vencedor United States com 0.63 de probabilidade
Grupo B - England vs. United States: Vencedor England com 0.60 de probabilidade
Grupo B - Wales vs. Iran: Empate
Grupo B - Iran vs. United States: Vencedor United States com 0.68 de probabilidad

In [9]:
# Alias (same list object) for the group-stage qualifiers; the knockout
# simulation reads them through the name `advanced`.
advanced = advanced_group

In [10]:
# Knockout-stage simulation.
#
# `playoffs` maps each round label to its list of games; after the round is
# simulated every game is [home, away, [home probability, away probability]].
# The labels carry their own leading characters because they are appended
# directly to "Simulação da" when the round header is printed.
playoffs = {"s Oitavas de Final": [], "s Quartas de Final": [], " Semi-Final": [], " Final": []}


def _knockout_probs(home, away):
    """Return the averaged (home, away) probabilities for one knockout game.

    The model scores the fixture in both orientations; each team's value is
    the mean of its probability in the two feature rows.
    """
    home_stats = find_stats(home)
    away_stats = find_stats(away)

    probs_g1 = model.predict_proba([find_features(home_stats, away_stats)])
    probs_g2 = model.predict_proba([find_features(away_stats, home_stats)])

    home_prob = (probs_g1[0][0] + probs_g2[0][1])/2
    away_prob = (probs_g2[0][0] + probs_g1[0][1])/2
    return home_prob, away_prob


# Seed the first round. Walking the groups twice and alternating between the
# first- and second-placed team yields the pairs
# 1A-2B, 1C-2D, 1E-2F, 1G-2H and then 2A-1B, 2C-1D, 2E-1F, 2G-1H.
n_groups = len(advanced)
control = []
for slot in range(2 * n_groups):
    qualifiers = advanced[slot % n_groups]
    # First pass: even slots take the group winner; second pass: odd slots do.
    takes_first_place = (slot % 2 == 0) == (slot < n_groups)
    control.append(qualifiers[0] if takes_first_place else qualifiers[1])

next_rounds = []

for p in playoffs:
    # The first round is fed by the group stage, every later round by the
    # winners of the round before it; consecutive entrants face each other.
    entrants = control if p == "s Oitavas de Final" else next_rounds
    playoffs[p] = [[entrants[c], entrants[c+1]] for c in range(0, len(entrants)-1, 2)]
    next_rounds = []

    for game_index, game in enumerate(playoffs[p]):
        home, away = game
        team_1_prob, team_2_prob = _knockout_probs(home, away)

        # Round header, printed once before the first result of the round.
        if game_index == 0:
            print("-"*10)
            print("Simulação da%s"%(p))
            print("-"*10)
            print("\n")

        # An exact tie of the averages sends the first-listed team through.
        if team_1_prob < team_2_prob:
            print("%s vs. %s: %s avança com a probabilidade de %.2f"%(home, away, away, team_2_prob))
            next_rounds.append(away)
        else:
            print("%s vs. %s: %s avança com a probabilidade de %.2f"%(home, away, home, team_1_prob))
            next_rounds.append(home)

        # `game` is the list stored in playoffs[p], so this updates it in place.
        game.append([team_1_prob, team_2_prob])
            

----------
Simulação das Oitavas de Final
----------


Netherlands vs. United States: Netherlands avança com a probabilidade de 0.61
Argentina vs. Denmark: Argentina avança com a probabilidade de 0.60
Spain vs. Croatia: Spain avança com a probabilidade de 0.59
Brazil vs. Uruguay: Brazil avança com a probabilidade de 0.69
Ecuador vs. England: England avança com a probabilidade de 0.74
Mexico vs. France: France avança com a probabilidade de 0.63
Germany vs. Belgium: Germany avança com a probabilidade de 0.62
Switzerland vs. Portugal: Portugal avança com a probabilidade de 0.68
----------
Simulação das Quartas de Final
----------


Netherlands vs. Argentina: Argentina avança com a probabilidade de 0.50
Spain vs. Brazil: Brazil avança com a probabilidade de 0.64
England vs. France: France avança com a probabilidade de 0.55
Germany vs. Portugal: Germany avança com a probabilidade de 0.50
----------
Simulação da Semi-Final
----------


Argentina vs. Brazil: Brazil avança com a probabilidade 

Agora que o nosso modelo escreveu os palpites dele, podemos comparar com o que de fato aconteceu!

Ele acertou 10 dos 16 países que chegaram às Oitavas de Final (entre os 32 participantes):
- Netherlands 
- United States
- Argentina
- Spain
- Croatia
- Brazil
- England
- France
- Switzerland
- Portugal

.. e acabou errando 6:
- Germany
- Belgium
- Mexico
- Uruguay
- Ecuador
- Denmark

Vemos que o modelo ainda pode ser melhorado, mas considerando que ele começou simulando os 32 países desde o início e conseguiu predizer 10 dos 16 que estariam nas Oitavas, já é um sinal positivo!

Como vimos anteriormente, é esperado que ele tenha um desempenho de aproximadamente 76%.

### 3. Simulação da Copa do Mundo 2022 a partir das quartas de final - atual
Agora iremos simular os próximos resultados da Copa a partir do modelo criado!

In [11]:
def define_winner(team_1, team_2):
    """Print the fixture and the team the model favours, with its probability.

    The model scores the fixture in both orientations (each team in the first
    position of the feature row) and each team's probability is the mean of
    its value in the two predictions.

    Args:
        team_1: Name of the first team, as accepted by ``find_stats``.
        team_2: Name of the second team, as accepted by ``find_stats``.

    Returns:
        None. The result is only printed; on an exact tie ``team_2`` is
        reported as the winner.
    """
    team_1_stats = find_stats(team_1)
    team_2_stats = find_stats(team_2)

    probs_g1 = model.predict_proba([find_features(team_1_stats, team_2_stats)])
    probs_g2 = model.predict_proba([find_features(team_2_stats, team_1_stats)])

    team_1_prob = (probs_g1[0][0] + probs_g2[0][1])/2
    team_2_prob = (probs_g2[0][0] + probs_g1[0][1])/2

    print(f'{team_1} x {team_2}')

    if team_1_prob > team_2_prob:
        favourite, favourite_prob = team_1, team_1_prob
    else:
        favourite, favourite_prob = team_2, team_2_prob

    # Format to one decimal instead of printing round(p, 3)*100, whose float
    # product shows artefacts such as 60.099999999999994%.
    print(f'{favourite} irá vencer com probabilidade de {favourite_prob * 100:.1f}%\n')

#### **Quartas de final**

In [21]:
# Quarter-final fixtures, predicted one after the other.
for fixture in (
    ('Croatia', 'Brazil'),
    ('Netherlands', 'Argentina'),
    ('Morocco', 'Portugal'),
    ('England', 'France'),
):
    define_winner(*fixture)

Croatia x Brazil
Brazil irá vencer com probabilidade de 76.4%

Netherlands x Argentina
Argentina irá vencer com probabilidade de 50.5%

Morocco x Portugal
Portugal irá vencer com probabilidade de 74.4%

England x France
France irá vencer com probabilidade de 54.6%



#### **Semifinal**

In [22]:
# Semi-final fixtures, predicted one after the other.
for fixture in (('Brazil', 'Argentina'), ('Portugal', 'France')):
    define_winner(*fixture)

Brazil x Argentina
Brazil irá vencer com probabilidade de 60.099999999999994%

Portugal x France
France irá vencer com probabilidade de 57.99999999999999%



#### **Final!**

In [23]:
# Final fixture.
define_winner('Brazil', 'France')

Brazil x France
Brazil irá vencer com probabilidade de 61.9%



E com este resultado, nós finalizamos a nossa predição da Copa do Mundo 2022 utilizando Machine Learning.