In [None]:
import requests
import json
import pandas as pd

In [None]:
def get_event_data(event_id, timeout=30):
    """
    Fetch the basic data of an event (match) from the Sofascore API.

    Args:
        event_id: Identifier of the event; interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response. The HTTP status code is not
        checked here, so whatever JSON the server sends is returned as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = f"https://www.sofascore.com/api/v1/event/{event_id}"
    response = requests.get(url, timeout=timeout)
    return response.json()


In [None]:
def get_event_statistics(event_id, timeout=30):
    """
    Fetch the detailed statistics of an event (match) from the Sofascore API.

    Args:
        event_id: Identifier of the event; interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response. The HTTP status code is not
        checked here; process_events_for_team_performance looks for a
        'statistics' key in the returned payload instead.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = f"https://www.sofascore.com/api/v1/event/{event_id}/statistics"
    response = requests.get(url, timeout=timeout)
    return response.json()


In [None]:
def extract_stat_value(value):
    """
    Turn a raw statistic into a number.

    Strings have every '%' removed and every ',' replaced by '.', are
    stripped of surrounding whitespace and then parsed as a float; a string
    that still cannot be parsed yields 0.0. Ints and floats are returned
    untouched, and any other type yields 0.0.
    """
    if isinstance(value, (int, float)):
        # Already numeric: hand it back unchanged.
        return value
    if not isinstance(value, str):
        return 0.0

    cleaned = value.replace('%', '').replace(',', '.').strip()
    try:
        return float(cleaned)
    except ValueError:
        return 0.0


In [None]:
def _collect_all_period_stats(statistics_list):
    """
    Return ``(home_stats, away_stats)`` for the first period named 'ALL'.

    Each dict maps a statistic name to the raw value reported for that side;
    a side missing from a statistic item defaults to the string '0'.
    Returns ``None`` when no period entry is named 'ALL'.
    """
    for period_data in statistics_list:
        if period_data['period'] != 'ALL':
            continue
        home_stats = {}
        away_stats = {}
        for group in period_data['groups']:
            for stat in group['statisticsItems']:
                stat_name = stat['name']
                home_stats[stat_name] = stat.get('home', '0')
                away_stats[stat_name] = stat.get('away', '0')
        return home_stats, away_stats
    return None


def process_events_for_team_performance(all_events):
    """
    Collect per-team statistics for every finished event in ``all_events``.

    For each event the basic data and the statistics are fetched through
    get_event_data and get_event_statistics. Only events whose status
    description is 'Ended' are kept, and each of them contributes two
    records: one for the home team and one for the away team.

    Args:
        all_events: Iterable of dicts, each providing the event id under 'id'.

    Returns:
        A list of dicts with the keys 'event_id', 'event_description',
        'team', 'opponent', 'is_home' and 'statistics' (statistic name ->
        raw value for that team).

    Events whose payload carries no 'event' entry, no 'statistics' entry or
    no 'ALL' period are reported with a printed message and skipped, so one
    bad response does not discard the results gathered so far.
    """
    team_performance_data = []

    for event in all_events:
        event_id = event['id']
        data_games = get_event_data(event_id)

        # An error payload has no 'event' entry; skip it instead of raising.
        event_info = data_games.get('event')
        if event_info is None:
            print(f"Dados não encontrados para o evento {event_id}")
            continue

        if event_info['status']['description'] != 'Ended':
            continue

        home_team = event_info['homeTeam']['name']
        away_team = event_info['awayTeam']['name']
        event_description = f"{home_team} vs {away_team}"

        # Fetch the match statistics.
        statistics_data = get_event_statistics(event_id)
        if 'statistics' not in statistics_data:
            print(f"Estatísticas não encontradas para o evento {event_id}")
            continue

        period_stats = _collect_all_period_stats(statistics_data['statistics'])
        if period_stats is None:
            print(f"Período 'ALL' não encontrado para o evento {event_id}")
            continue

        home_stats, away_stats = period_stats
        # One record per side, home team first.
        for team, opponent, is_home, stats in (
            (home_team, away_team, True, home_stats),
            (away_team, home_team, False, away_stats),
        ):
            team_performance_data.append({
                'event_id': event_id,
                'event_description': event_description,
                'team': team,
                'opponent': opponent,
                'is_home': is_home,
                'statistics': stats
            })
    return team_performance_data


In [None]:
def create_team_performance_training_data(team_performance_data):
    """
    Build instruction/response text pairs from per-team match statistics.

    Args:
        team_performance_data: Iterable of dicts providing 'team',
            'opponent', 'event_description' and 'statistics' (statistic
            name -> raw value).

    Returns:
        A list of dicts with the keys 'Instructions' (a question that quotes
        the team's numbers) and 'Responses' (a canned assessment chosen from
        ball possession, shots on target and pass accuracy thresholds).
    """
    training_data = []

    for data in team_performance_data:
        team_name = data['team']
        opponent_name = data['opponent']
        event_description = data['event_description']
        stats = data['statistics']

        # Parse the relevant statistics; missing ones count as zero.
        # NOTE(review): every row in this notebook's recorded preview falls
        # into the "below-expected" branch - verify that these keys match the
        # statistic names the API actually returns.
        ball_possession_value = extract_stat_value(stats.get('Ball possession', '0%'))
        pass_success_value = extract_stat_value(stats.get('Passes accurate %', '0%'))
        total_shots_value = extract_stat_value(stats.get('Total shots', 0))
        shots_on_target_value = extract_stat_value(stats.get('Shots on target', 0))
        corner_kicks_value = extract_stat_value(stats.get('Corner kicks', 0))
        fouls_value = extract_stat_value(stats.get('Fouls', 0))

        # The templates below append their own '%', so render the parsed
        # numbers here; interpolating a raw value such as '55%' would
        # produce '55%%' in the generated text.
        ball_possession = f"{ball_possession_value:g}"
        pass_success = f"{pass_success_value:g}"

        # Create the instruction
        instruction = (f"In the match {event_description}, the team {team_name} had {ball_possession}% ball possession, "
                       f"{int(total_shots_value)} total shots, {int(shots_on_target_value)} shots on target, pass accuracy of {pass_success}%, "
                       f"{int(corner_kicks_value)} corner kicks and committed {int(fouls_value)} fouls. "
                       f"How was the team's overall performance?")

        # Pick the response tier; the first matching branch wins, so the
        # tiers are checked from most to least demanding.
        if ball_possession_value >= 60 and shots_on_target_value >= 10 and pass_success_value >= 85:
            performance = (
                f"The {team_name} displayed exceptional performance, dominating ball possession with {ball_possession}% and "
                f"creating {int(total_shots_value)} goal opportunities. Additionally, the team maintained a high pass accuracy of {pass_success}%, "
                f"demonstrating absolute control of the game."
            )
        elif ball_possession_value >= 55 and shots_on_target_value >= 7 and pass_success_value >= 80:
            performance = (
                f"The {team_name} had a very good performance, controlling the game with {ball_possession}% ball possession. "
                f"The team made {int(shots_on_target_value)} shots on target and maintained a pass accuracy of {pass_success}%, "
                f"indicating a strong offensive capability."
            )
        elif ball_possession_value >= 50 and shots_on_target_value >= 4 and pass_success_value >= 75:
            performance = (
                f"The {team_name} had a good performance, ensuring a ball possession of {ball_possession}% and creating several goal chances. "
                f"With {int(shots_on_target_value)} shots on target and a pass accuracy of {pass_success}%, the team showed consistency "
                f"in their game strategy."
            )
        elif ball_possession_value >= 45 and shots_on_target_value >= 2 and pass_success_value >= 70:
            performance = (
                f"The {team_name} showed an average performance, competing evenly against {opponent_name}. "
                f"With {ball_possession}% ball possession and {int(shots_on_target_value)} shots on target, the team managed "
                f"to stay competitive throughout the game."
            )
        else:
            performance = (
                f"The {team_name} had a below-expected performance, with limited ball possession of only {ball_possession}%. "
                f"The team created few goal opportunities, reflected in the {int(shots_on_target_value)} shots on target, "
                f"and faced difficulties in maintaining pass accuracy with {pass_success}%. "
                f"These factors contributed to an unsatisfactory performance in the match."
            )

        training_data.append({
            'Instructions': instruction,
            'Responses': performance
        })
    return training_data


In [None]:
# Base URL of the paginated event listing for one tournament season; the
# page number (starting at 0) is appended to it.
url_base = "https://www.sofascore.com/api/v1/unique-tournament/325/season/58766/events/last/"
page_number = 0
all_events = []

while True:
    url = url_base + str(page_number)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)

    if response.status_code == 404:
        # A 404 is treated as the end of the pagination, not as an error.
        break

    if response.status_code != 200:
        print(f"Erro na requisição da página {page_number}: Código de status {response.status_code}")
        break

    data = response.json()

    if 'events' in data:
        all_events.extend(data['events'])
    else:
        print(f"Aviso: Chave 'events' não encontrada na página {page_number}.")

    page_number += 1

print(f"Total de eventos coletados: {len(all_events)}")


Total de eventos coletados: 295


In [None]:
# Process the collected events into per-team statistics records
team_performance_data = process_events_for_team_performance(all_events)


Estatísticas não encontradas para o evento 12117001


In [None]:
# Build the training dataset (instruction/response pairs) from the per-team records
training_data = create_team_performance_training_data(team_performance_data)


In [None]:
# Convert the list of training records to a DataFrame
df_team_performance = pd.DataFrame(training_data)

# Preview the first rows of the DataFrame
df_team_performance.head()


Unnamed: 0,Instructions,Responses
0,"In the match Fortaleza vs Bahia, the team Fort...",The Fortaleza had a below-expected performance...
1,"In the match Fortaleza vs Bahia, the team Bahi...","The Bahia had a below-expected performance, wi..."
2,In the match Atlético Mineiro vs Red Bull Brag...,The Atlético Mineiro had a below-expected perf...
3,In the match Atlético Mineiro vs Red Bull Brag...,The Red Bull Bragantino had a below-expected p...
4,"In the match Vasco vs Palmeiras, the team Vasc...","The Vasco had a below-expected performance, wi..."


In [None]:
# Save the DataFrame as a ';'-separated CSV file, without the index column
df_team_performance.to_csv('team_performance_dataset.csv', index=False, sep=';')
print("Dataset de desempenho da equipe salvo com sucesso.")


Dataset de desempenho da equipe salvo com sucesso.


In [None]:
# Preview a few entries of the dataset
df_team_performance.head()


Unnamed: 0,Instructions,Responses
0,"In the match Fortaleza vs Bahia, the team Fort...",The Fortaleza had a below-expected performance...
1,"In the match Fortaleza vs Bahia, the team Bahi...","The Bahia had a below-expected performance, wi..."
2,In the match Atlético Mineiro vs Red Bull Brag...,The Atlético Mineiro had a below-expected perf...
3,In the match Atlético Mineiro vs Red Bull Brag...,The Red Bull Bragantino had a below-expected p...
4,"In the match Vasco vs Palmeiras, the team Vasc...","The Vasco had a below-expected performance, wi..."


In [None]:
# Shape of the training DataFrame as (rows, columns)
df_team_performance.shape

(570, 2)

In [None]:
# Write the dataset to the same CSV path once more (';' separator, no index column)
df_team_performance.to_csv('team_performance_dataset.csv', sep=';', index=False)