In [None]:
import os
import requests
import json
from pprint import pprint
from collections import defaultdict
import pandas as pd

In [None]:
# Translates the one-letter position codes carried by lineup entries into the
# role names that the rest of the notebook groups and labels players by.
position_map = dict(
    G='goalkeeper',
    D='defender',
    M='midfield',
    F='forward',
)

In [None]:
def classify_rating(rating):
    """Translate a numeric match rating into a qualitative label.

    Args:
        rating: numeric rating value.

    Returns:
        'terrible' for [3, 6), 'bad' for [6, 6.5), 'average' for [6.5, 7),
        'good' for [7, 8), 'great' for [8, 9), 'excellent' for 9 and above,
        and 'unknown' for anything else (for example values below 3).
    """
    # Half-open bands: lower bound inclusive, upper bound exclusive.
    bands = (
        (3, 6, 'terrible'),
        (6, 6.5, 'bad'),
        (6.5, 7, 'average'),
        (7, 8, 'good'),
        (8, 9, 'great'),
    )
    for lower, upper, label in bands:
        if lower <= rating < upper:
            return label
    return 'excellent' if rating >= 9 else 'unknown'

In [None]:
def compare_players(player_a_stats, player_b_stats, name_a, name_b):
    """Decide which of two players performed better from a weighted stat sum.

    Every statistic in the weight table contributes ``value * weight`` to a
    player's score; a statistic absent from a player's dict counts as 0.
    Keys that are not in the weight table are ignored.

    Args:
        player_a_stats: dict of statistic name -> numeric value for the first player.
        player_b_stats: dict of statistic name -> numeric value for the second player.
        name_a: name used in the verdict when the first player scores higher.
        name_b: name used in the verdict when the second player scores higher.

    Returns:
        A sentence naming the higher-scoring player, or a fixed sentence when
        neither score is strictly greater than the other.
    """
    stat_weights = {
        'goals': 4,
        'goalAssist': 3,
        'onTargetScoringAttempt': 1,
        'accuratePass': 1,
        'keyPass': 2,
        'totalTackle': 2,
        'totalClearance': 2,
        'interceptionWon': 2,
        'saves': 3,
        'goalsPrevented': 3,
        'goodHighClaim': 1,
    }

    def weighted_total(stats):
        # Plain running total accumulated in table order.
        total = 0
        for stat_name, factor in stat_weights.items():
            total += stats.get(stat_name, 0) * factor
        return total

    total_a = weighted_total(player_a_stats)
    total_b = weighted_total(player_b_stats)

    if total_a > total_b:
        return f'{name_a} had a better performance.'
    if total_b > total_a:
        return f'{name_b} had a better performance.'
    return 'Both players had similar performances.'

In [None]:
def generate_player_pairs(players):
    """Build every same-position pairing of the given players.

    Players whose ``'position'`` code is not a key of ``position_map`` are
    left out entirely.

    Args:
        players: iterable of player dicts carrying a ``'position'`` code.

    Returns:
        A list of ``(player_a, player_b, position)`` tuples, where ``position``
        is the mapped role name and ``player_a`` appears before ``player_b``
        in the input order.
    """
    grouped = defaultdict(list)
    for candidate in players:
        role = position_map.get(candidate.get('position'), 'Unknown')
        if role == 'Unknown':
            continue
        grouped[role].append(candidate)

    pairs = []
    for role, members in grouped.items():
        # Pair each member with everyone listed after it: unordered pairs, no repeats.
        for index, first in enumerate(members):
            for second in members[index + 1:]:
                pairs.append((first, second, role))
    return pairs

In [None]:
def process_player_data(player, team_name):
    """Flatten one lineup entry into the stats dict used for comparisons.

    Args:
        player: lineup entry; must provide ``player['statistics']`` (a dict)
            and ``player['player']['name']``, and may provide ``'position'``.
        team_name: team label stored in the result unchanged.

    Returns:
        dict holding the statistics relevant to the player's position (0 when
        a statistic is absent), followed by ``'rating'``, ``'position'``,
        ``'player_name'`` and ``'team_name'``. A position code that is not in
        ``position_map`` yields ``'Unknown'`` and no position statistics.
    """
    stat_keys_by_role = {
        'forward': ['goals', 'goalAssist', 'onTargetScoringAttempt'],
        'midfield': ['accuratePass', 'keyPass', 'goalAssist'],
        'defender': ['totalTackle', 'totalClearance', 'interceptionWon'],
        'goalkeeper': ['saves', 'goalsPrevented', 'goodHighClaim'],
    }
    role = position_map.get(player.get('position'), 'Unknown')
    wanted = stat_keys_by_role.get(role, [])

    summary = {}
    for stat_name in wanted:
        summary[stat_name] = player['statistics'].get(stat_name, 0)

    # Insertion order matters: callers render the dict entries in this order.
    summary['rating'] = player['statistics'].get('rating', 0)
    summary['position'] = role
    summary['player_name'] = player['player']['name']
    summary['team_name'] = team_name
    return summary

In [None]:
def _humanize_stat_name(key):
    """Turn a stat key such as 'goalAssist' or 'goal_assist' into 'Goal assist'."""
    # Open a word boundary before every capital so camelCase keys keep their
    # words; str.capitalize() alone would collapse 'goalAssist' to 'Goalassist'.
    spaced = ''.join(' ' + ch if ch.isupper() else ch for ch in key)
    return ' '.join(spaced.replace('_', ' ').split()).capitalize()


def _format_player_stats(stats):
    """Render the statistic entries of a stats dict as 'Label: value, ...'."""
    return ", ".join(
        f"{_humanize_stat_name(key)}: {value}"
        for key, value in stats.items()
        # Identity fields are mentioned elsewhere in the sentence.
        if key not in ('player_name', 'team_name', 'position')
    )


def _build_instruction(home_team, away_team, player_a, stats_a, player_b, stats_b):
    """Compose the prompt describing both players and asking who did better."""
    role_a = position_map.get(player_a['position'], 'Unknown')
    role_b = position_map.get(player_b['position'], 'Unknown')
    return (
        f"In the match between {home_team} and {away_team}, "
        f"{stats_a['player_name']}, a {role_a} from {stats_a['team_name']}, "
        f"had the following stats: {_format_player_stats(stats_a)}. "
        f"On the other hand, {stats_b['player_name']}, a {role_b} from "
        f"{stats_b['team_name']}, had the following stats: "
        f"{_format_player_stats(stats_b)}. "
        "Based on these statistics, who had a better performance?"
    )


def process_events_for_comparison(qtd_eventos, all_events):
    """Build instruction/response rows comparing same-position players.

    For each of the first ``qtd_eventos`` events, the event payload is fetched
    with ``get_event_data``; events whose status description is not 'Ended'
    are ignored. For ended events the lineups are fetched, each player dict is
    tagged in place with a ``'team_name'`` key, and every same-position pair
    from ``generate_player_pairs`` becomes one dataset row.

    Args:
        qtd_eventos: maximum number of events to process, taken from the start
            of ``all_events``. Values larger than the list are clamped; values
            of 0 or less process nothing.
        all_events: sequence of event dicts, each with an ``'id'`` key.

    Returns:
        list of dicts with keys ``'Instructions'`` (the prompt text) and
        ``'Responses'`` (the verdict from ``compare_players``).
    """
    required_by_position = {
        'forward': ['goals', 'goalAssist', 'onTargetScoringAttempt'],
        'midfield': ['accuratePass', 'keyPass', 'goalAssist'],
        'defender': ['totalTackle', 'totalClearance', 'interceptionWon'],
        'goalkeeper': ['saves', 'goalsPrevented', 'goodHighClaim'],
    }
    training_data = []

    for event in all_events[:max(qtd_eventos, 0)]:
        event_id = event['id']
        data_games = get_event_data(event_id)
        status = data_games.get('event', {}).get('status', {}).get('description', '')
        if status != 'Ended':
            continue

        home_team = data_games['event']['homeTeam']['name']
        away_team = data_games['event']['awayTeam']['name']

        lineups = get_event_lineups(event_id)
        home_players = lineups.get('home', {}).get('players', [])
        away_players = lineups.get('away', {}).get('players', [])

        # Tag every player with the team so the prompt can name it.
        for player in home_players:
            player['team_name'] = home_team
        for player in away_players:
            player['team_name'] = away_team

        for player_a, player_b, position in generate_player_pairs(home_players + away_players):
            stats_a = process_player_data(player_a, player_a.get('team_name', 'Unknown'))
            stats_b = process_player_data(player_b, player_b.get('team_name', 'Unknown'))

            # Skip pairs where a stats dict lacks a required key.
            # NOTE(review): this only fires if process_player_data omits a key;
            # if it defaults absent statistics to 0 the guard never triggers.
            required_stats = required_by_position.get(position, [])
            missing_a = [stat for stat in required_stats if stat not in stats_a]
            missing_b = [stat for stat in required_stats if stat not in stats_b]
            if missing_a:
                print(f"Skipping pair {stats_a['player_name']} vs {stats_b['player_name']} due to missing stats for Player A: {missing_a}")
                continue
            if missing_b:
                print(f"Skipping pair {stats_a['player_name']} vs {stats_b['player_name']} due to missing stats for Player B: {missing_b}")
                continue

            instruction = _build_instruction(
                home_team, away_team, player_a, stats_a, player_b, stats_b
            )
            response = compare_players(
                stats_a, stats_b, stats_a['player_name'], stats_b['player_name']
            )
            training_data.append({
                'Instructions': instruction,
                'Responses': response
            })

    return training_data

In [None]:
def get_event_data(event_id, timeout=30):
    """Fetch the SofaScore event payload for ``event_id``.

    Args:
        event_id: identifier interpolated into the event URL.
        timeout: seconds to wait for the server before giving up, so one
            stalled connection cannot hang the whole collection run.

    Returns:
        The decoded JSON body on HTTP 200. On any failure (network error,
        timeout, non-200 status, or a body that is not valid JSON) a message
        is printed and an empty dict is returned.
    """
    url = f"https://www.sofascore.com/api/v1/event/{event_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Erro ao obter dados do evento {event_id}: {exc}")
        return {}

    if response.status_code != 200:
        print(f"Erro ao obter dados do evento {event_id}: Código de status {response.status_code}")
        return {}

    try:
        return response.json()
    except ValueError as exc:
        # The JSON decoding errors raised by response.json() derive from ValueError.
        print(f"Erro ao obter dados do evento {event_id}: {exc}")
        return {}

In [None]:
def get_event_lineups(event_id, timeout=30):
    """Fetch the SofaScore lineups payload for ``event_id``.

    Args:
        event_id: identifier interpolated into the lineups URL.
        timeout: seconds to wait for the server before giving up, so one
            stalled connection cannot hang the whole collection run.

    Returns:
        The decoded JSON body on HTTP 200. On any failure (network error,
        timeout, non-200 status, or a body that is not valid JSON) a message
        is printed and an empty dict is returned.
    """
    url = f"https://www.sofascore.com/api/v1/event/{event_id}/lineups"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Erro ao obter lineups do evento {event_id}: {exc}")
        return {}

    if response.status_code != 200:
        print(f"Erro ao obter lineups do evento {event_id}: Código de status {response.status_code}")
        return {}

    try:
        return response.json()
    except ValueError as exc:
        # The JSON decoding errors raised by response.json() derive from ValueError.
        print(f"Erro ao obter lineups do evento {event_id}: {exc}")
        return {}

In [None]:
def collect_all_events(tournament_id=325, season_id=58766, timeout=30):
    """Download every page of past events for one tournament season.

    Pages are requested in order starting at 0. A 404 response marks the end
    of the listing; any other failure (network error, timeout, unexpected
    status code, undecodable body) is reported and stops the collection, and
    the events gathered so far are returned.

    Args:
        tournament_id: SofaScore unique-tournament id placed in the URL.
        season_id: SofaScore season id placed in the URL.
        timeout: seconds to wait for each response before giving up.

    Returns:
        list of the event entries found under the ``'events'`` key of each page.
    """
    url_base = (
        "https://www.sofascore.com/api/v1/unique-tournament/"
        f"{tournament_id}/season/{season_id}/events/last/"
    )
    page_number = 0
    all_events = []

    while True:
        try:
            response = requests.get(url_base + str(page_number), timeout=timeout)
        except requests.RequestException as exc:
            print(f"Erro na requisição da página {page_number}: {exc}")
            break

        if response.status_code == 404:
            # No such page: the listing is exhausted.
            break
        if response.status_code != 200:
            print(f"Erro na requisição da página {page_number}: Código de status {response.status_code}")
            break

        try:
            data = response.json()
        except ValueError as exc:
            print(f"Erro na requisição da página {page_number}: {exc}")
            break

        if 'events' in data:
            all_events.extend(data['events'])
        else:
            print(f"Aviso: Chave 'events' não encontrada na página {page_number}.")

        page_number += 1

    print(f"Total de eventos coletados: {len(all_events)}")
    return all_events

In [None]:
def create_dataset():
    """Collect the events, build the comparison rows and write them to disk.

    Every collected event is passed to ``process_events_for_comparison``. The
    resulting rows are placed in a DataFrame, a preview is printed, and the
    frame is written to the working directory twice: as
    ``player_comparison_dataset.csv`` and as ``player_comparison_dataset.jsonl``
    (one JSON record per line).

    Returns:
        The DataFrame holding the generated rows.
    """
    events = collect_all_events()
    records = process_events_for_comparison(len(events), events)

    dataset = pd.DataFrame(records)
    print(dataset.head())

    # CSV copy, intended for fine-tuning.
    dataset.to_csv('player_comparison_dataset.csv', index=False)
    print("Dataset de comparação de jogadores salvo com sucesso.")

    # JSON Lines copy.
    dataset.to_json('player_comparison_dataset.jsonl', orient='records', lines=True)
    print("Dataset de comparação de jogadores salvo em formato JSONL com sucesso.")

    return dataset

In [None]:
 # Run the whole pipeline; create_dataset() also writes the CSV and JSONL files.
 df_train = create_dataset()

Total de eventos coletados: 295
                                        Instructions  \
0  In the match between Fortaleza and Bahia, João...   
1  In the match between Fortaleza and Bahia, João...   
2  In the match between Fortaleza and Bahia, João...   
3  In the match between Fortaleza and Bahia, Maur...   
4  In the match between Fortaleza and Bahia, Maur...   

                                 Responses  
0   João Ricardo had a better performance.  
1   João Ricardo had a better performance.  
2   João Ricardo had a better performance.  
3  Marcos Felipe had a better performance.  
4   Both players had similar performances.  
Dataset de comparação de jogadores salvo com sucesso.
Dataset de comparação de jogadores salvo em formato JSONL com sucesso.


In [None]:
 # Preview the first rows of the generated dataset.
 df_train.head()

Unnamed: 0,Instructions,Responses
0,"In the match between Fortaleza and Bahia, João...",João Ricardo had a better performance.
1,"In the match between Fortaleza and Bahia, João...",João Ricardo had a better performance.
2,"In the match between Fortaleza and Bahia, João...",João Ricardo had a better performance.
3,"In the match between Fortaleza and Bahia, Maur...",Marcos Felipe had a better performance.
4,"In the match between Fortaleza and Bahia, Maur...",Both players had similar performances.


In [None]:
# Show the full prompt text stored for the row labelled 1700.
df_train.loc[1700, 'Instructions']

'In the match between Grêmio and Flamengo, Wallace, a midfield from Flamengo, had the following stats: Accuratepass: 1, Keypass: 0, Goalassist: 0, Rating: 6.4. On the other hand, Luis Aucélio, a midfield from Flamengo, had the following stats: Accuratepass: 0, Keypass: 0, Goalassist: 0, Rating: 0. Based on these statistics, who had a better performance?'

In [None]:
# Show the response stored for the row labelled 1700.
df_train.loc[1700, 'Responses']

'Wallace had a better performance.'

In [None]:
# (rows, columns) of the generated dataset.
df_train.shape

(81825, 2)

In [None]:

import os
import requests
import json
from pprint import pprint
from collections import defaultdict
import pandas as pd

# Translates the one-letter position codes carried by lineup entries into the
# role names that the rest of the notebook groups and labels players by.
position_map = dict(
    G='goalkeeper',
    D='defender',
    M='midfield',
    F='forward',
)

def classify_rating(rating):
    """Translate a numeric match rating into a qualitative label.

    Kept as an optional helper: the stats built in this cell no longer
    include the rating.

    Args:
        rating: numeric rating value.

    Returns:
        'terrible' for [3, 6), 'bad' for [6, 6.5), 'average' for [6.5, 7),
        'good' for [7, 8), 'great' for [8, 9), 'excellent' for 9 and above,
        and 'unknown' for anything else (for example values below 3).
    """
    # Half-open bands: lower bound inclusive, upper bound exclusive.
    bands = (
        (3, 6, 'terrible'),
        (6, 6.5, 'bad'),
        (6.5, 7, 'average'),
        (7, 8, 'good'),
        (8, 9, 'great'),
    )
    for lower, upper, label in bands:
        if lower <= rating < upper:
            return label
    return 'excellent' if rating >= 9 else 'unknown'

def compare_players(player_a_stats, player_b_stats, name_a, name_b, position):
    """Decide which of two players performed better for a given position.

    Only the statistics weighted for ``position`` count towards the score;
    each contributes ``value * weight`` and a statistic absent from a
    player's dict counts as 0. A position without a weight table scores both
    players 0.

    Args:
        player_a_stats: dict of statistic name -> numeric value for the first player.
        player_b_stats: dict of statistic name -> numeric value for the second player.
        name_a: name used in the verdict when the first player scores higher.
        name_b: name used in the verdict when the second player scores higher.
        position: role name selecting the weight table ('forward', 'midfield',
            'defender' or 'goalkeeper').

    Returns:
        A sentence naming the higher-scoring player, or a fixed sentence when
        neither score is strictly greater than the other.
    """
    weights_by_position = {
        'forward': {
            'goals': 5,
            'goalAssist': 3,
            'onTargetScoringAttempt': 2,
        },
        'midfield': {
            'accuratePass': 3,
            'keyPass': 4,
            'goalAssist': 3,
        },
        'defender': {
            'totalTackle': 3,
            'totalClearance': 3,
            'interceptionWon': 2,
        },
        'goalkeeper': {
            'saves': 5,
            'goalsPrevented': 4,
            'goodHighClaim': 2,
        },
    }
    stat_weights = weights_by_position.get(position, {})

    def weighted_total(stats):
        # Plain running total accumulated in table order.
        total = 0
        for stat_name, factor in stat_weights.items():
            total += stats.get(stat_name, 0) * factor
        return total

    total_a = weighted_total(player_a_stats)
    total_b = weighted_total(player_b_stats)

    if total_a > total_b:
        return f'{name_a} had a better performance.'
    if total_b > total_a:
        return f'{name_b} had a better performance.'
    return 'Both players had similar performances.'

def generate_player_pairs(players):
    """Build every same-position pairing of the given players.

    Players whose ``'position'`` code is not a key of ``position_map`` are
    left out entirely.

    Args:
        players: iterable of player dicts carrying a ``'position'`` code.

    Returns:
        A list of ``(player_a, player_b, position)`` tuples, where ``position``
        is the mapped role name and ``player_a`` appears before ``player_b``
        in the input order.
    """
    # Bucket the players by mapped role, dropping unmapped position codes.
    grouped = defaultdict(list)
    for candidate in players:
        role = position_map.get(candidate.get('position'), 'Unknown')
        if role == 'Unknown':
            continue
        grouped[role].append(candidate)

    pairs = []
    for role, members in grouped.items():
        # Pair each member with everyone listed after it: unordered pairs, no repeats.
        for index, first in enumerate(members):
            for second in members[index + 1:]:
                pairs.append((first, second, role))
    return pairs

def process_player_data(player, team_name):
    """Flatten one lineup entry into the stats dict used for comparisons.

    The rating is deliberately not copied into the result.

    Args:
        player: lineup entry; must provide ``player['player']['name']``, may
            provide ``'position'``, and must provide ``player['statistics']``
            (a dict) whenever the position maps to a known role.
        team_name: team label stored in the result unchanged.

    Returns:
        dict holding the statistics relevant to the player's position (0 when
        a statistic is absent), followed by ``'position'``, ``'player_name'``
        and ``'team_name'``. A position code that is not in ``position_map``
        yields ``'Unknown'`` and no position statistics.
    """
    stat_keys_by_role = {
        'forward': ['goals', 'goalAssist', 'onTargetScoringAttempt'],
        'midfield': ['accuratePass', 'keyPass', 'goalAssist'],
        'defender': ['totalTackle', 'totalClearance', 'interceptionWon'],
        'goalkeeper': ['saves', 'goalsPrevented', 'goodHighClaim'],
    }
    role = position_map.get(player.get('position'), 'Unknown')
    wanted = stat_keys_by_role.get(role, [])

    summary = {}
    for stat_name in wanted:
        # Looked up per statistic, so an entry with an unknown position is
        # never required to carry a 'statistics' key.
        summary[stat_name] = player['statistics'].get(stat_name, 0)

    # Insertion order matters: callers render the dict entries in this order.
    summary['position'] = role
    summary['player_name'] = player['player']['name']
    summary['team_name'] = team_name
    return summary

def _humanize_stat_name(key):
    """Turn a stat key such as 'goalAssist' or 'goal_assist' into 'Goal assist'."""
    # Open a word boundary before every capital so camelCase keys keep their
    # words; str.capitalize() alone would collapse 'goalAssist' to 'Goalassist'.
    spaced = ''.join(' ' + ch if ch.isupper() else ch for ch in key)
    return ' '.join(spaced.replace('_', ' ').split()).capitalize()


def _format_player_stats(stats):
    """Render the statistic entries of a stats dict as 'Label: value, ...'."""
    return ", ".join(
        f"{_humanize_stat_name(key)}: {value}"
        for key, value in stats.items()
        # Identity fields are mentioned elsewhere in the sentence.
        if key not in ('player_name', 'team_name', 'position')
    )


def _build_instruction(home_team, away_team, player_a, stats_a, player_b, stats_b):
    """Compose the prompt describing both players and asking who did better."""
    role_a = position_map.get(player_a['position'], 'Unknown')
    role_b = position_map.get(player_b['position'], 'Unknown')
    return (
        f"In the match between {home_team} and {away_team}, "
        f"{stats_a['player_name']}, a {role_a} from {stats_a['team_name']}, "
        f"had the following stats: {_format_player_stats(stats_a)}. "
        f"On the other hand, {stats_b['player_name']}, a {role_b} from "
        f"{stats_b['team_name']}, had the following stats: "
        f"{_format_player_stats(stats_b)}. "
        "Based on these statistics, who had a better performance?"
    )


def process_events_for_comparison(qtd_eventos, all_events):
    """Build instruction/response rows comparing same-position players.

    For each of the first ``qtd_eventos`` events, the event payload is fetched
    with ``get_event_data``; events whose status description is not 'Ended'
    are ignored. For ended events the lineups are fetched, each player dict is
    tagged in place with a ``'team_name'`` key, and every same-position pair
    from ``generate_player_pairs`` is considered. A pair is dropped (with a
    printed message) when either player has more than one required statistic
    that is missing or not positive.

    Args:
        qtd_eventos: maximum number of events to process, taken from the start
            of ``all_events``. Values larger than the list are clamped; values
            of 0 or less process nothing.
        all_events: sequence of event dicts, each with an ``'id'`` key.

    Returns:
        list of dicts with keys ``'Instructions'`` (the prompt text) and
        ``'Responses'`` (the verdict from ``compare_players``).
    """
    required_by_position = {
        'forward': ['goals', 'goalAssist', 'onTargetScoringAttempt'],
        'midfield': ['accuratePass', 'keyPass', 'goalAssist'],
        'defender': ['totalTackle', 'totalClearance', 'interceptionWon'],
        'goalkeeper': ['saves', 'goalsPrevented', 'goodHighClaim'],
    }
    training_data = []

    for event in all_events[:max(qtd_eventos, 0)]:
        event_id = event['id']
        data_games = get_event_data(event_id)
        status = data_games.get('event', {}).get('status', {}).get('description', '')
        if status != 'Ended':
            continue

        home_team = data_games['event']['homeTeam']['name']
        away_team = data_games['event']['awayTeam']['name']

        lineups = get_event_lineups(event_id)
        home_players = lineups.get('home', {}).get('players', [])
        away_players = lineups.get('away', {}).get('players', [])

        # Tag every player with the team so the prompt can name it.
        for player in home_players:
            player['team_name'] = home_team
        for player in away_players:
            player['team_name'] = away_team

        for player_a, player_b, position in generate_player_pairs(home_players + away_players):
            stats_a = process_player_data(player_a, player_a.get('team_name', 'Unknown'))
            stats_b = process_player_data(player_b, player_b.get('team_name', 'Unknown'))

            # Required statistics that are missing or not positive, per player.
            # NOTE(review): '<= 0' also counts negative values as missing; if a
            # statistic such as goalsPrevented can legitimately be negative,
            # confirm that excluding it is intended.
            required_stats = required_by_position.get(position, [])
            missing_a = [stat for stat in required_stats if stats_a.get(stat, 0) <= 0]
            missing_b = [stat for stat in required_stats if stats_b.get(stat, 0) <= 0]

            # At most one missing-or-zero statistic is tolerated per player.
            if len(missing_a) > 1:
                print(f"Skipping pair {stats_a['player_name']} vs {stats_b['player_name']} due to Player A missing or zero stats: {missing_a}")
                continue
            if len(missing_b) > 1:
                print(f"Skipping pair {stats_a['player_name']} vs {stats_b['player_name']} due to Player B missing or zero stats: {missing_b}")
                continue

            instruction = _build_instruction(
                home_team, away_team, player_a, stats_a, player_b, stats_b
            )
            response = compare_players(
                stats_a, stats_b, stats_a['player_name'], stats_b['player_name'], position
            )
            training_data.append({
                'Instructions': instruction,
                'Responses': response
            })

    return training_data

def get_event_data(event_id, timeout=30):
    """Fetch the SofaScore event payload for ``event_id``.

    Args:
        event_id: identifier interpolated into the event URL.
        timeout: seconds to wait for the server before giving up, so one
            stalled connection cannot hang the whole collection run.

    Returns:
        The decoded JSON body on HTTP 200. On any failure (network error,
        timeout, non-200 status, or a body that is not valid JSON) a message
        is printed and an empty dict is returned.
    """
    url = f"https://www.sofascore.com/api/v1/event/{event_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Erro ao obter dados do evento {event_id}: {exc}")
        return {}

    if response.status_code != 200:
        print(f"Erro ao obter dados do evento {event_id}: Código de status {response.status_code}")
        return {}

    try:
        return response.json()
    except ValueError as exc:
        # The JSON decoding errors raised by response.json() derive from ValueError.
        print(f"Erro ao obter dados do evento {event_id}: {exc}")
        return {}

def get_event_lineups(event_id, timeout=30):
    """Fetch the SofaScore lineups payload for ``event_id``.

    Args:
        event_id: identifier interpolated into the lineups URL.
        timeout: seconds to wait for the server before giving up, so one
            stalled connection cannot hang the whole collection run.

    Returns:
        The decoded JSON body on HTTP 200. On any failure (network error,
        timeout, non-200 status, or a body that is not valid JSON) a message
        is printed and an empty dict is returned.
    """
    url = f"https://www.sofascore.com/api/v1/event/{event_id}/lineups"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Erro ao obter lineups do evento {event_id}: {exc}")
        return {}

    if response.status_code != 200:
        print(f"Erro ao obter lineups do evento {event_id}: Código de status {response.status_code}")
        return {}

    try:
        return response.json()
    except ValueError as exc:
        # The JSON decoding errors raised by response.json() derive from ValueError.
        print(f"Erro ao obter lineups do evento {event_id}: {exc}")
        return {}

def collect_all_events(tournament_id=325, season_id=58766, timeout=30):
    """Download every page of past events for one tournament season.

    Pages are requested in order starting at 0. A 404 response marks the end
    of the listing; any other failure (network error, timeout, unexpected
    status code, undecodable body) is reported and stops the collection, and
    the events gathered so far are returned.

    Args:
        tournament_id: SofaScore unique-tournament id placed in the URL.
        season_id: SofaScore season id placed in the URL.
        timeout: seconds to wait for each response before giving up.

    Returns:
        list of the event entries found under the ``'events'`` key of each page.
    """
    url_base = (
        "https://www.sofascore.com/api/v1/unique-tournament/"
        f"{tournament_id}/season/{season_id}/events/last/"
    )
    page_number = 0
    all_events = []

    while True:
        try:
            response = requests.get(url_base + str(page_number), timeout=timeout)
        except requests.RequestException as exc:
            print(f"Erro na requisição da página {page_number}: {exc}")
            break

        if response.status_code == 404:
            # No such page: the listing is exhausted.
            break
        if response.status_code != 200:
            print(f"Erro na requisição da página {page_number}: Código de status {response.status_code}")
            break

        try:
            data = response.json()
        except ValueError as exc:
            print(f"Erro na requisição da página {page_number}: {exc}")
            break

        if 'events' in data:
            all_events.extend(data['events'])
        else:
            print(f"Aviso: Chave 'events' não encontrada na página {page_number}.")

        page_number += 1

    print(f"Total de eventos coletados: {len(all_events)}")
    return all_events

def create_dataset():
    """Collect the events, build the comparison rows and write them to disk.

    Every collected event is passed to ``process_events_for_comparison``. The
    resulting rows are placed in a DataFrame, a preview is printed, and the
    frame is written to the working directory twice: as
    ``player_comparison_dataset.csv`` and as ``player_comparison_dataset.jsonl``
    (one JSON record per line).

    Returns:
        The DataFrame holding the generated rows.
    """
    events = collect_all_events()
    records = process_events_for_comparison(len(events), events)

    dataset = pd.DataFrame(records)
    print(dataset.head())

    # CSV copy, intended for fine-tuning.
    dataset.to_csv('player_comparison_dataset.csv', index=False)
    print("Dataset de comparação de jogadores salvo com sucesso.")

    # JSON Lines copy.
    dataset.to_json('player_comparison_dataset.jsonl', orient='records', lines=True)
    print("Dataset de comparação de jogadores salvo em formato JSONL com sucesso.")

    return dataset

# Run the whole pipeline; create_dataset() also writes the CSV and JSONL files.
df_train = create_dataset()


[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
Skipping pair Leonardo Godoy vs Mateo Gamarra due to Player A missing or zero stats: ['totalTackle', 'totalClearance']
Skipping pair Leonardo Godoy vs Madson due to Player A missing or zero stats: ['totalTackle', 'totalClearance']
Skipping pair Leonardo Godoy vs Railan due to Player A missing or zero stats: ['totalTackle', 'totalClearance']
Skipping pair Leonardo Godoy vs Marllon due to Player A missing or zero stats: ['totalTackle', 'totalClearance']
Skipping pair Leonardo Godoy vs Allyson due to Player A missing or zero stats: ['totalTackle', 'totalClearance']
Skipping pair Leonardo Godoy vs Alan Empereur due to Player A missing or zero stats: ['totalTackle', 'totalClearance']
Skipping pair Leonardo Godoy vs Rikelme due to Player A missing or zero stats: ['totalTackle', 'totalClearance']
Skipping pair Leonardo Godoy vs Matheus Alexandre due to Player A missing or zero stats: ['totalTackle', 'totalClearance']
Ski

In [None]:
# Preview the first rows of the regenerated dataset.
df_train.head()

Unnamed: 0,Instructions,Responses
0,"In the match between Fortaleza and Bahia, Eman...",Benjamin Kuscevic had a better performance.
1,"In the match between Fortaleza and Bahia, Eman...",Tomás Cardona had a better performance.
2,"In the match between Fortaleza and Bahia, Eman...",Emanuel Brítez had a better performance.
3,"In the match between Fortaleza and Bahia, Eman...",Santiago Arias had a better performance.
4,"In the match between Fortaleza and Bahia, Eman...",Gabriel Xavier had a better performance.


In [None]:
# (rows, columns) of the regenerated dataset.
df_train.shape

(11541, 2)

In [None]:
# Show the full prompt text stored for the row labelled 1720.
df_train.loc[1720, 'Instructions']

'In the match between Fluminense and São Paulo, Thiago Silva, a defender from Fluminense, had the following stats: Totaltackle: 1, Totalclearance: 6, Interceptionwon: 2. On the other hand, Nahuel Ferraresi, a defender from São Paulo, had the following stats: Totaltackle: 1, Totalclearance: 2, Interceptionwon: 1. Based on these statistics, who had a better performance?'

In [None]:
# Show the response stored for the row labelled 1720.
df_train.loc[1720, 'Responses']

'Thiago Silva had a better performance.'

In [None]:
# Re-export the dataset with ';' as the field delimiter. The path is the same
# one create_dataset() writes to, so this replaces that comma-separated file.
df_train.to_csv('player_comparison_dataset.csv', sep = ';', index=False)