In [None]:
import os
import requests
from pprint import pprint
import pandas as pd

In [None]:
# # Lista de IDs dos eventos
# ids = [12117117, 12117120, 12117115, 12117121, 12117123, 12117114, 12117122, 12117119, 12117116, 12117118]

In [None]:
# url = "https://www.sofascore.com/api/v1/unique-tournament/325/season/58766/events/last/0"
# response = requests.get(url)
# data = response.json()
# pprint(data)

In [None]:
import requests
import json

# Paginated "last events" endpoint; the page number is appended to this base URL.
url_base = "https://www.sofascore.com/api/v1/unique-tournament/325/season/58766/events/last/"
page_number = 0
all_events = []

# Request page 0, 1, 2, ... until the API answers 404 (treated here as
# "no more pages") or any other non-200 status (reported and aborted).
while True:
    url = url_base + str(page_number)
    # Without a timeout a stalled connection would block this cell forever.
    response = requests.get(url, timeout=30)

    if response.status_code == 404:
        break

    if response.status_code != 200:
        print(f"Erro na requisição da página {page_number}: Código de status {response.status_code}")
        break

    data = response.json()

    if 'events' in data:
        all_events.extend(data['events'])
    else:
        # A 200 page without 'events' is only reported; paging continues.
        print(f"Aviso: Chave 'events' não encontrada na página {page_number}.")

    page_number += 1

print(f"Total de eventos coletados: {len(all_events)}")

Total de eventos coletados: 295


In [None]:
# Sanity check: show the id of the first collected event.
pprint(all_events[0]['id'])

12117241


In [None]:
# Number of events currently held in all_events.
qtd_eventos = len(all_events)
print(qtd_eventos)

295


In [None]:
def get_event_data(event_id, timeout=30):
    """Fetch the JSON payload describing a single event.

    Args:
        event_id: Identifier interpolated into the ``/api/v1/event/{id}`` URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    url = f"https://www.sofascore.com/api/v1/event/{event_id}"
    response = requests.get(url, timeout=timeout)
    # Fail here, with the status code, instead of handing an error payload
    # to callers that immediately index into the expected structure.
    response.raise_for_status()
    return response.json()

In [None]:
def get_event_lineups(event_id, timeout=30):
    """Fetch the JSON payload with the lineups of a single event.

    Args:
        event_id: Identifier interpolated into the
            ``/api/v1/event/{id}/lineups`` URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    url = f"https://www.sofascore.com/api/v1/event/{event_id}/lineups"
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors at the request instead of returning an error body
    # that would later fail with an unrelated KeyError.
    response.raise_for_status()
    return response.json()

In [None]:
def generate_player_document(team, player):
    """Build the one-line text summary for a player in a match.

    Args:
        team: Team name shown in the ``Team:`` field.
        player: Lineup entry (dict). ``player['player']['name']`` is required;
            ``position`` and ``statistics`` are optional.

    Returns:
        str: ``"Team: ... | Player: ... | Position: ..."`` followed by one
        ``" | key: value"`` segment for each position-specific statistic that
        is present, and a final ``" | rating: ..."`` segment when a rating is
        present.
    """
    position_map = {
        'G': 'goalkeeper',
        'D': 'defender',
        'M': 'midfield',
        'F': 'forward'
    }
    # Statistics reported for each position; unknown positions report none.
    statistics_by_position = {
        'forward': ['goals', 'goalAssist', 'onTargetScoringAttempt'],
        'midfield': ['accuratePass', 'keyPass', 'goalAssist'],
        'defender': ['totalTackle', 'totalClearance', 'interceptionWon'],
        'goalkeeper': ['saves', 'goalsPrevented', 'goodHighClaim'],
    }

    player_name = player['player']['name']
    position = position_map.get(player.get('position'), 'Unknown')
    # Entries without a 'statistics' key (or with a null one) still produce
    # the basic line instead of raising KeyError.
    statistics = player.get('statistics') or {}

    segments = [f"Team: {team} | Player: {player_name} | Position: {position}"]

    # The rating, when available, always comes after the position statistics.
    for key in (*statistics_by_position.get(position, ()), 'rating'):
        value = statistics.get(key, 'N/A')
        if value != 'N/A':
            segments.append(f"{key}: {value}")

    return " | ".join(segments)

In [None]:
# pprint(get_event_lineups(12117200))

In [None]:
def save_match_document(filename, content, is_first):
    """Write one line of text to ``data/<filename>``.

    Args:
        filename: File name (not a path) created inside the ``data`` folder.
        content: Text to write; a trailing newline is appended.
        is_first: When true the file is truncated first (``'w'``); otherwise
            the line is appended (``'a'``).
    """
    # Create the 'data' folder if it does not exist yet.
    os.makedirs('data', exist_ok=True)

    filepath = os.path.join('data', filename)

    mode = 'w' if is_first else 'a'
    # Explicit UTF-8: team and player names carry accented characters, and the
    # platform default encoding may not be able to represent them.
    with open(filepath, mode, encoding='utf-8') as file:
        file.write(content + "\n")


In [None]:
from collections import defaultdict

# Maps the single-letter position codes found in lineup entries to the
# position names used by generate_player_document.
position_map = {
    'G': 'goalkeeper',
    'D': 'defender',
    'M': 'midfield',
    'F': 'forward'
}

def extract_statistics_by_position(players):
    """Collect the names of the statistics seen for each position.

    Args:
        players: Iterable of lineup entries (dicts).

    Returns:
        defaultdict(set): position name -> set of keys found under
        ``'statistics'``. Position codes missing from ``position_map`` are
        grouped under ``'Unknown'``; entries without ``'statistics'`` add
        nothing.
    """
    keys_per_position = defaultdict(set)

    for entry in players:
        label = position_map.get(entry.get('position'), 'Unknown')
        if 'statistics' not in entry:
            continue
        keys_per_position[label].update(entry['statistics'].keys())

    return keys_per_position

def process_events(qtd_eventos, all_events):
    """Write one text document per finished match and collect statistic names.

    For each of the first ``qtd_eventos`` entries of ``all_events`` the event
    details are fetched and a status line is printed. For events whose status
    description is ``'Ended'`` a file ``Rodada<round>_<home>_vs_<away>.txt`` is
    written through ``save_match_document``: first the match summary, then one
    line per home player, then one line per away player.

    Args:
        qtd_eventos: Number of events to process, counted from the start of
            ``all_events``.
        all_events: Sequence of event dicts; only ``'id'`` is read.

    Returns:
        defaultdict(set): position name -> set of statistic keys found across
        all processed lineups.
    """
    all_statistics_by_position = defaultdict(set)

    for index in range(qtd_eventos):
        event_id = all_events[index]['id']
        event_info = get_event_data(event_id)['event']

        home_team = event_info['homeTeam']['name']
        away_team = event_info['awayTeam']['name']
        # Named round_number so the built-in round() is not shadowed.
        round_number = event_info['roundInfo']['round']
        status = event_info['status']['description']
        print(f"{home_team} x {away_team} - rodada {round_number} - id {event_id} - status {status}")

        # Only finished matches produce a document.
        if status != 'Ended':
            continue

        home_score = event_info['homeScore']['current']
        away_score = event_info['awayScore']['current']

        # The file name is derived from the round and both team names.
        match_title = f"Rodada{round_number}_{home_team}_vs_{away_team}.txt"

        match_summary = f"Match: {home_team} vs {away_team} | Round: {round_number} | Status: {status} | Score: {home_team} {home_score} - {away_score} {away_team}"
        save_match_document(match_title, match_summary, is_first=True)

        lineups = get_event_lineups(event_id)

        # Home side first, then away, so the lines keep their original order.
        for side, team_name in (('home', home_team), ('away', away_team)):
            players = lineups[side]['players']

            for position, statistics in extract_statistics_by_position(players).items():
                all_statistics_by_position[position].update(statistics)

            for player in players:
                player_document = generate_player_document(team_name, player)
                save_match_document(match_title, player_document, is_first=False)

    return all_statistics_by_position

# Run the event processing over every collected event.
qtd_eventos = len(all_events)  # process all of the collected events
all_statistics_by_position = process_events(qtd_eventos, all_events)

# Print every 'statistics' key that was found, grouped by position.
print("Chaves de 'statistics' encontradas por posição:")
for position, statistics in all_statistics_by_position.items():
    print(f"Posição {position}: {statistics}")


Fortaleza x Bahia - rodada 27 - id 12117241 - status Ended
Atlético Mineiro x Red Bull Bragantino - rodada 27 - id 12117235 - status Ended
Vasco x Palmeiras - rodada 27 - id 12117238 - status Ended
Cuiabá x Cruzeiro - rodada 27 - id 12117242 - status Ended
São Paulo x Internacional - rodada 27 - id 12117240 - status Ended
Grêmio x Flamengo - rodada 27 - id 12117234 - status Ended
Criciúma x Athletico - rodada 27 - id 12117243 - status Ended
Red Bull Bragantino x Internacional - rodada 16 - id 12851651 - status Ended
Grêmio x Criciúma - rodada 5 - id 12860733 - status Ended
Palmeiras x Atlético Mineiro - rodada 28 - id 12117249 - status Ended
Botafogo x Grêmio - rodada 28 - id 12117248 - status Ended
Juventude x Red Bull Bragantino - rodada 28 - id 12117253 - status Ended
Fortaleza x Cuiabá - rodada 28 - id 12117251 - status Ended
Atlético Goianiense x Fluminense - rodada 28 - id 12117252 - status Ended
São Paulo x Corinthians - rodada 28 - id 12117250 - status Ended
Bahia x Criciúma - 

In [None]:
import os
import requests
import json
from pprint import pprint
from collections import defaultdict
import pandas as pd

# Maps the single-letter position codes found in lineup entries to position
# names (same mapping as the one defined earlier in this notebook).
position_map = {
    'G': 'goalkeeper',
    'D': 'defender',
    'M': 'midfield',
    'F': 'forward'
}

def classify_rating(rating):
    """Translate a numeric rating into its text label.

    Bands (lower bound inclusive): 3 'terrible', 6 'bad', 6.5 'average',
    7 'good', 8 'great', 9 'excellent'. Anything that reaches no band
    (below 3, or NaN) is 'unknown'.
    """
    # Highest band first: the first lower bound reached decides the label.
    bands = (
        (9, 'excellent'),
        (8, 'great'),
        (7, 'good'),
        (6.5, 'average'),
        (6, 'bad'),
        (3, 'terrible'),
    )
    for lower_bound, label in bands:
        if rating >= lower_bound:
            return label
    return 'unknown'

# Statistics a player must have, per position, to be kept by
# process_player_data.
position_required_stats = {
    'forward': ['goals', 'goalAssist', 'onTargetScoringAttempt'],
    'midfield': ['accuratePass', 'keyPass', 'goalAssist'],
    'defender': ['totalTackle', 'totalClearance', 'interceptionWon'],
    'goalkeeper': ['saves', 'goalsPrevented', 'goodHighClaim']
}

def process_player_data(players, event_description):
    """Turn lineup entries into flat feature rows with a rating label.

    Entries are skipped when their position code is not in ``position_map``
    or when any statistic required for the position is missing or equal to 0.
    Each kept entry also receives a ``'ratingLabel'`` key (the input dict is
    modified).

    Args:
        players: Iterable of lineup entries (dicts).
        event_description: Text stored in every produced row.

    Returns:
        list[dict]: One row per kept player with name, position, rating,
        rating label, event id, event description and the tracked statistics
        (0 when absent).
    """
    stat_columns = (
        'goals', 'goalAssist', 'onTargetScoringAttempt', 'accuratePass',
        'keyPass', 'totalTackle', 'totalClearance', 'interceptionWon',
        'saves', 'goalsPrevented', 'goodHighClaim', 'rating',
    )
    rows = []

    for entry in players:
        role = position_map.get(entry.get('position'), 'Unknown')
        if role == 'Unknown':
            continue

        stats = entry.get('statistics', {})
        score = stats.get('rating', 0)

        # NOTE(review): a required statistic equal to 0 drops the player just
        # like a missing one — confirm this filter is intended.
        needed = position_required_stats.get(role, [])
        if any(name not in stats or stats[name] == 0 for name in needed):
            continue

        label = classify_rating(score)
        entry['ratingLabel'] = label

        row = {
            'player_name': entry['player']['name'],
            'position': role,
            'rating': score,
            'ratingLabel': label,
            'event_id': entry.get('event_id'),  # set by the caller before this runs
            'event_description': event_description,
        }
        # 'rating' is part of stat_columns too; it keeps its place in the row.
        row.update({column: stats.get(column, 0) for column in stat_columns})

        rows.append(row)

    return rows

def process_events_for_fine_tuning(qtd_eventos, all_events):
    """Build the per-player feature rows for every finished event.

    For each of the first ``qtd_eventos`` entries of ``all_events`` the event
    details are fetched; events whose status description is not ``'Ended'``
    are skipped. For the rest, the lineups are fetched, every player dict is
    tagged with ``'event_id'`` and the home and away squads are passed, in
    that order, to ``process_player_data``.

    Args:
        qtd_eventos: Number of events to process, counted from the start of
            ``all_events``.
        all_events: Sequence of event dicts; only ``'id'`` is read.

    Returns:
        list[dict]: All rows produced by ``process_player_data``.
    """
    collected_rows = []

    for index in range(qtd_eventos):
        event_id = all_events[index]['id']
        event_info = get_event_data(event_id)['event']

        if event_info['status']['description'] != 'Ended':
            continue

        event_description = f"{event_info['homeTeam']['name']} vs {event_info['awayTeam']['name']}"

        lineups = get_event_lineups(event_id)

        for side in ('home', 'away'):
            squad = lineups[side]['players']
            # Tag each player with the event so rows can be grouped per match.
            for member in squad:
                member['event_id'] = event_id
            collected_rows.extend(process_player_data(squad, event_description))

    return collected_rows

# Collect the events again: this resets all_events and refills it page by page.
url_base = "https://www.sofascore.com/api/v1/unique-tournament/325/season/58766/events/last/"
page_number = 0
all_events = []

# Request page 0, 1, 2, ... until the API answers 404 (treated here as
# "no more pages") or any other non-200 status (reported and aborted).
while True:
    url = url_base + str(page_number)
    # Without a timeout a stalled connection would block this cell forever.
    response = requests.get(url, timeout=30)

    if response.status_code == 404:
        break

    if response.status_code != 200:
        print(f"Erro na requisição da página {page_number}: Código de status {response.status_code}")
        break

    data = response.json()

    if 'events' in data:
        all_events.extend(data['events'])
    else:
        # A 200 page without 'events' is only reported; paging continues.
        print(f"Aviso: Chave 'events' não encontrada na página {page_number}.")

    page_number += 1

print(f"Total de eventos coletados: {len(all_events)}")

# Build the per-player rows for every collected event.
qtd_eventos = len(all_events)
processed_data = process_events_for_fine_tuning(qtd_eventos, all_events)

# One DataFrame row per player row.
df = pd.DataFrame(processed_data)

# Statistic columns that get a per-event, per-position average.
statistics_keys = [
    'goals', 'goalAssist', 'onTargetScoringAttempt', 'accuratePass',
    'keyPass', 'totalTackle', 'totalClearance', 'interceptionWon',
    'saves', 'goalsPrevented', 'goodHighClaim', 'rating'
]

# Mean of each statistic per (event_id, position); the value columns are
# prefixed with 'avg_' before the group keys return as regular columns.
event_position_means = (
    df.groupby(['event_id', 'position'])[statistics_keys]
    .mean()
    .add_prefix('avg_')
    .reset_index()
)

# Attach the averages to every player row of the same event and position.
df = df.merge(event_position_means, on=['event_id', 'position'], how='left')

# Persist the table for the fine-tuning steps.
df.to_csv('processed_player_data.csv', index=False)
print("Dados processados e salvos com sucesso para fine-tuning.")
print("df.shape:", df.shape)

df.head()

Total de eventos coletados: 295
Dados processados e salvos com sucesso para fine-tuning.
df.shape: (1557, 29)


Unnamed: 0,player_name,position,rating,ratingLabel,event_id,event_description,goals,goalAssist,onTargetScoringAttempt,accuratePass,...,avg_onTargetScoringAttempt,avg_accuratePass,avg_keyPass,avg_totalTackle,avg_totalClearance,avg_interceptionWon,avg_saves,avg_goalsPrevented,avg_goodHighClaim,avg_rating
0,Emanuel Brítez,defender,7.3,good,12117241,Fortaleza vs Bahia,0,1,0,24,...,0.0,48.142857,0.857143,2.857143,3.0,1.857143,0.0,0.0,0.0,6.885714
1,Benjamin Kuscevic,defender,7.0,good,12117241,Fortaleza vs Bahia,0,0,0,28,...,0.0,48.142857,0.857143,2.857143,3.0,1.857143,0.0,0.0,0.0,6.885714
2,Tomás Cardona,defender,7.4,good,12117241,Fortaleza vs Bahia,0,0,0,42,...,0.0,48.142857,0.857143,2.857143,3.0,1.857143,0.0,0.0,0.0,6.885714
3,Eros Mancuso,defender,6.4,bad,12117241,Fortaleza vs Bahia,0,0,0,30,...,0.0,48.142857,0.857143,2.857143,3.0,1.857143,0.0,0.0,0.0,6.885714
4,Zé Welison,midfield,6.3,bad,12117241,Fortaleza vs Bahia,0,1,0,32,...,0.0,29.333333,1.333333,1.666667,0.333333,0.333333,0.0,0.0,0.0,6.766667


In [None]:
# Inspect the last rows of the processed player table.
df.tail()

Unnamed: 0,player_name,position,rating,ratingLabel,event_id,event_description,goals,goalAssist,onTargetScoringAttempt,accuratePass,...,avg_onTargetScoringAttempt,avg_accuratePass,avg_keyPass,avg_totalTackle,avg_totalClearance,avg_interceptionWon,avg_saves,avg_goalsPrevented,avg_goodHighClaim,avg_rating
1552,Lucas Esteves,defender,7.5,good,12116996,Vitória vs Bahia,0,0,1,39,...,0.5,45.5,1.5,2.5,4.75,1.0,0.0,0.0,0.0,7.225
1553,Matheuzinho,forward,8.3,great,12116996,Vitória vs Bahia,1,1,3,21,...,2.0,17.0,1.5,0.5,0.0,1.5,0.0,0.0,0.0,8.3
1554,Santiago Arias,defender,6.8,average,12116996,Vitória vs Bahia,0,0,0,37,...,0.5,45.5,1.5,2.5,4.75,1.0,0.0,0.0,0.0,7.225
1555,Luciano Juba,defender,6.8,average,12116996,Vitória vs Bahia,0,0,0,55,...,0.5,45.5,1.5,2.5,4.75,1.0,0.0,0.0,0.0,7.225
1556,Biel,forward,8.3,great,12116996,Vitória vs Bahia,1,1,1,13,...,2.0,17.0,1.5,0.5,0.0,1.5,0.0,0.0,0.0,8.3


In [None]:
# Show the row labelled 0, one entry per column.
df.loc[0]

Unnamed: 0,0
player_name,Emanuel Brítez
position,defender
rating,7.3
ratingLabel,good
event_id,12117241
event_description,Fortaleza vs Bahia
goals,0
goalAssist,1
onTargetScoringAttempt,0
accuratePass,24


In [None]:
# Column dtypes and non-null counts of the processed table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1557 entries, 0 to 1556
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   player_name                 1557 non-null   object 
 1   position                    1557 non-null   object 
 2   rating                      1557 non-null   float64
 3   ratingLabel                 1557 non-null   object 
 4   event_id                    1557 non-null   int64  
 5   event_description           1557 non-null   object 
 6   goals                       1557 non-null   int64  
 7   goalAssist                  1557 non-null   int64  
 8   onTargetScoringAttempt      1557 non-null   int64  
 9   accuratePass                1557 non-null   int64  
 10  keyPass                     1557 non-null   int64  
 11  totalTackle                 1557 non-null   int64  
 12  totalClearance              1557 non-null   int64  
 13  interceptionWon             1557 

In [None]:
def create_training_data(df):
    """
    Add the prompt column 'x' and the label column 'y' used for fine-tuning.

    Float values are rendered with one decimal place inside the prompt. The
    columns are assigned by row position, so the result is correct even when
    the index holds repeated labels, and an empty frame still receives both
    columns.

    Args:
        df: DataFrame with the player rows. It is modified in place.

    Returns:
        DataFrame: The same ``df`` object with 'x' (input text) and 'y'
        (``"His performance was <ratingLabel>"``) added.
    """
    prompts = []
    labels = []

    for record in df.to_dict(orient='records'):
        # Floats are shown with one decimal place; other values are left as is.
        player_stats = {
            key: f"{value:.1f}" if isinstance(value, float) else value
            for key, value in record.items()
        }
        position = player_stats.get('position')

        # For each position: the player's own numbers and the matching averages.
        if position == 'forward':
            own = f" scored {player_stats.get('goals')} goals, provided {player_stats.get('goalAssist')} assists, and had {player_stats.get('onTargetScoringAttempt')} shots on target."
            average = f"{player_stats.get('avg_goals')} goals, {player_stats.get('avg_goalAssist')} assists, and {player_stats.get('avg_onTargetScoringAttempt')} shots on target."
        elif position == 'midfield':
            own = f" had {player_stats.get('accuratePass')} accurate passes, created {player_stats.get('keyPass')} key chances, and provided {player_stats.get('goalAssist')} assists."
            average = f"{player_stats.get('avg_accuratePass')} accurate passes, {player_stats.get('avg_keyPass')} key chances, and {player_stats.get('avg_goalAssist')} assists."
        elif position == 'defender':
            own = f" made {player_stats.get('totalTackle')} tackles, {player_stats.get('totalClearance')} clearances, and {player_stats.get('interceptionWon')} interceptions."
            average = f"{player_stats.get('avg_totalTackle')} tackles, {player_stats.get('avg_totalClearance')} clearances, and {player_stats.get('avg_interceptionWon')} interceptions."
        elif position == 'goalkeeper':
            own = f" made {player_stats.get('saves')} saves, prevented {player_stats.get('goalsPrevented')} goals, and made {player_stats.get('goodHighClaim')} high claims."
            average = f"{player_stats.get('avg_saves')} saves, {player_stats.get('avg_goalsPrevented')} prevented goals, and {player_stats.get('avg_goodHighClaim')} high claims."
        else:
            # Any other position contributes no statistics to the prompt.
            own = ""
            average = ""

        prompt = f"In the match between {player_stats.get('event_description')}, the player {player_stats.get('player_name')}, playing as a {position}"
        prompt += own
        prompt += " The average for players in this position was "
        prompt += average
        prompt += " Taking this into account, how was his performance?"

        prompts.append(prompt)
        labels.append("His performance was " + player_stats.get('ratingLabel'))

    # Single positional assignment instead of one label-based write per row.
    df['x'] = prompts
    df['y'] = labels

    return df

# Add the 'x' (prompt) and 'y' (label) columns to df and preview the result.
df = create_training_data(df)
df.head()

Unnamed: 0,player_name,position,rating,ratingLabel,event_id,event_description,goals,goalAssist,onTargetScoringAttempt,accuratePass,...,avg_keyPass,avg_totalTackle,avg_totalClearance,avg_interceptionWon,avg_saves,avg_goalsPrevented,avg_goodHighClaim,avg_rating,x,y
0,Emanuel Brítez,defender,7.3,good,12117241,Fortaleza vs Bahia,0,1,0,24,...,0.857143,2.857143,3.0,1.857143,0.0,0.0,0.0,6.885714,"In the match between Fortaleza vs Bahia, the p...",His performance was good
1,Benjamin Kuscevic,defender,7.0,good,12117241,Fortaleza vs Bahia,0,0,0,28,...,0.857143,2.857143,3.0,1.857143,0.0,0.0,0.0,6.885714,"In the match between Fortaleza vs Bahia, the p...",His performance was good
2,Tomás Cardona,defender,7.4,good,12117241,Fortaleza vs Bahia,0,0,0,42,...,0.857143,2.857143,3.0,1.857143,0.0,0.0,0.0,6.885714,"In the match between Fortaleza vs Bahia, the p...",His performance was good
3,Eros Mancuso,defender,6.4,bad,12117241,Fortaleza vs Bahia,0,0,0,30,...,0.857143,2.857143,3.0,1.857143,0.0,0.0,0.0,6.885714,"In the match between Fortaleza vs Bahia, the p...",His performance was bad
4,Zé Welison,midfield,6.3,bad,12117241,Fortaleza vs Bahia,0,1,0,32,...,1.333333,1.666667,0.333333,0.333333,0.0,0.0,0.0,6.766667,"In the match between Fortaleza vs Bahia, the p...",His performance was bad


In [None]:
# Show the full prompt text generated for the row labelled 0.
df.loc[0, 'x']

'In the match between Fortaleza vs Bahia, the player Emanuel Brítez, playing as a defender made 1 tackles, 1 clearances, and 3 interceptions. The average for players in this position was 2.9 tackles, 3.0 clearances, and 1.9 interceptions. Taking this into account, how was his performance?'

In [None]:
# Keep only the prompt and label columns for the training set.
df_train = df[['x', 'y']]
df_train.head()

Unnamed: 0,x,y
0,"In the match between Fortaleza vs Bahia, the p...",His performance was good
1,"In the match between Fortaleza vs Bahia, the p...",His performance was good
2,"In the match between Fortaleza vs Bahia, the p...",His performance was good
3,"In the match between Fortaleza vs Bahia, the p...",His performance was bad
4,"In the match between Fortaleza vs Bahia, the p...",His performance was bad


In [None]:
# Rename the columns to 'Instructions' / 'Responses' for the exported dataset.
df_train = df_train.rename(columns={'x': 'Instructions', 'y': 'Responses'})
df_train.head()

Unnamed: 0,Instructions,Responses
0,"In the match between Fortaleza vs Bahia, the p...",His performance was good
1,"In the match between Fortaleza vs Bahia, the p...",His performance was good
2,"In the match between Fortaleza vs Bahia, the p...",His performance was good
3,"In the match between Fortaleza vs Bahia, the p...",His performance was bad
4,"In the match between Fortaleza vs Bahia, the p...",His performance was bad


In [None]:
# Export the training set as a semicolon-separated file without the index column.
df_train.to_csv('player_performance_analysis_dataset.csv', sep=';', index=False)