In [123]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [102]:
df_players = pd.read_csv('WorldCupPlayers.csv')
df_players.tail()

Unnamed: 0,RoundID,MatchID,Team Initials,Coach Name,Line-up,Shirt Number,Player Name,Position,Event
37779,255959,300186501,ARG,SABELLA Alejandro (ARG),N,19,ALVAREZ,,
37780,255959,300186501,GER,LOEW Joachim (GER),N,6,KHEDIRA,,
37781,255959,300186501,ARG,SABELLA Alejandro (ARG),N,20,AGUERO,,IH46' Y65'
37782,255959,300186501,GER,LOEW Joachim (GER),N,21,MUSTAFI,,
37783,255959,300186501,ARG,SABELLA Alejandro (ARG),N,23,BASANTA,,


In [120]:
# --- 3. LIMPEZA GERAL DE LINHAS E COLUNAS ---
    # Remover linhas duplicadas
df_players.drop_duplicates(inplace=True)
    # Remover linhas onde o nome do jogador é nulo
df_players.dropna(subset=['Player Name'], inplace=True)

In [121]:
 # --- 4. TRATAMENTO DA COLUNA 'Event' ---
    # Preencher eventos nulos com 'No Event'
df_players['Event'].fillna('No Event', inplace=True)
    # Contar o número de gols (substring 'G') em cada evento
df_players['GoalsScored'] = df_players['Event'].str.count('G')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_players['Event'].fillna('No Event', inplace=True)


In [124]:
 # --- 5. TRATAMENTO DA COLUNA 'Position' ---
    # Preencher posições nulas com uma string vazia para manipulação
df_players['Position'].fillna('', inplace=True)
    # Criar coluna booleana 'IsCaptain' se a posição contiver 'C'
df_players['IsCaptain'] = df_players['Position'].str.contains('C')
    # Criar coluna de posição simplificada: 'Goalkeeper' ou 'Line Player'
df_players['PlayerPosition'] = np.where(
        df_players['Position'].str.contains('GK'),
        'Goalkeeper',
        'Line Player'
    )
    # Agora podemos remover a coluna original 'Position' que não é mais necessária
df_players.drop('Position', axis=1, inplace=True)

S=Line-up, N=Substitute

In [127]:
 # --- 6. VERIFICAÇÃO FINAL ---
print("\nTratamento completo! Amostra do DataFrame final:")
    # Mostrando as colunas mais importantes que criamos ou limpamos
print(df_players.sample(10))


Tratamento completo! Amostra do DataFrame final:
        RoundID    MatchID Team Initials                  Coach Name Line-up  \
10055       262       1955           AUS            RASIC Rale (YUG)       S   
3870        212       1264           ENG   WINTERBOTTOM Walter (ENG)       S   
18986       322        111           CMR  NEPOMNYASHCHI Valeri (RUS)       S   
24269      1014       8766           MEX       LAPUENTE Manuel (MEX)       S   
8529       3478       1633           ENG            RAMSEY Alf (ENG)       S   
1701        206       1157           CUB            TAPIA Jose (CUB)       S   
23466      1014       8747           NGA      MILUTINOVIC Bora (YUG)       N   
32339    249722  300061475           DEN          OLSEN Morten (DEN)       N   
30422  97410100   97410046           KOR         Dick ADVOCAAT (NED)       N   
34408    255931  300186513           ENG           HODGSON Roy (ENG)       N   

       Shirt Number       Player Name     Event   Goal  Yellow Card  

In [126]:
 # --- 7. (EXEMPLO DE USO) ANÁLISE DE ARTILHARIA ---
player_stats = df_players.groupby('Player Name')['GoalsScored'].sum()
top_scorers = player_stats.sort_values(ascending=False)
print("\n--- Maiores Artilheiros (Análise de Exemplo) ---")
print(top_scorers.head())


--- Maiores Artilheiros (Análise de Exemplo) ---
Player Name
RONALDO                               16
KLOSE                                 16
Just FONTAINE                         13
Gerd MUELLER                          13
PEL� (Edson Arantes do Nascimento)    12
Name: GoalsScored, dtype: int64


In [103]:
df_players['Event'].value_counts()

Event
OH46'              247
IH46'              206
Y1'                 87
I77'                78
I78'                78
                  ... 
Y58' G59' G81'       1
P4'                  1
Y1' Y49' RSY49'      1
G18' G84'            1
G60' O83'            1
Name: count, Length: 1893, dtype: int64

G=Goal, OG=Own Goal, Y=Yellow Card, R=Red Card, SY = Red Card by second yellow, P=Penalty, MP=Missed Penalty, I = Substitution In, O=Substitute Out

In [128]:
df_players.to_csv('WorldCupPlayers_tratado.csv', index=False)