In [2]:
import pandas as pd

In [3]:
# Load the raw World Cup match records from the CSV in the working directory.
df_matches = pd.read_csv(filepath_or_buffer='WorldCupMatches.csv')


In [4]:
  # --- 2. INITIAL ROW CLEANUP ---
# Drop rows that have no 'Year' value; this eliminates the empty rows.
# Then drop rows that are exact duplicates of another row (the first
# occurrence is kept), so that the message printed below is actually true.
# Both calls return new frames instead of mutating in place, so re-running
# this cell yields the same result.
df_matches = df_matches.dropna(subset=['Year']).drop_duplicates()
print("Linhas vazias e duplicadas foram removidas.")

Linhas vazias e duplicadas foram removidas.


In [5]:
# Preview the last five rows of the frame.
df_matches.tail(n=5)

Unnamed: 0,Year,Datetime,Stage,Stadium,City,Home Team Name,Home Team Goals,Away Team Goals,Away Team Name,Win conditions,Attendance,Half-time Home Goals,Half-time Away Goals,Referee,Assistant 1,Assistant 2,RoundID,MatchID,Home Team Initials,Away Team Initials
847,2014.0,05 Jul 2014 - 17:00,Quarter-finals,Arena Fonte Nova,Salvador,Netherlands,0.0,0.0,Costa Rica,Netherlands win on penalties (4 - 3),51179.0,0.0,0.0,Ravshan IRMATOV (UZB),RASULOV Abduxamidullo (UZB),KOCHKAROV Bakhadyr (KGZ),255953.0,300186488.0,NED,CRC
848,2014.0,08 Jul 2014 - 17:00,Semi-finals,Estadio Mineirao,Belo Horizonte,Brazil,1.0,7.0,Germany,,58141.0,0.0,5.0,RODRIGUEZ Marco (MEX),TORRENTERA Marvin (MEX),QUINTERO Marcos (MEX),255955.0,300186474.0,BRA,GER
849,2014.0,09 Jul 2014 - 17:00,Semi-finals,Arena de Sao Paulo,Sao Paulo,Netherlands,0.0,0.0,Argentina,Argentina win on penalties (2 - 4),63267.0,0.0,0.0,C�neyt �AKIR (TUR),DURAN Bahattin (TUR),ONGUN Tarik (TUR),255955.0,300186490.0,NED,ARG
850,2014.0,12 Jul 2014 - 17:00,Play-off for third place,Estadio Nacional,Brasilia,Brazil,0.0,3.0,Netherlands,,68034.0,0.0,2.0,HAIMOUDI Djamel (ALG),ACHIK Redouane (MAR),ETCHIALI Abdelhak (ALG),255957.0,300186502.0,BRA,NED
851,2014.0,13 Jul 2014 - 16:00,Final,Estadio do Maracana,Rio De Janeiro,Germany,1.0,0.0,Argentina,Germany win after extra time,74738.0,0.0,0.0,Nicola RIZZOLI (ITA),Renato FAVERANI (ITA),Andrea STEFANI (ITA),255959.0,300186501.0,GER,ARG


In [6]:
# --- 3. STANDARDIZING NAMES AND TEXT ---
# Unify country names for consistency. The replacement is restricted to the
# two team-name columns rather than being applied to every column of the
# frame, so unrelated columns can never be rewritten by accident.
team_name_cols = ['Home Team Name', 'Away Team Name']
df_matches[team_name_cols] = df_matches[team_name_cols].replace('Germany FR', 'Germany')
# Strip extra whitespace from the start/end of the 'City' values.
df_matches['City'] = df_matches['City'].str.strip()
print("Nomes de países e cidades foram padronizados.")

Nomes de países e cidades foram padronizados.


In [7]:
# --- 4. CONVERTING DATA TYPES ---
# Cast the 'Year' column to integer; all other columns keep their dtype.
df_matches = df_matches.astype({'Year': int})

In [8]:
 # Convert the goals and attendance columns to integers.
# errors='coerce' turns values that are not numbers into NaN (null),
# and .fillna(0) then replaces those nulls with 0 before the integer cast.
# NOTE(review): after this step a missing value is indistinguishable from a
# genuine 0 — confirm that this is intended for downstream statistics.
for numeric_col in ('Home Team Goals', 'Away Team Goals', 'Attendance'):
    coerced = pd.to_numeric(df_matches[numeric_col], errors='coerce')
    df_matches[numeric_col] = coerced.fillna(0).astype(int)
print("Colunas numéricas (Ano, Gols, Público) convertidas com sucesso.")

Colunas numéricas (Ano, Gols, Público) convertidas com sucesso.


In [9]:
 # --- 5. FINAL CHECK ---
# A single print call with a newline separator emits the same text as
# printing the heading, the tail sample and the second heading one by one.
print(
    "\nTratamento completo! Amostra do DataFrame final:",
    df_matches.tail(),
    "\nVerificando os tipos das colunas após o tratamento:",
    sep="\n",
)
# Print the per-column summary of the treated frame.
df_matches.info()


Tratamento completo! Amostra do DataFrame final:
     Year              Datetime                     Stage  \
847  2014  05 Jul 2014 - 17:00             Quarter-finals   
848  2014  08 Jul 2014 - 17:00                Semi-finals   
849  2014  09 Jul 2014 - 17:00                Semi-finals   
850  2014  12 Jul 2014 - 17:00   Play-off for third place   
851  2014  13 Jul 2014 - 16:00                      Final   

                 Stadium            City Home Team Name  Home Team Goals  \
847     Arena Fonte Nova        Salvador    Netherlands                0   
848     Estadio Mineirao  Belo Horizonte         Brazil                1   
849   Arena de Sao Paulo       Sao Paulo    Netherlands                0   
850     Estadio Nacional        Brasilia         Brazil                0   
851  Estadio do Maracana  Rio De Janeiro        Germany                1   

     Away Team Goals Away Team Name                         Win conditions  \
847                0     Costa Rica  Netherlands

In [10]:
# Preview the last five rows again, now with the converted column types.
df_matches.tail(n=5)

Unnamed: 0,Year,Datetime,Stage,Stadium,City,Home Team Name,Home Team Goals,Away Team Goals,Away Team Name,Win conditions,Attendance,Half-time Home Goals,Half-time Away Goals,Referee,Assistant 1,Assistant 2,RoundID,MatchID,Home Team Initials,Away Team Initials
847,2014,05 Jul 2014 - 17:00,Quarter-finals,Arena Fonte Nova,Salvador,Netherlands,0,0,Costa Rica,Netherlands win on penalties (4 - 3),51179,0.0,0.0,Ravshan IRMATOV (UZB),RASULOV Abduxamidullo (UZB),KOCHKAROV Bakhadyr (KGZ),255953.0,300186488.0,NED,CRC
848,2014,08 Jul 2014 - 17:00,Semi-finals,Estadio Mineirao,Belo Horizonte,Brazil,1,7,Germany,,58141,0.0,5.0,RODRIGUEZ Marco (MEX),TORRENTERA Marvin (MEX),QUINTERO Marcos (MEX),255955.0,300186474.0,BRA,GER
849,2014,09 Jul 2014 - 17:00,Semi-finals,Arena de Sao Paulo,Sao Paulo,Netherlands,0,0,Argentina,Argentina win on penalties (2 - 4),63267,0.0,0.0,C�neyt �AKIR (TUR),DURAN Bahattin (TUR),ONGUN Tarik (TUR),255955.0,300186490.0,NED,ARG
850,2014,12 Jul 2014 - 17:00,Play-off for third place,Estadio Nacional,Brasilia,Brazil,0,3,Netherlands,,68034,0.0,2.0,HAIMOUDI Djamel (ALG),ACHIK Redouane (MAR),ETCHIALI Abdelhak (ALG),255957.0,300186502.0,BRA,NED
851,2014,13 Jul 2014 - 16:00,Final,Estadio do Maracana,Rio De Janeiro,Germany,1,0,Argentina,Germany win after extra time,74738,0.0,0.0,Nicola RIZZOLI (ITA),Renato FAVERANI (ITA),Andrea STEFANI (ITA),255959.0,300186501.0,GER,ARG


In [11]:
# Write the treated frame to a new CSV file, leaving out the row index.
df_matches.to_csv(path_or_buf='WorldCupMatches_tratado.csv', index=False)