## ETL para análise de edições do FIFA 22

In [155]:
import pandas as pd

# Base directory of the table files that the cells below load with pd.read_csv.
basePath = './tablesV2/utf/'


#### Importa dados de jogadores

In [156]:
# Load the players table and keep only the name-id columns plus the player key.
newPlayers = basePath + 'players.txt'

# Raw string: '\s' inside a plain literal is an invalid escape sequence and
# raises a SyntaxWarning on recent Python versions. The regex is unchanged.
dfPlayers = pd.read_csv(newPlayers, sep=r'\s+')
# One row per playerid; the first occurrence wins.
dfPlayers = dfPlayers[['firstnameid', 'lastnameid', 'commonnameid', 'playerid']].drop_duplicates(subset='playerid', keep='first')

#### Trata dados de jogadores cruzando com bases de nomes

In [157]:
# Build a nameid -> name lookup from the player-names table.
# NOTE: `newPlayers` is reused here and now points at playernames.txt.
newPlayers = basePath + 'playernames.txt'

dfPlayersName = pd.read_csv(
    newPlayers,
    sep='\t',
    encoding='utf-8',
    names=['nameid', 'commentaryid', 'name'],
    skiprows=1,  # first line is skipped; column names come from `names`
    engine='python',
)
# Keep a single row per nameid (first occurrence wins) so the lookup is unambiguous.
dfPlayersName = dfPlayersName.drop_duplicates(subset='nameid', keep='first')

playersOriginalNames = dfPlayersName.set_index('nameid')['name'].to_dict()

In [158]:
# Players whose names are resolved through the name lookup (firstnameid > 0).
dfOriginalPlayers = dfPlayers.loc[dfPlayers['firstnameid'].gt(0)].copy()

# (id column, resulting text column) pairs to translate via playersOriginalNames.
colsToSearch = [('firstnameid','firstname'),('lastnameid', 'surname'), ('commonnameid','commonname')]

# Ids that are absent from the lookup come out as NaN.
dfOriginalPlayers = dfOriginalPlayers.assign(**{
    col_name: dfOriginalPlayers[col_id].map(playersOriginalNames)
    for col_id, col_name in colsToSearch
})

# NOTE: string + NaN yields NaN, so fullname is missing whenever either
# firstname or surname could not be resolved.
dfOriginalPlayers['fullname'] = (
    dfOriginalPlayers['firstname'] + ' ' + dfOriginalPlayers['surname']
)
dfOriginalPlayers.head()

Unnamed: 0,firstnameid,lastnameid,commonnameid,playerid,firstname,surname,commonname,fullname
0,15353,6531,0,27,Joe,Cole,,Joe Cole
1,1882,14117,14116,41,Andrés,Iniesta Luján,Iniesta,Andrés Iniesta Luján
2,1340,45088,0,65,Alexander,,,
3,20389,44974,0,67,Manuel,,,
4,21082,16798,0,82,Max,Kessler,,Max Kessler


In [159]:
# Load the edited-player names table, which stores the names as text per playerid.
editPlayersNames = basePath + 'editedplayernames.txt'
dfEditedPlayerNames = pd.read_csv(
    editPlayersNames,
    sep='\t',
    encoding='utf-8',
    names=['firstname','commonname','playerjerseyname','surname','playerid'],
    skiprows=1,  # first line is skipped; column names come from `names`
    engine='python',
).dropna()  # discard every row that has a missing value in any column

In [160]:
# Players with firstnameid == 0 take their names from the edited-names table.
dfEditedPlayers = pd.merge(
    dfPlayers.loc[dfPlayers['firstnameid'].eq(0)],
    dfEditedPlayerNames.loc[:, ['playerid','firstname','surname','commonname']],
    on='playerid',
    how='left',  # keep players even when no edited name row exists
)
# fullname is NaN when either part is missing (string + NaN -> NaN).
dfEditedPlayers['fullname'] = (
    dfEditedPlayers['firstname'] + ' ' + dfEditedPlayers['surname']
)
# NOTE(review): this previews the names table, not the merged dfEditedPlayers — confirm intended.
dfEditedPlayerNames.head()

Unnamed: 0,firstname,commonname,playerjerseyname,surname,playerid
5,Lamine,Lamine Yamal,Lamine Yamal,Yamal,269150
6,Endrick Felipe,Endrick,Endrick,Moreira de Sousa,6357
7,Antonio,Silva,Silva,Silva,256125
8,Adrianne,Iron,Iron,Reid,1661
9,José Luis,Joselu,Joselu,Sanmartín Mato,173608


In [161]:
# Stack both player groups and drop the id columns, which are no longer
# needed once the names are resolved. The original row labels are kept.
dfPlayers = pd.concat([dfOriginalPlayers, dfEditedPlayers]).drop(
    columns=['firstnameid','lastnameid','commonnameid']
)

### Inicia busca de times

In [162]:
# Load the teams table as a teamid -> teamname listing.
teamsPath = basePath + 'teams.txt'

dfTeams = pd.read_csv(teamsPath, sep='\t', encoding='utf-8')
# One row per teamid (first occurrence wins by default).
dfTeams = dfTeams.drop_duplicates(subset='teamid')
# Keep only the id and name, discarding rows where either is missing.
dfTeams = dfTeams.loc[:, ['teamid', 'teamname']].dropna()

#### Retira seleções da listagem de times

In [163]:
# Load leagues and league-team links, drop the excluded leagues, and attach
# the league name to every remaining link row.
leaguesPath = basePath + 'leagues.txt'
leaguesTeamsPath = basePath + 'leagueteamlinks.txt'

# League ids to leave out of the team listing.
# NOTE(review): assumed to be the national-team leagues, per the section
# heading — confirm against leagues.txt.
excludedLeagueIds = [75, 222]

dfLeagues = pd.read_csv(
    leaguesPath,
    usecols=['leagueid', 'leaguename'],
    sep='\t',
    encoding='utf-8'
).drop_duplicates(subset='leagueid')

dfLeaguesTeams = pd.read_csv(
    leaguesTeamsPath,
    usecols=['leagueid', 'teamid', 'artificialkey'],
    sep='\t',
    encoding='utf-8'
)

# The previous filter compared `leaguename` (text) with integers and joined the
# two tests with `or`, so it was true for every row and removed nothing.
# Filter on the id column instead.
dfLeagues = dfLeagues[~dfLeagues['leagueid'].isin(excludedLeagueIds)]

dfLeaguesTeams = dfLeaguesTeams.query('artificialkey != 0')
# Drop the links of the excluded leagues too: the left merge below would
# otherwise keep those teams, only with an empty leaguename.
dfLeaguesTeams = dfLeaguesTeams[~dfLeaguesTeams['leagueid'].isin(excludedLeagueIds)]
dfLeaguesTeams = dfLeaguesTeams.merge(dfLeagues, on='leagueid', how='left')
# NOTE: link rows are not de-duplicated by teamid here.

dfLeaguesTeams.head()


Unnamed: 0,leagueid,artificialkey,teamid,leaguename
0,1,1,270,Denmark Superliga (1)
1,1,2,271,Denmark Superliga (1)
2,1,3,272,Denmark Superliga (1)
3,1,4,819,Denmark Superliga (1)
4,1,5,820,Denmark Superliga (1)


In [164]:
# Teams that have at least one league link, with the league columns attached.
# NOTE: `dfteams` (lowercase) is a different frame from `dfTeams`.
dfteams = pd.merge(dfTeams, dfLeaguesTeams, how="inner", on='teamid')
dfteams.head()

Unnamed: 0,teamid,teamname,leagueid,artificialkey,leaguename
0,1,Arsenal,13,68,England Premier League (1)
1,2,Aston Villa,13,69,England Premier League (1)
2,3,Blackburn Rovers,14,91,England Championship (2)
3,4,Bolton Wanderers,60,324,England League One (3)
4,5,Chelsea,13,70,England Premier League (1)


In [165]:
# Load the team-player links (squad membership with shirt number and position code).
teamsPlayersPath = basePath + 'teamplayerlinks.txt'

dfTeamsPlayers = pd.read_csv(teamsPlayersPath, sep='\t', encoding='utf-8')
# Keep only the columns used downstream and discard rows with missing values.
dfTeamsPlayers = dfTeamsPlayers.loc[
    :, ['teamid','playerid','jerseynumber', 'position']
].dropna()

dfTeamsPlayers.head()

Unnamed: 0,teamid,playerid,jerseynumber,position
0,114815,27,20,29
1,101146,41,8,18
2,111674,65,18,28
3,982,67,4,3
4,982,82,19,29


In [167]:
import os

# Attach player names and team names to every team-player link row.
# Left merges keep link rows even when the player or team is not found.
dfNewTeamsPlayers = dfTeamsPlayers.merge(dfPlayers,on='playerid', how='left')
# NOTE(review): names come from the full `dfTeams`; the league-joined `dfteams`
# is not used here — confirm that this is intended.
dfNewTeamsPlayers = dfNewTeamsPlayers.merge(dfTeams,on='teamid', how='left')

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('./result', exist_ok=True)
dfNewTeamsPlayers.to_csv('./result/newPlayers')
dfNewTeamsPlayers.head()

Unnamed: 0,teamid,playerid,jerseynumber,position,firstname,surname,commonname,fullname,teamname
0,114815,27,20,29,Joe,Cole,,Joe Cole,Soccer Aid
1,101146,41,8,18,Andrés,Iniesta Luján,Iniesta,Andrés Iniesta Luján,Vissel Kobe
2,111674,65,18,28,Alexander,,,,BFC Dynamo
3,982,67,4,3,Manuel,,,,Chemie Leipzig
4,982,82,19,29,Max,Kessler,,Max Kessler,Chemie Leipzig
