## ETL para analise de edições do fifa 22

In [134]:
import pandas as pd

basePath = './tables/utf/'


#### Importa dados de jogadores

In [135]:
newPlayers = basePath + 'players.txt'

dfPlayers = pd.read_csv(newPlayers, sep='\s+')
dfPlayers = dfPlayers[['firstnameid', 'lastnameid', 'commonnameid', 'playerid']].drop_duplicates(subset='playerid', keep='first')

#### Trato dados de jogadores cruzando com bases de nomes

In [136]:
newPlayers = basePath + 'playernames.txt'

dfPlayersName = pd.read_csv(
    newPlayers,
    sep='\t',  # Delimitador de tabulação
    encoding='utf-8',  # Codificação UTF-8, equivalente a 65001
    names=['nameid', 'commentaryid', 'name'],  # Nomes das colunas
    skiprows=1,
    engine='python'  # Usar o motor Python para maior flexibilidade
).drop_duplicates(subset='nameid', keep='first')

playersOriginalNames = dict(zip(dfPlayersName['nameid'], dfPlayersName['name']))

In [137]:
dfOriginalPlayers = dfPlayers[dfPlayers['firstnameid'] > 0].copy()

colsToSearch = [('firstnameid','firstname'),('lastnameid', 'surname'), ('commonnameid','commonname')]

for col_id, col_name in colsToSearch:
    dfOriginalPlayers[col_name] = dfOriginalPlayers[col_id].map(playersOriginalNames)

dfOriginalPlayers['fullname'] = dfOriginalPlayers['firstname'] + ' ' + dfOriginalPlayers['surname']
dfOriginalPlayers.head()

Unnamed: 0,firstnameid,lastnameid,commonnameid,playerid,firstname,surname,commonname,fullname
1,15353,6531,0,27,Joe,Cole,,Joe Cole
2,1882,14117,14116,41,Andrés,Iniesta Luján,Iniesta,Andrés Iniesta Luján
3,1178,29294,0,51,Alan,Shearer,,Alan Shearer
4,27651,16635,0,240,Roy,Keane,,Roy Keane
5,25044,28745,0,246,Paul,Scholes,,Paul Scholes


In [138]:
editPlayersNames = basePath + 'editedplayernames.txt'
dfEditedPlayerNames = pd.read_csv(
    editPlayersNames,
    sep='\t',  # Delimitador de tabulação
    encoding='utf-8',  # Codificação UTF-8, equivalente a 65001
    names=['firstname','commonname','playerjerseyname','surname','playerid'],  # Nomes das colunas
    skiprows=1,
    engine='python'  # Usar o motor Python para maior flexibilidade
)

dfEditedPlayerNames.dropna(inplace=True)

In [139]:
dfEditedPlayers = dfPlayers[dfPlayers['firstnameid'] == 0]

dfEditedPlayers= dfEditedPlayers.merge(dfEditedPlayerNames[['playerid','firstname','surname','commonname']], on='playerid', how='left')
dfEditedPlayers['fullname'] = dfEditedPlayers['firstname'] + ' ' + dfEditedPlayers['surname']
dfEditedPlayerNames.head()

Unnamed: 0,firstname,commonname,playerjerseyname,surname,playerid
0,Yago Rafael,Yago Darub,Yago Darub,Valadares Darub,22
1,Paulo Vítor,Paulo Vítor,Paulo Vítor,Leal Sousa Lima,3000
2,Miguel Angel,Miguelito,Miguelito,Terceros Acuna,4330
3,Luiz Carlos,Mirandinha,Mirandinha,Paulino de Carvalho,4331
4,Luiz Henrique,Luiz Henrique,Luiz Henrique,Bezerra dos Santos,4332


In [140]:
dfPlayers = pd.concat([dfOriginalPlayers,dfEditedPlayers])
dfPlayers.drop(columns=['firstnameid','lastnameid','commonnameid'],inplace=True)

### Inicia busca de times

In [141]:
teamsPath = basePath + 'teams.txt'

dfTeams = pd.read_csv(
    teamsPath,
    sep='\t',  # Delimitador de tabulação
    encoding='utf-8'# Usar o motor Python para maior flexibilidade
).drop_duplicates(subset='teamid')

dfTeams = dfTeams.query("cityid != 0")
dfTeams = dfTeams[['teamid', 'teamname']].dropna()

In [142]:
teamsPlayersPath = basePath + 'teamplayerlinks.txt'

dfTeamsPlayers = pd.read_csv(
    teamsPlayersPath,
    sep='\t',  # Delimitador de tabulação
    encoding='utf-8'# Usar o motor Python para maior flexibilidade
)
dfTeamsPlayers = dfTeamsPlayers[['teamid','playerid','jerseynumber', 'position']].dropna()

dfTeamsPlayers.head()

Unnamed: 0,teamid,playerid,jerseynumber,position
0,1,270390,21,29
1,1,199503,34,11
2,1,201118,17,3
3,1,209989,5,9
4,1,213051,25,28


In [143]:
dfTeamsPlayers = dfTeamsPlayers.merge(dfPlayers,on='playerid', how='left')
dfTeamsPlayers = dfTeamsPlayers.merge(dfTeams,on='teamid', how='left')
dfTeamsPlayers.head()

Unnamed: 0,teamid,playerid,jerseynumber,position,firstname,surname,commonname,fullname,teamname
0,1,270390,21,29,Marcus,Alencar,Marquinhos,Marcus Alencar,Arsenal
1,1,199503,34,11,Granit,Xhaka,,Granit Xhaka,Arsenal
2,1,201118,17,3,Cédric Ricardo,Alves Soares,Cédric,Cédric Ricardo Alves Soares,Arsenal
3,1,209989,5,9,Thomas,Partey,,Thomas Partey,Arsenal
4,1,213051,25,28,Mohamed,Elneny,,Mohamed Elneny,Arsenal


### Inicia merge com dados atualizados

In [144]:
dfUpdatedPlayers = pd.read_csv('./result/newPlayers')
dfUpdatedPlayers.drop(dfUpdatedPlayers.columns[0], axis=1, inplace=True)
dfUpdatedPlayers.head()

Unnamed: 0,teamid,playerid,jerseynumber,position,firstname,surname,commonname,fullname,teamname
0,114815,27,20,29,Joe,Cole,,Joe Cole,Soccer Aid
1,101146,41,8,18,Andrés,Iniesta Luján,Iniesta,Andrés Iniesta Luján,Vissel Kobe
2,111674,65,18,28,Alexander,,,,BFC Dynamo
3,982,67,4,3,Manuel,,,,Chemie Leipzig
4,982,82,19,29,Max,Kessler,,Max Kessler,Chemie Leipzig


In [145]:
dfIntersectPlayers = dfTeamsPlayers.merge(dfUpdatedPlayers,on='playerid', how='inner',suffixes=('_edited', '_updated'))
dfIntersectPlayers['equalFullNames'] = dfIntersectPlayers['fullname_edited'] == dfIntersectPlayers['fullname_updated']
dfIntersectPlayers.head()

MemoryError: Unable to allocate 1.28 GiB for an array with shape (4, 43087634) and data type object