# Raw Processing

## Preparation


In [15]:
import pandas as pd
from random import choice

# Show every column when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None

## Data Information & Description

In [22]:
# Pick one year from 2000-2009 at random and preview two of its rows.
year = choice(range(2000, 2010))
source_dir = f'../data/raw/transfers_by_day/y={year}/'
df = pd.read_parquet(source_dir, engine='pyarrow')
# Record the year as a string column on the loaded frame.
df['y'] = str(year)
df.sample(n=2)

Unnamed: 0,portrait_url,name,player_id,position,age,nationalities,left_club_url,left_club_name,left_club_name_alt,left_club_league_country_url,left_club_league_country_name,left_club_league_name,left_club_league_name_alt,join_club_url,join_club_name,join_club_name_alt,join_club_league_country_url,join_club_league_country_name,join_club_league_name,join_club_league_name_alt,market_value,fee,loan_fee,transfer_url,transfer_date,ingested_at,m,d,y
17541,https://img.a.transfermarkt.technology/portrai...,Heitor,26023,Right Winger,28,"[{""name"": ""Portugal"", ""url"": ""https://tmssl.ak...",https://tmssl.akamaized.net/images/wappen/tiny...,Trofense,CD Trofense,https://tmssl.akamaized.net/images/flagge/tiny...,Portugal,Liga Sabseg,Liga Sabseg,https://tmssl.akamaized.net/images/wappen/tiny...,Famalicão,FC Famalicão,https://tmssl.akamaized.net/images/flagge/very...,Portugal,,,€100k,?,,/jumplist/transfers/spieler/26023/transfer_id/...,2007-01-01,2023-03-02 16:04:11,1,1,2007
27243,https://img.a.transfermarkt.technology/portrai...,Daniel Gramann,25784,Centre-Back,20,"[{""name"": ""Austria"", ""url"": ""https://tmssl.aka...",https://tmssl.akamaized.net/images/wappen/tiny...,TSV Hartberg,TSV Hartberg,https://tmssl.akamaized.net/images/flagge/tiny...,Austria,Regional League Central,Regional League Central,https://tmssl.akamaized.net/images/wappen/tiny...,SCR Altach,SCR Altach,https://tmssl.akamaized.net/images/flagge/very...,Austria,Bundesliga,Bundesliga,€200k,€8k,,/jumplist/transfers/spieler/25784/transfer_id/...,2007-07-27,2023-03-03 11:54:02,7,27,2007


In [21]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10254 entries, 0 to 10253
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   portrait_url                   10254 non-null  object  
 1   name                           10254 non-null  object  
 2   player_id                      10254 non-null  object  
 3   position                       10254 non-null  object  
 4   age                            10254 non-null  object  
 5   nationalities                  10254 non-null  object  
 6   left_club_url                  10254 non-null  object  
 7   left_club_name                 10254 non-null  object  
 8   left_club_name_alt             10254 non-null  object  
 9   left_club_league_country_url   10254 non-null  object  
 10  left_club_league_country_name  10254 non-null  object  
 11  left_club_league_name          10254 non-null  object  
 12  left_club_league_name_alt      1

## Create Clean Raw Data

This step builds the clean raw dataset: it removes duplicate records and merges the split parquet files for each transfer date (y/m/d).

In [34]:
# Build the clean raw dataset year by year: load all parquet files of a year,
# drop repeated (player_id, transfer_date) rows, and write the remaining rows
# to a new dataset partitioned by y/m/d. Row tallies are accumulated across
# years and printed at the end.
total_rows = 0
total_non_duplicated_rows = 0
total_duplicated_rows = 0

for year in range(2000, 2004):
    df = pd.read_parquet(f'../data/raw/transfers_by_day/y={year}/', engine='pyarrow')
    # add the year column explicitly so it can be used as a partition column below
    df['y'] = str(year)

    # True for every row that repeats an earlier (player_id, transfer_date) pair;
    # the first occurrence of each pair is kept.
    is_duplicate = df.duplicated(subset=['player_id', 'transfer_date'])

    # Count from the boolean mask directly. Integer-indexing value_counts()
    # is unreliable here: its index holds False/True and its order follows
    # the counts, so position 0 is not guaranteed to be the non-duplicates.
    n_duplicated = int(is_duplicate.sum())
    total_rows += len(df)
    total_duplicated_rows += n_duplicated
    total_non_duplicated_rows += len(df) - n_duplicated

    # write only the de-duplicated rows to a new dataset
    df[~is_duplicate].to_parquet('../data/raw/transfers_by_day_clean2/', partition_cols=['y', 'm', 'd'])

print(f"total_rows: {total_rows}")
print(f"total_non_duplicated_rows: {total_non_duplicated_rows}")
print(f"total_duplicated_rows: {total_duplicated_rows}")
    

total_rows: 165
total_non_duplicated_rows: 165
total_duplicated_rows: 0
