In [19]:
import pandas as pd
from pathlib import Path
import re

### Import CSV files

In [20]:
# path to raw data direcotry
web_scraping_dir = Path.cwd().parent
raw_data_dir = web_scraping_dir / "data" / "raw_data"

# open csv files
products_df = pd.read_csv(raw_data_dir / "products.csv", delimiter=";")
colors_df = pd.read_csv(raw_data_dir / "colors.csv", delimiter=";")
sizes_df = pd.read_csv(raw_data_dir / "sizes.csv", delimiter=";")
categories_df = pd.read_csv(raw_data_dir / "categories.csv", delimiter=";")
boots_category_df = (
    pd.read_csv(raw_data_dir / "boots_category.csv", delimiter=";")
    .drop("Product Type:", axis=1)
)
balls_category_df = (
    pd.read_csv( raw_data_dir / "balls_category.csv", delimiter=";")
    .drop("Product Type:", axis=1)
)

### Helper functions

In [21]:
# Cleans text by removing HTML, quotes, and extra spaces.
def clean_description(data):
    data = re.sub(r"<.*?>", " ", data)
    data = re.sub(r'[,\'"]', "", data)
    data = re.sub(r"\s+", " ", data)
    return data.strip()[1:-1].strip()


# Formats column names: lowercase, underscores
def format_cols(df):
    col_names = {}
    for i, col in enumerate(df.columns):
        if i > 0:
            col_names[col] = col[:-1].lower().replace(" ", "_").replace("'", "")
        else:
            col_names[col] = col
    return col_names


# Strips and capitalizes specified string columns in a DataFrame.
def format_str(df, cols):
    for col in cols:
        df[col] = df[col].str.strip().str.title()

### Data cleaning

colors, sizes and categories are quite simple tables and don't require any transformation

In [22]:
# product table
products_df.drop_duplicates(inplace=True)
products_df["name"] = products_df["name"].str.split("\n", expand=True)[0]
products_df["price"] = products_df["price"].str.split("\n", expand=True)[1].str.strip().str[:-2].str.replace(",", ".").astype(float)
products_df["description"] = products_df["description"].apply(clean_description)

products_df.head(5)

Unnamed: 0,id,scaped_id,category_id,name,price,description,link
0,1,199578,1,Cleats adidas F50 Elite LL FG - White,270.0,Boost your pace in lightweight adidas F50 shoe...,https://www.r-gol.com/en/cleats-adidas-f50-eli...
1,2,191711,1,Cleats Nike Zoom Mercurial Vapor 16 Elite FG -...,194.99,Do you have an obsession with speed? The bigge...,https://www.r-gol.com/en/cleats-nike-zoom-merc...
2,3,191710,1,Cleats Nike Zoom Mercurial Vapor 16 Elite FG -...,182.99,Do you have an obsession with speed? The bigge...,https://www.r-gol.com/en/cleats-nike-zoom-merc...
3,4,95634,1,adidas Mundial Team Boots - Black,107.99,This is probably one of the most popular shoe ...,https://www.r-gol.com/en/adidas-mundial-team-b...
4,5,194479,1,Cleats Nike Zoom Mercurial Vapor 16 Elite FG -...,269.99,Do you have an obsession with speed? The bigge...,https://www.r-gol.com/en/cleats-nike-zoom-merc...


In [23]:
# football boots category
boots_cols = format_cols(boots_category_df)
boots_category_df.rename(columns=boots_cols, inplace=True)
boots_cols_old = ["producer", "collections", "collection", "class", "upper", "color", "type_of_binding", "manufacturers_data"]
format_str(boots_category_df, boots_cols_old)

boots_category_df.head(5)

Unnamed: 0,product_id,producer,collections,age_group,ground_type,class,upper,type_of_binding,boots_with_sock,collection,color,plays_in_these_boots,manufacturers_data,team
0,1,Adidas,Adidas F50,Men,Firm Ground (FG),Professional,Synthetic,Lack,No,Adidas Pure Victory,White,Lionel Messi,"Adidas Ag, Adi-Dassler-Strasse 1, 91074 Herzog...",
1,2,Nike,Nike Mercurial Vapor,Men,Firm Ground (FG),Professional,Synthetic,Shoelace,No,Nike Mad Ambition Pack,Sky Blue,Robert Lewandowski,"Nike Retail B.V., Po Box 6453, Colosseum 1, 12...",
2,3,Nike,Nike Mercurial Vapor,Men,Firm Ground (FG),Professional,Synthetic,Shoelace,No,Nike Shadow Pack 2024,Black,Robert Lewandowski,"Nike Retail B.V., Po Box 6453, Colosseum 1, 12...",
3,4,Adidas,Adidas Classic,Men,Artificial Grass (AG/TF),Professional,Leather,Shoelace,No,Adidas Classic,Black,,"Adidas Ag, Adi-Dassler-Strasse 1, 91074 Herzog...",
4,5,Nike,Nike Mercurial Vapor,Men,Firm Ground (FG),Professional,Synthetic,Shoelace,No,Nike Mad Energy Pack,Red,Robert Lewandowski,"Nike Retail B.V., Po Box 6453, Colosseum 1, 12...",


In [24]:
# football balls category
balls_cols = format_cols(balls_category_df)
balls_category_df.rename(columns=balls_cols, inplace=True)
balls_category_df["ball_size"] = (
    balls_category_df["ball_size"].replace("Futsal", 4).fillna(5).astype(int)
)
balls_cols_old = ["producer", "collection", "connecting_type", "manufacturers_data"]
format_str(balls_category_df, balls_cols_old)

balls_category_df.head(5)

Unnamed: 0,product_id,producer,ball_size,ground_type,class,collection,connecting_type,weigh,color,manufacturers_data,league,team
0,452,Nike,5,Firm Ground (FG),Match,Other,Thermally Welded,410-450g,Yellow,"Nike Retail B.V., Po Box 6453, Colosseum 1, 12...",,
1,453,Adidas,5,Firm Ground (FG),Match,Ekstraklasa,,410-450g,White,"Adidas Ag, Adi-Dassler-Strasse 1, 91074 Herzog...",,
2,454,Select,4,Indoor (IC/IN),Match,Select,Thermally Welded,400-440g,"White, Multicolor","Select Sport A/S, Fabriksparken 46, Dk 2600 Gl...",,
3,455,Adidas,5,Firm Ground (FG),Match,Ucl 24/25,Thermally Welded,410-450g,Multicolor,"Adidas Ag, Adi-Dassler-Strasse 1, 91074 Herzog...",,
4,456,Adidas,5,Firm Ground (FG),Match,Women'S Euro 2025,Thermally Welded,410-450g,White,"Adidas Ag, Adi-Dassler-Strasse 1, 91074 Herzog...",,


### Export cleaned data

In [25]:
cleaned_data_dir = web_scraping_dir / "data" / "cleaned_data"

products_df.to_csv(cleaned_data_dir / "products.csv", index=False, sep=";")
colors_df.to_csv(cleaned_data_dir / "colors.csv", index=False, sep=";")
sizes_df.to_csv(cleaned_data_dir / "sizes.csv", index=False, sep=";")
categories_df.to_csv(cleaned_data_dir / "categories.csv", index=False, sep=";")
boots_category_df.to_csv(cleaned_data_dir / "boots_category.csv", index=False, sep=";")
balls_category_df.to_csv(cleaned_data_dir / "balls_category.csv", index=False, sep=";")