In [58]:
from google.cloud import storage
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import json
import os
from pprint import pprint
from pathlib import Path
import sys


In [59]:
env_path = Path.cwd().parent / ".credentials"
load_dotenv(dotenv_path=env_path)

ROOT_DIR = Path.cwd().parent
sys.path.insert(0, str(ROOT_DIR))
from functions.gcs_utils import get_file_from_bucket

# gcp connection
bucket_name = os.getenv("BUCKET_NAME")
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

raw_files_dir = "api-pipeline/raw"

def get_product_table():
    products = pd.DataFrame()
    categories = ["boots", "balls"]

    for index, category in enumerate(categories, start=1):
        data = get_file_from_bucket(raw_files_dir, f"{category}.json", "json")
        df = pd.DataFrame(data["products"])
        df["category_name"] = data["info"]["category_name"]
        df["category_id"] = index
        products = pd.concat([products, df], ignore_index=True)

    return products

In [60]:
# created_at - datetime
# descitpion - remove nones
# color - remove fakes
# add category name and id
# create size lst , dict

# >> function that returns df and seizes df for appending (balls and boots)

In [61]:
orders_json = get_file_from_bucket(raw_files_dir, "orders.json", "json")
orders = pd.DataFrame(orders_json)

In [62]:
orders.head(3)

Unnamed: 0,order_date,region,quantity,created_at,customer_id,payment_method,order_id,city,total_price,order_details
0,2025-02-04T21:00:25.999530+01:00,Łódzkie,1,2025-03-18T11:06:33.363942+01:00,973,BLIK,1,Piotrków Trybunalski,107.99,"[{'product_id': 5, 'quantity': 1, 'old_price':..."
1,2025-01-22T12:39:14.019088+01:00,Wielkopolskie,2,2025-03-18T11:07:21.935633+01:00,485,Digital Wallet,2,Kalisz,367.98,"[{'product_id': 159, 'quantity': 2, 'old_price..."
2,2025-03-01T04:44:34.365505+01:00,Podkarpackie,1,2025-03-18T11:07:21.935633+01:00,181,Bank Transfers,3,Rzeszów,260.0,"[{'product_id': 15, 'quantity': 1, 'old_price'..."


In [63]:
orders["order_date"] = pd.to_datetime(orders["order_date"])
orders["created_at"] = pd.to_datetime(orders["created_at"])

In [64]:
orders.head(3)

Unnamed: 0,order_date,region,quantity,created_at,customer_id,payment_method,order_id,city,total_price,order_details
0,2025-02-04 21:00:25.999530+01:00,Łódzkie,1,2025-03-18 11:06:33.363942+01:00,973,BLIK,1,Piotrków Trybunalski,107.99,"[{'product_id': 5, 'quantity': 1, 'old_price':..."
1,2025-01-22 12:39:14.019088+01:00,Wielkopolskie,2,2025-03-18 11:07:21.935633+01:00,485,Digital Wallet,2,Kalisz,367.98,"[{'product_id': 159, 'quantity': 2, 'old_price..."
2,2025-03-01 04:44:34.365505+01:00,Podkarpackie,1,2025-03-18 11:07:21.935633+01:00,181,Bank Transfers,3,Rzeszów,260.0,"[{'product_id': 15, 'quantity': 1, 'old_price'..."


In [65]:
sizes_lst = []

# created_at - datetime
balls["created_at"] = pd.to_datetime(balls["created_at"])

# description - remove nones
balls["description"] = balls["description"].apply(lambda value: value if value else np.nan)

# color - remove outliers
balls["product_color"] = np.where(balls["product_color"].str.split().str.len() == 1, balls["product_color"], np.nan)



# append sizes to lst with product_id
for _, row in balls[["product_id", "sizes"]].iterrows():
    if row["sizes"]:
        for size in row["sizes"]:
            sizes_lst.append({"product_id": row["product_id"], **size})

filtered_balls = balls.drop(columns=["sizes"])
filtered_balls.head()

Unnamed: 0,product_id,created_at,price,description,num_votes,title,old_price,avg_vote_rate,product_color,related_products,labels,features,category_name,category_id
0,563,2025-03-21 15:59:53.424127+01:00,126.99,"Available in our store, the Brillant Super TB ...",,Ball Select Brillant Super Tb Fifa V25 Size 5 ...,,,Yellow,[526],[New],"{'product_type': 'Balls', 'producer': 'Select'...",balls,1
1,564,2025-03-24 08:05:59.092066+01:00,105.99,Reflecting the strength and dynamism of compet...,1.0,Ball Adidas Uwcl Pro 24/25 Size 5 - Multicolor,150.0,5.0,Multicolor,[538],[Last Chance],"{'product_type': 'Balls', 'producer': 'Adidas'...",balls,1
2,504,2025-02-18 05:57:33.190330+01:00,97.99,Adidas Île-De-Foot 24 Pro ball in the colors o...,3.0,Ball Adidas Île-De-Foot 24 Pro Size 5 - Blue,150.0,5.0,Blue,[],[Sale],"{'product_type': 'Balls', 'producer': 'Adidas'...",balls,1
3,538,2025-02-27 15:34:04.526927+01:00,150.0,,,Ball Adidas Uwcl Pro 24/25 Size 5 - White,,,White,[],[New],"{'product_type': 'Balls', 'producer': 'Adidas'...",balls,1
4,539,2025-02-27 15:34:11.106168+01:00,128.99,,,Ball Adidas Conext 25 Pro Size 5 - Multicolor,150.0,,Multicolor,[],[New],"{'product_type': 'Balls', 'producer': 'Adidas'...",balls,1


In [66]:
sizes_lst

[{'product_id': 563, 'size': '5', 'in_stock': True},
 {'product_id': 564, 'size': '5', 'in_stock': True},
 {'product_id': 504, 'size': '5', 'in_stock': True},
 {'product_id': 538, 'size': '5', 'in_stock': True},
 {'product_id': 539, 'size': '5', 'in_stock': True},
 {'product_id': 475, 'size': '5', 'in_stock': True},
 {'product_id': 476, 'size': '5', 'in_stock': True},
 {'product_id': 479, 'size': '5', 'in_stock': True},
 {'product_id': 481, 'size': '5', 'in_stock': True},
 {'product_id': 482, 'size': '5', 'in_stock': True},
 {'product_id': 483, 'size': '5', 'in_stock': True},
 {'product_id': 485, 'size': '5', 'in_stock': True},
 {'product_id': 486, 'size': '5', 'in_stock': True},
 {'product_id': 487, 'size': '5', 'in_stock': True},
 {'product_id': 488, 'size': '5', 'in_stock': True},
 {'product_id': 489, 'size': '5', 'in_stock': True},
 {'product_id': 491, 'size': '5', 'in_stock': True},
 {'product_id': 492, 'size': '5', 'in_stock': True},
 {'product_id': 493, 'size': '5', 'in_stock': 

In [67]:
boots_json = open_file_from_gcs("api-pipeline/raw", "boots.json")
boots = pd.DataFrame(boots_json["products"])

In [68]:
boots.head()

Unnamed: 0,product_id,created_at,price,description,num_votes,title,old_price,avg_vote_rate,product_color,related_products,labels,sizes,features
0,555,2025-03-18T18:43:30.648269+01:00,304.99,"Mizuno is a brand with a long tradition, uniqu...",,Cleats Mizuno Morelia Neo Iv Beta Japan Mix - ...,379.99,,Silver,"[144, 251, 262, 362, 367, 398]",[],"[{'size': '40.5', 'in_stock': True}, {'size': ...","{'product_type': 'Football Boots', 'producer':..."
1,556,2025-03-18T18:43:30.648269+01:00,279.99,"Achieve your best speed with these shoes, care...",,Cleats Mizuno Alpha Ii Japan Fg - Silver,348.99,,Silver,[514],[New],"[{'size': '40.0', 'in_stock': True}, {'size': ...","{'product_type': 'Football Boots', 'producer':..."
2,557,2025-03-18T18:43:30.648269+01:00,296.99,,,Cleats Mizuno Morelia Dna Japan Fg - Silver,370.99,,Silver,[],[New],"[{'size': '40.0', 'in_stock': True}, {'size': ...","{'product_type': 'Football Boots', 'producer':..."
3,3,2025-02-18T05:03:19.734919+01:00,182.99,Do you have an obsession with speed? The bigge...,41.0,Cleats Nike Zoom Mercurial Vapor 16 Elite Fg -...,269.99,5.0,Black,"[52, 6, 107, 171, 112, 2]",[],"[{'size': '38', 'in_stock': False}, {'size': '...","{'product_type': 'Football Boots', 'producer':..."
4,4,2025-02-18T05:03:27.424585+01:00,159.99,Step into a new day and show the world what yo...,12.0,Cleats Nike Phantom Luna Ii Elite Fg - Black,279.99,4.9,Black,"[58, 26, 183, 119, 322]",[Sale],"[{'size': '36', 'in_stock': False}, {'size': '...","{'product_type': 'Football Boots', 'producer':..."


In [69]:
boots["product_color"].unique()

array(['Silver', 'Black', 'Purple', 'White',
       'Cleats Adidas Predator Elite Ft Fg Yots', 'Red', 'Gray', 'Orange',
       'Multicolor', 'Sky Blue',
       'Cleats New Balance Tekela V4+ Pro Low Fg',
       'Cleats New Balance Tekela V4 Magia Low Sg', 'Pink', 'Navy Blue',
       'Lime', 'Beige', 'Brown', 'Pro Player Edition',
       'Cleats Puma Future 7 Ultimate Fg/Ag Conquer Your Mountain',
       'Blue', 'Yellow', 'Cleats Mizuno Morelia Neo Iv Beta Sr4 Japan',
       'Cleats Adidas Predator Elite Ll Fg', 'Claret',
       'Cleats Puma King Ultimate Launch Edition Fg/Ag',
       'Cleats Puma King Ultimate Rush Fg/Ag',
       'Cleats Mizuno Alpha Japan Fg', 'Cleats Puma Future Ultimate Mxsg',
       'Cleats Adidas Predator Accuracy+ Sg',
       'Cleats Adidas Predator Accuracy+ Ag',
       'Cleats Adidas Predator Accuracy.1 Sg',
       'Cleats Adidas Predator Accuracy.1 L Sg',
       'Cleats Adidas X Crazyfast.1 Ll Fg',
       'Cleats Adidas X Crazyfast.1 Ll Sg',
       'Cleats Adi