In [1]:
import os
import pandas as pd

# ---- Adjust this if your base path ever changes ----
BASE_DIR = r"C:\Users\hyq94\PycharmProjects\ReviewsCrawler"
GENDER_FOLDERS = ["UNIQLO_female", "UNIQLO_male"]

# Column layouts are fixed up front so the summary DataFrames keep their
# schema (and the column selections below keep working) even when no folder
# or CSV file is found.
CATEGORY_COLUMNS = [
    "gender",
    "category",
    "num_product_ids",
    "total_reviews_in_category",
]
CSV_COLUMNS = ["gender", "category", "file_name", "num_reviews_in_file"]


def _sorted_entries(path):
    """Return the directory entries of *path*, sorted by entry name."""
    # os.scandir yields entries in arbitrary order; sorting makes the
    # traversal (and therefore the row order of the per-CSV table)
    # reproducible. The context manager closes the directory handle promptly.
    with os.scandir(path) as entries:
        return sorted(entries, key=lambda entry: entry.name)


def _count_reviews(csv_path):
    """Return the number of data rows in *csv_path*, or 0 if unreadable."""
    try:
        return len(pd.read_csv(csv_path))
    except Exception as e:
        # Deliberately broad: one empty or malformed file must not abort the
        # whole scan. The file is reported and counted as having 0 reviews.
        print(f"Error reading {csv_path}: {e}")
        return 0


def collect_review_stats(base_dir, gender_folders):
    """Scan ``base_dir/<gender>/<category>/*.csv`` and count review rows.

    Every sub-folder of a gender folder is treated as a product category, and
    every ``.csv`` file directly inside a category folder is assumed to hold
    the reviews of one product_id (one review per data row).

    Args:
        base_dir: Root directory containing the gender folders.
        gender_folders: Names of the gender folders to scan. A folder that
            does not exist is reported with a warning and skipped.

    Returns:
        A ``(csv_rows_info, category_info)`` tuple.
        ``csv_rows_info`` is a list with one dict per CSV file (keys as in
        ``CSV_COLUMNS``). ``category_info`` maps ``(gender, category)`` to a
        dict with the keys in ``CATEGORY_COLUMNS``; categories without any
        CSV file are included with zero counts.
    """
    csv_rows_info = []
    category_info = {}

    for gender in gender_folders:
        gender_path = os.path.join(base_dir, gender)
        if not os.path.isdir(gender_path):
            print(f"Warning: {gender_path} does not exist, skipping.")
            continue

        # NOTE(review): every sub-folder counts as a category, including
        # names such as "UNIQLO_femaledata". If such folders hold merged
        # copies of the per-category CSVs, the totals double-count reviews
        # -- confirm against the crawler's output layout.
        for entry in _sorted_entries(gender_path):
            if not entry.is_dir():
                continue  # skip non-folders

            category = entry.name  # folder name is the category
            record = category_info.setdefault(
                (gender, category),
                {
                    "gender": gender,
                    "category": category,
                    "num_product_ids": 0,
                    "total_reviews_in_category": 0,
                },
            )

            for csv_entry in _sorted_entries(entry.path):
                is_csv = csv_entry.name.lower().endswith(".csv")
                if not (csv_entry.is_file() and is_csv):
                    continue

                n_rows = _count_reviews(csv_entry.path)

                # Each CSV file corresponds to one product_id; the id itself
                # could be derived from the file name if ever needed.
                record["num_product_ids"] += 1
                record["total_reviews_in_category"] += n_rows

                csv_rows_info.append({
                    "gender": gender,
                    "category": category,
                    "file_name": csv_entry.name,
                    "num_reviews_in_file": n_rows,
                })

    return csv_rows_info, category_info


# Detailed info per CSV (product_id file) and info aggregated per category
csv_rows_info, category_info = collect_review_stats(BASE_DIR, GENDER_FOLDERS)

# ---- Convert to DataFrames for nicer display / further analysis ----
category_df = pd.DataFrame(
    list(category_info.values()), columns=CATEGORY_COLUMNS
)
csv_df = pd.DataFrame(csv_rows_info, columns=CSV_COLUMNS)

# ---- Overall stats ----
# Summed from the plain records so the totals are ordinary ints even when
# the DataFrames are empty.
total_categories = len(category_df)
total_reviews = sum(
    rec["total_reviews_in_category"] for rec in category_info.values()
)
total_product_ids = sum(
    rec["num_product_ids"] for rec in category_info.values()
)

print("\n===== Overall Summary =====")
print(f"Total product categories (across female & male): {total_categories}")
print(f"Total product_ids (CSV files): {total_product_ids}")
print(f"Total number of reviews (rows): {total_reviews}")

print("\n===== Categories (folder names) =====")
print(category_df[["gender", "category"]].sort_values(["gender", "category"]).to_string(index=False))

print("\n===== Per-category summary =====")
# columns: gender, category, num_product_ids, total_reviews_in_category
print(category_df.sort_values(["gender", "category"]).to_string(index=False))

print("\n===== Per-CSV (product_id) summary (first 20 rows) =====")
print(csv_df.head(20).to_string(index=False))


===== Overall Summary =====
Total product categories (across female & male): 18
Total product_ids (CSV files): 660
Total number of reviews (rows): 55928

===== Categories (folder names) =====
       gender               category
UNIQLO_female  UNIQLO_T-shirt_female
UNIQLO_female   UNIQLO_blouse_female
UNIQLO_female     UNIQLO_coat_female
UNIQLO_female    UNIQLO_dress_female
UNIQLO_female      UNIQLO_femaledata
UNIQLO_female    UNIQLO_jeans_female
UNIQLO_female   UNIQLO_jersey_female
UNIQLO_female UNIQLO_knitwear_female
UNIQLO_female    UNIQLO_pants_female
UNIQLO_female UNIQLO_roomwear_female
UNIQLO_female    UNIQLO_skirt_female
  UNIQLO_male    UNIQLO_T-shirt_male
  UNIQLO_male     UNIQLO_blouse_male
  UNIQLO_male       UNIQLO_coat_male
  UNIQLO_male      UNIQLO_jeans_male
  UNIQLO_male     UNIQLO_jersey_male
  UNIQLO_male        UNIQLO_maledata
  UNIQLO_male   UNIQLO_roomwear_male

===== Per-category summary =====
       gender               category  num_product_ids  total_reviews_i