In [35]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
from PIL import Image
from mlxtend.frequent_patterns import apriori, association_rules

np.random.seed(42)

In [36]:
fashion_df = pd.read_csv("../data/cleaned_data/fashion_dataset.csv")

In [37]:
def load_attributes_cloth(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    
    num_attributes = int(lines[0].strip())
    columns = lines[1].strip().split()
    
    data = []
    for i in range(2, len(lines)):
        line = lines[i].strip()
        parts = line.split()
        
        attr_type = int(parts[-1])
        attr_name = ' '.join(parts[:-1])
        
        attr_id = i - 2
        
        data.append([attr_id, attr_name, attr_type])
    
    attr_df = pd.DataFrame(data, columns=['attr_id', 'attr_name', 'attr_type'])
    attr_df['attr_type_name'] = attr_df['attr_type'].map({
        1: 'texture', 
        2: 'fabric', 
        3: 'shape', 
        4: 'part', 
        5: 'style'
    })
    
    return attr_df
attr_df = load_attributes_cloth("../data/Anno_coarse/list_attr_cloth.txt")


In [38]:
# Extract all attributes as a list
all_attributes = []
for attrs in fashion_df['positive_attributes']:
    all_attributes.extend(attrs)
print(len(all_attributes))

attr_count = Counter(all_attributes)
print(attr_count)

# DF with attribute counts
attr_count_df = pd.DataFrame({
    'attr_id': list(attr_count.keys()),
    'count': list(attr_count.values())
})

attr_count_df['attr_id'] = attr_count_df['attr_id'].astype(str)
attr_df['attr_id'] = attr_df['attr_id'].astype(str)

4749219
Counter({',': 684738, ' ': 684738, '3': 354968, '8': 340360, '1': 299875, '2': 289528, '[': 289212, ']': 289212, '6': 284199, '5': 283679, '7': 267500, '9': 250359, '4': 236285, '0': 194566})


In [39]:
# Merge with attribute names if available
attr_count_df = pd.merge(attr_count_df, 
                           attr_df[['attr_id', 'attr_name', 'attr_type_name']], 
                           on='attr_id', how='left')

# calculate percentage and sort by count
attr_count_df['percentage'] = attr_count_df['count'] / len(fashion_df) * 100
attr_count_df = attr_count_df.sort_values('count', ascending=False)

In [40]:
def gen_fashion_baskets(fashion_df, attr_df, num_baskets=2000):
    """Market Baskets - Generate baskets from fashion items based on categories"""
    item_names = {}
    for idx, row in fashion_df.iterrows():
        category = row['category_name']
        attrs = row['positive_attributes']

        if attr_df is not None and len(attrs) > 0:
            attr_id = attrs[0]
            attr_row = attr_df[attr_df['attr_id'] == attr_id]
            if not attr_row.empty and attr_row['attr_type_name'].iloc[0] in ['texture', 'fabric']:
                item_names[idx] = f"{category} ({attr_row['attr_name'].iloc[0]})"
                continue
        item_names[idx] = category

    # group by clothing type
    upper_body = fashion_df[fashion_df['category_type_name'] == 'upper-body'].index.tolist()
    lower_body = fashion_df[fashion_df['category_type_name'] == 'lower-body'].index.tolist()
    full_body = fashion_df[fashion_df['category_type_name'] == 'full-body'].index.tolist()

    # get items of each type
    for group in [upper_body, lower_body, full_body]:
        if not group:
            group = fashion_df.sample(min(100, len(fashion_df))).index.tolist()

    baskets = []
    for _ in range(num_baskets):
        basket_type = np.random.choice(['upper+lower', 'full', 'upper', 'lower', 'mixed'],
                                       p=[0.5, 0.2, 0.1, 0.1, 0.1])
        basket = []
        if basket_type == 'upper+lower':
            if upper_body and lower_body:
                basket.append(random.choice(upper_body))
                basket.append(random.choice(lower_body))
                if random.random() < 0.3 and upper_body:
                    basket.append(random.choice(upper_body))
        elif basket_type == 'full':
            if full_body:
                basket.append(random.choice(full_body))
                if random.random() < 0.7 and upper_body:
                    basket.append(random.choice(upper_body))
        elif basket_type == 'upper':
            if upper_body:
                for _ in range(random.randint(1, 3)):
                    if upper_body:
                        basket.append(random.choice(upper_body))
        elif basket_type == 'lower':
            if lower_body:
                for _ in range(random.randint(1, 3)):
                    if upper_body:
                        basket.append(random.choice(lower_body))
        else:
            all_items = fashion_df.index.tolist()
            for _ in range(random.randint(2, 4)):
                if all_items:
                    basket.append(random.choice(all_items))

        basket = list(set(basket))
        if basket:
            baskets.append(basket)

    labeled_baskets = [[item_names[item_idx] for item_idx in basket] for basket in baskets]
    print(f"{len(labeled_baskets)} created")

    item_mapping = {v: k for k,v in item_names.items()}
    return labeled_baskets, item_mapping

In [41]:
baskets, item_mapping = gen_fashion_baskets(fashion_df, attr_df)

print("\nExample baskets:")
for i, basket in enumerate(baskets[:5]):
    print(f"Basket {i+1}: {basket}")

2000 created

Example baskets:
Basket 1: ['Shorts', 'Hoodie']
Basket 2: ['Tee', 'Blouse', 'Shorts', 'Leggings']
Basket 3: ['Top', 'Blouse', 'Blouse']
Basket 4: ['Jumpsuit', 'Cardigan']
Basket 5: ['Skirt', 'Jacket', 'Tee']


In [42]:
def group_items_by_category_baskets(baskets, df):
    """Group items by their category type and style rules"""
    cat_groups = {}
    cat_to_group = {}
    
    for _, row in df.iterrows():
        cat_type = row['category_type_name']
        cat_name = row['category_name']
        
        if cat_type not in cat_groups:
            cat_groups[cat_type] = set()

        cat_groups[cat_type].add(cat_name)
        cat_to_group[cat_name] = cat_type

    grouped_baskets = []
    for basket in baskets:
        grouped_baskets.append(basket)
        grouped = [cat_to_group.get(item.split(' (')[0], item) for item in basket]
        grouped = list(set(grouped))
        if len(grouped) > 1:
            grouped_baskets.append(grouped)

    style_rules = [
        ('T-shirt', 'Jeans', 0.3),
        ('Blouse', 'Skirt', 0.2),
        ('Sweater', 'Leggings', 0.15),
        ('Button-down', 'Chinos', 0.15),
        ('Jacket', 'Jeans', 0.2)
    ]

    cat_to_items = {}
    for idx, row in df.iterrows():
        cat = row['category_name']
        if cat not in cat_to_items:
            cat_to_items[cat] = []
        cat_to_items[cat].append(idx)

    for _ in range(500):
        rule = random.choice(style_rules)
        if rule[0] not in cat_to_items or rule[1] not in cat_to_items:
            continue

        upper_items = cat_to_items[rule[0]]
        lower_items = cat_to_items[rule[1]]

        if upper_items and lower_items:
            new_basket = [random.choice(upper_items), random.choice(lower_items)]
            if random.random() < 0.5 and 'Accessories' in cat_to_items:
                new_basket.append(random.choice(cat_to_items['Accessories']))

            grouped_baskets.append(new_basket)

    return grouped_baskets
    

category_baskets = group_items_by_category_baskets(baskets, fashion_df)

In [43]:
def encode_baskets(baskets, min_support=0.01):
    """
    Encode baskets into a binary matrix format for association rule mining
    """    
    # Get all unique items
    all_items = set()
    for basket in baskets:
        all_items.update(basket)
    
    print(f"{len(all_items)} unique items across all baskets")
    
    matrix = []
    for basket in baskets:
        row = {item: 1 if item in basket else 0 for item in all_items}
        matrix.append(row)
    
    encoded_df = pd.DataFrame(matrix)
    
    # Filter out items that appear too infrequently
    min_count = int(min_support * len(baskets))
    item_counts = encoded_df.sum()
    frequent_items = item_counts[item_counts >= min_count].index.tolist()
        
    encoded_df = encoded_df[frequent_items]
    
    return encoded_df

In [44]:
encoded_df = encode_baskets(category_baskets, min_support=0.01)
print(f"Encoded matrix shape: {encoded_df.shape}")
display(encoded_df.iloc[:5, :5])

630 unique items across all baskets
Encoded matrix shape: (3732, 22)


Unnamed: 0,Romper,Jacket,Jumpsuit,Dress,Cutoffs
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [45]:
# Find frequent itemsets
frequent_itemsets = apriori(encoded_df, min_support=0.005, use_colnames=True, 
                            max_len=5,low_memory=True, verbose=1)
frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False)
print(f"Found {len(frequent_itemsets)} frequent itemsets")

# Display the top frequent itemsets
print("\nTop 10 frequent itemsets by support:")
display(frequent_itemsets.head(10))

Processing 4 combinations | Sampling itemset size 3 2
Found 68 frequent itemsets

Top 10 frequent itemsets by support:




Unnamed: 0,support,itemsets
11,0.377814,(upper-body)
15,0.289657,(lower-body)
62,0.282154,"(lower-body, upper-body)"
5,0.140407,(Tee)
8,0.124866,(Shorts)
3,0.122186,(Dress)
12,0.11254,(full-body)
61,0.105038,"(full-body, upper-body)"
6,0.097267,(Skirt)
17,0.095391,(Blouse)


In [46]:
# Create a new column with itemset size
frequent_itemsets['itemset_size'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules = rules[rules['lift'] >= 1.0]
rules = rules.sort_values(['confidence', 'lift'], ascending=False)
print(f"{len(rules)} association rules found")
print("\nTop 10 association rules by confidence:")
display(rules.head(10))

4 association rules found

Top 10 association rules by confidence:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(lower-body),(upper-body),0.289657,0.377814,0.282154,0.974098,2.578251,1.0,0.172718,24.020843,0.861753,0.732267,0.958369,0.860453
2,(full-body),(upper-body),0.11254,0.377814,0.105038,0.933333,2.470355,1.0,0.062518,9.332797,0.670678,0.272601,0.892851,0.605674
1,(upper-body),(lower-body),0.377814,0.289657,0.282154,0.746809,2.578251,1.0,0.172718,2.805556,0.983853,0.732267,0.643564,0.860453
3,"(lower-body, full-body)",(upper-body),0.016881,0.377814,0.009378,0.555556,1.470449,1.0,0.003,1.39992,0.325429,0.024339,0.285673,0.290189
