In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
from PIL import Image
import torch
from mlxtend.frequent_patterns import apriori, association_rules
import ast

# Seed NumPy's legacy global RNG so NumPy-based randomness is repeatable across runs.
# NOTE(review): `random` and `torch` are imported above but are not seeded here —
# seed them as well if later cells draw random numbers from them.
np.random.seed(42)

In [2]:
# Load the cleaned fashion dataset; the path is relative to the notebook's working directory.
# Later cells read its `positive_attributes` column.
fashion_df = pd.read_csv("../data/cleaned_data/fashion_dataset.csv")

In [3]:
def load_attributes_cloth(file_path):
    """Load a clothing-attribute list file into a DataFrame.

    The file is read as: line 1 = declared number of attributes, line 2 = a
    column-header line (ignored), then one attribute per line in the form
    ``<attribute name> <type code>``. The name may contain spaces; the type
    code is the last whitespace-separated token on the line.

    Parameters
    ----------
    file_path : str or path-like
        Path to the attribute list file.

    Returns
    -------
    pandas.DataFrame
        One row per attribute with columns:
        ``attr_id`` (0-based position among the data lines), ``attr_name``,
        ``attr_type`` (integer code) and ``attr_type_name`` (texture / fabric /
        shape / part / style for codes 1-5, NaN for any other code).

    Raises
    ------
    IndexError
        If the file is empty.
    ValueError
        If the count line or a line's trailing type code is not an integer.

    Warns
    -----
    UserWarning
        If the number of attribute rows parsed differs from the count declared
        on the first line (e.g. a truncated file).
    """
    import warnings

    with open(file_path, 'r') as f:
        lines = f.readlines()

    num_attributes = int(lines[0].strip())

    # lines[1] is the column-header row; attribute rows start on the third line.
    data = []
    for line in lines[2:]:
        parts = line.split()
        if not parts:
            # Blank lines (typically a trailing one) carry no attribute and
            # must not consume an id.
            continue
        attr_type = int(parts[-1])
        attr_name = ' '.join(parts[:-1])
        data.append([len(data), attr_name, attr_type])

    if len(data) != num_attributes:
        warnings.warn(
            f"{file_path}: header declares {num_attributes} attributes "
            f"but {len(data)} were parsed"
        )

    attr_df = pd.DataFrame(data, columns=['attr_id', 'attr_name', 'attr_type'])
    attr_df['attr_type_name'] = attr_df['attr_type'].map({
        1: 'texture',
        2: 'fabric',
        3: 'shape',
        4: 'part',
        5: 'style'
    })

    return attr_df
# Build the attribute lookup table (id, name, type code, type name) from the annotation file.
attr_df = load_attributes_cloth("../data/Anno_coarse/list_attr_cloth.txt")

In [11]:
# Mine association rules between clothing attributes that occur on the same item.
#
# Steps: parse each item's attribute list -> count attribute frequencies ->
# keep frequent, known attributes -> one-hot encode -> apriori -> rules.
#
# NOTE(review): ids in `positive_attributes` are matched against the 0-based
# `attr_df['attr_id']` values — confirm both sides use the same indexing base.

MIN_ATTR_COUNT = 5      # ignore attributes present on fewer items than this
MIN_SUPPORT = 0.01      # minimum fraction of items an itemset must cover
MAX_ITEMSET_LEN = 3     # largest itemset size explored by apriori
MIN_CONFIDENCE = 0.5    # minimum rule confidence
MIN_LIFT = 1.0          # keep only rules that are not negatively associated


def _parse_attr_list(raw):
    """Return the attribute list encoded in one `positive_attributes` value.

    Lists are returned as-is. Strings are parsed as Python literals; a string
    that is not a valid literal falls back to a lenient split on commas that
    keeps only all-digit tokens. Anything else (e.g. NaN) yields an empty list.
    """
    if isinstance(raw, list):
        return raw
    if not isinstance(raw, str):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # Only parse failures are tolerated here; a bare `except` would also
        # hide KeyboardInterrupt and genuine bugs.
        tokens = (token.strip() for token in raw.strip('[]').split(','))
        return [int(token) for token in tokens if token.isdigit()]
    return parsed if isinstance(parsed, list) else []


all_attributes = [_parse_attr_list(raw) for raw in fashion_df['positive_attributes']]
print(len(all_attributes))

# Frequency of every attribute id (keyed by its string form).
attr_counts = dict(Counter(
    str(attr) for attr_list in all_attributes for attr in attr_list
))

# Build the set of known ids once instead of re-casting the column per lookup.
known_attr_ids = set(attr_df['attr_id'].astype(str))
valid_attrs = {attr_id for attr_id, count in attr_counts.items()
               if count >= MIN_ATTR_COUNT and attr_id in known_attr_ids}

print(f"{len(valid_attrs)} out of {len(attr_counts)} total attrs")

# Fix the column order: iterating the set directly would give a different
# order between kernels because string hashing is randomized.
valid_attr_order = sorted(valid_attrs, key=int)

attr_id_to_name = dict(zip(attr_df['attr_id'].astype(str), attr_df['attr_name']))
attr_id_to_type = dict(zip(attr_df['attr_id'].astype(str), attr_df['attr_type_name']))

# Human-readable column label per attribute id: "<name> (<type>)".
attr_mapping = {}
for attr_id in valid_attr_order:
    attr_name = attr_id_to_name.get(attr_id, "Unknown")
    attr_type = attr_id_to_type.get(attr_id, "Unknown")
    attr_mapping[attr_id] = f"{attr_name} ({attr_type})"

# One-hot encode into a boolean array (items x attributes). This is far
# lighter than a list of per-item dicts, and bool is the dtype apriori expects.
column_of = {attr_id: col for col, attr_id in enumerate(valid_attr_order)}
presence = np.zeros((len(all_attributes), len(valid_attr_order)), dtype=bool)
for row, attr_list in enumerate(all_attributes):
    for attr in attr_list:
        col = column_of.get(str(attr))
        if col is not None:
            presence[row, col] = True

attribute_matrix = pd.DataFrame(
    presence,
    columns=[attr_mapping[attr_id] for attr_id in valid_attr_order],
)

frequent_itemsets = apriori(attribute_matrix, min_support=MIN_SUPPORT,
                            use_colnames=True, max_len=MAX_ITEMSET_LEN)
rules = association_rules(frequent_itemsets, metric="confidence",
                          min_threshold=MIN_CONFIDENCE)
rules = rules[rules['lift'] >= MIN_LIFT]
rules = rules.sort_values(['confidence', 'lift'], ascending=False)
print(f"{len(rules)} association rules found")
print("\nTop 10 association rules by confidence:")
display(rules.head(10))

289212
1000 out of 1000 total attrs




25 association rules found

Top 10 association rules by confidence:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
18,(faux leather (fabric)),"(faux (fabric), leather (fabric))",0.014726,0.015345,0.014726,1.0,65.167192,1.0,0.0145,inf,0.999372,0.959667,1.0,0.979833
0,(faux leather (fabric)),(faux (fabric)),0.014726,0.025044,0.014726,1.0,39.929863,1.0,0.014357,inf,0.989528,0.588016,1.0,0.794008
16,"(faux leather (fabric), leather (fabric))",(faux (fabric)),0.014726,0.025044,0.014726,1.0,39.929863,1.0,0.014357,inf,0.989528,0.588016,1.0,0.794008
6,(faux leather (fabric)),(leather (fabric)),0.014726,0.025359,0.014726,1.0,39.434415,1.0,0.014353,inf,0.989209,0.58072,1.0,0.79036
15,"(faux leather (fabric), faux (fabric))",(leather (fabric)),0.014726,0.025359,0.014726,1.0,39.434415,1.0,0.014353,inf,0.989209,0.58072,1.0,0.79036
24,(floral print (texture)),"(floral (texture), print (texture))",0.02614,0.032139,0.02614,1.0,31.114793,1.0,0.0253,inf,0.99384,0.813341,1.0,0.90667
10,(long sleeve (part)),(sleeve (part)),0.023014,0.061643,0.023014,1.0,16.222347,1.0,0.021596,inf,0.960461,0.373345,1.0,0.686673
14,(floral print (texture)),(floral (texture)),0.02614,0.083634,0.02614,1.0,11.956838,1.0,0.023954,inf,0.940963,0.312552,1.0,0.656276
22,"(floral print (texture), print (texture))",(floral (texture)),0.02614,0.083634,0.02614,1.0,11.956838,1.0,0.023954,inf,0.940963,0.312552,1.0,0.656276
11,(floral print (texture)),(print (texture)),0.02614,0.129203,0.02614,1.0,7.73977,1.0,0.022763,inf,0.894171,0.202318,1.0,0.601159
