In [None]:
import ast
import os
import random
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from mlxtend.frequent_patterns import apriori, association_rules
from PIL import Image

# Seed every random number generator this notebook imports (stdlib `random`,
# PyTorch and NumPy) so that Restart Kernel -> Run All is reproducible.
# The NumPy call is last on purpose: it returns None, so the cell shows no
# stray output (torch.manual_seed returns a Generator object).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
def _parse_attribute_list(raw_value):
    """Convert one `positive_attributes` cell into a list of attribute ids.

    A CSV file can only hold text, so a list column comes back from
    `pd.read_csv` as a string; iterating such a string yields single
    characters rather than attribute ids.

    NOTE(review): the on-disk format is assumed to be a stringified Python
    list such as "[1, 5, 10]" -- confirm against the code that wrote the CSV.
    Plain comma- or whitespace-separated ids are accepted as a fallback.

    Args:
        raw_value: The cell value as read from the CSV (string, container,
            scalar or missing value).

    Returns:
        A list of attribute ids; empty for missing or blank cells.
    """
    if isinstance(raw_value, (list, tuple, set, frozenset)):
        return list(raw_value)
    if not isinstance(raw_value, str):
        # Missing cell -> no attributes; any other scalar is a single id.
        return [] if pd.isna(raw_value) else [raw_value]

    text = raw_value.strip()
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. "1 5 10" or "[1 5 10]"): split into tokens.
        tokens = text.strip('[](){}').replace(',', ' ').split()
        return [
            int(token) if token.lstrip('-').isdigit() else token.strip('\'"')
            for token in tokens
        ]
    if isinstance(parsed, (list, tuple, set, frozenset)):
        return list(parsed)
    return [parsed]


fashion_df = pd.read_csv("../data/cleaned_data/fashion_dataset.csv")
if "positive_attributes" in fashion_df.columns:
    # Restore real lists so later cells iterate attribute ids, not characters.
    fashion_df = fashion_df.assign(
        positive_attributes=fashion_df["positive_attributes"].map(_parse_attribute_list)
    )

In [None]:
def load_attributes_cloth(file_path):
    """Parse a clothing-attribute list file into a DataFrame.

    Layout read by this function: line 1 holds the declared number of
    attributes, line 2 holds column headings (ignored), and every following
    non-blank line holds an attribute name (which may contain spaces)
    followed by an integer attribute type as its last whitespace-separated
    token. Blank lines are skipped.

    Args:
        file_path: Path of the attribute list file.

    Returns:
        pandas.DataFrame with the columns
        - ``attr_id``: 0-based position of the attribute among the parsed entries,
        - ``attr_name``: attribute name,
        - ``attr_type``: integer type code,
        - ``attr_type_name``: the type code mapped to texture/fabric/shape/
          part/style (NaN for any other code).

    Raises:
        IndexError: If the file is empty.
        ValueError: If the count line or a type token is not an integer.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    declared_count = int(lines[0].strip())

    records = []
    for line in lines[2:]:
        parts = line.split()
        if not parts:
            # Blank (e.g. trailing) line: nothing to parse.
            continue
        # The last token is the type code; everything before it is the name.
        records.append([len(records), ' '.join(parts[:-1]), int(parts[-1])])

    if len(records) != declared_count:
        # A mismatch usually means a truncated or hand-edited file; keep going
        # but make it visible.
        warnings.warn(
            f"{file_path}: header declares {declared_count} attributes "
            f"but {len(records)} were parsed",
            stacklevel=2,
        )

    attr_df = pd.DataFrame(records, columns=['attr_id', 'attr_name', 'attr_type'])
    attr_df['attr_type_name'] = attr_df['attr_type'].map({
        1: 'texture',
        2: 'fabric',
        3: 'shape',
        4: 'part',
        5: 'style'
    })

    return attr_df
# Attribute vocabulary (id, name, type) parsed from the coarse annotation list.
attr_df = load_attributes_cloth(file_path="../data/Anno_coarse/list_attr_cloth.txt")

In [None]:
# Collect every distinct attribute id that occurs on at least one item.
# NOTE(review): assumes each `positive_attributes` entry is an iterable of
# attribute ids; a raw string would be split into characters -- confirm the
# column's type before trusting the rules below.
item_attribute_sets = [set(attrs) for attrs in fashion_df['positive_attributes']]
all_attributes = set().union(*item_attribute_sets)
print(len(all_attributes))

# One-hot encode items x attributes as a boolean matrix. Booleans are the
# input type mlxtend's apriori prefers, and filling a preallocated array avoids
# building one dict with an entry for every attribute per row.
# Columns are ordered by (type name, value) so the layout is deterministic
# even if ids of different types are mixed.
attribute_columns = sorted(all_attributes, key=lambda attr_id: (type(attr_id).__name__, attr_id))
column_position = {attr_id: pos for pos, attr_id in enumerate(attribute_columns)}
one_hot = np.zeros((len(item_attribute_sets), len(attribute_columns)), dtype=bool)
for row_pos, item_attrs in enumerate(item_attribute_sets):
    if item_attrs:
        one_hot[row_pos, [column_position[attr_id] for attr_id in item_attrs]] = True
attribute_matrix = pd.DataFrame(one_hot, columns=attribute_columns)

# Itemsets present in at least 1% of the items.
frequent_itemsets = apriori(attribute_matrix, min_support=0.01, use_colnames=True)

# Keep rules with confidence >= 0.5 and lift >= 1.0, strongest first.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules = (
    rules
    .loc[rules['lift'] >= 1.0]
    .sort_values(['confidence', 'lift'], ascending=False)
)
print(f"{len(rules)} association rules found")
print("\nTop 10 association rules by confidence:")
display(rules.head(10))

# attr_count = Counter(all_attributes)
# print(attr_count)

# # DF with attribute counts
# attr_count_df = pd.DataFrame({
#     'attr_id': list(attr_count.keys()),
#     'count': list(attr_count.values())
# })

# attr_count_df['attr_id'] = attr_count_df['attr_id'].astype(str)
# attr_df['attr_id'] = attr_df['attr_id'].astype(str)

In [39]:
# Count, for every attribute id, how many items carry it. The table is built
# here from `fashion_df` so the cell works on a fresh kernel and gives the
# same result when re-run.
# NOTE(review): assumes each `positive_attributes` entry is an iterable of
# attribute ids (not a raw string) -- confirm the column's type.
attr_item_counts = Counter(
    attr_id
    for attrs in fashion_df['positive_attributes']
    for attr_id in set(attrs)  # count each attribute at most once per item
)
attr_counts = pd.DataFrame({
    'attr_id': list(attr_item_counts.keys()),
    'count': list(attr_item_counts.values())
})

# Compare ids as strings on both sides so an int/str dtype mismatch cannot
# silently turn the left join into all-NaN names; `attr_df` is not modified.
attr_counts['attr_id'] = attr_counts['attr_id'].astype(str)
attr_lookup = attr_df[['attr_id', 'attr_name', 'attr_type_name']].astype({'attr_id': str})

# Merge with attribute names if available
attr_count_df = pd.merge(attr_counts, attr_lookup, on='attr_id', how='left')

# calculate percentage and sort by count
attr_count_df['percentage'] = attr_count_df['count'] / len(fashion_df) * 100
attr_count_df = attr_count_df.sort_values('count', ascending=False)