In [1]:
import os
import sys
import csv
import numpy as np
import pandas as pd

In [2]:
## Load the image label file into a DataFrame
product_meta = pd.read_csv(filepath_or_buffer='product_meta.csv')

In [3]:
## Load the class definitions, then collect the 'label' column into a plain Python list
class_list = pd.read_csv(filepath_or_buffer='classes.csv')
class_labels = class_list['label'].tolist()

In [4]:
## Keep only the rows whose label appears in the class file
## (anything else isn't relevant to the clothing domain)
## NOTE(review): product_meta2 is not referenced by any later cell visible here -- confirm it is still needed
known_label_mask = product_meta['label'].isin(class_labels)
product_meta2 = product_meta.loc[known_label_mask]

In [7]:
## Attach the class definition to every product row via the shared 'label' column.
## The merge uses the default inner join, so rows whose label has no match in class_list are dropped.
product_merged = product_meta.merge(class_list, on='label')

In [8]:
## Order the rows by product_id, then tier_domain, to assist in quicker processing later
sort_keys = ['product_id', 'tier_domain']
products_sorted = product_merged.sort_values(sort_keys)

In [9]:
## Rank the rows inside every (product_id, tier_domain) group by confidence, highest first.
## method='first' breaks ties by row order, so each group has exactly one row ranked 1.
confidence_by_group = products_sorted.groupby(['product_id', 'tier_domain'])['confidence']
products_sorted['tier_domain_rank'] = confidence_by_group.rank(method='first', ascending=False)

## Keep only the top-ranked row of each group
is_top_ranked = products_sorted['tier_domain_rank'] == 1
products_reduced = products_sorted.loc[is_top_ranked]

In [12]:
## Narrow the frame down to the columns the pivot step uses
pivot_input_columns = ['label', 'product_id', 'tier_domain', 'domain', 'domain_restrictions']
product_pre_pivot = products_reduced[pivot_input_columns]

In [15]:
## Pivot to one row per product_id: every tier_domain value becomes a set of columns
## carrying that tier's label, domain and domain_restrictions
pivot_values = ['label', 'domain', 'domain_restrictions']
product_pivoted = product_pre_pivot.pivot(
    index='product_id',
    columns='tier_domain',
    values=pivot_values,
)

In [19]:
## Write the pivoted table to disk
product_pivoted.to_csv(path_or_buf='pivtoted_out.csv')

In [None]:
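## Manual step: edit the header rows of 'pivtoted_out.csv' to match the sample before running the next cell. It's quicker than doing it programmatically.
## The cells below expect an 'Image' column plus 'ObjectN', 'ObjectN_domain' and 'ObjectN_restriction' columns for N = 0..6.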

In [21]:
## Reload the pivot file, index it by the 'Image' column and replace every missing value with 0
product_meta = (
    pd.read_csv('pivtoted_out.csv')
    .set_index('Image')
    .fillna(0)
)

In [22]:
## Classification logic: ensure that all the labels fall into the same super domain.
## An object's label is kept only when the preceding object's domain column equals its own
## domain or is 0 (the missing-value placeholder); otherwise the label is replaced with 0.
## NOTE(review): each domain column is then overwritten with the preceding one, so every tier
## ends up being compared against Object0's domain -- confirm this is the intended rule.
for tier in range(1, 7):
    label_col = 'Object{}'.format(tier)
    domain_col = 'Object{}_domain'.format(tier)
    prev_domain = product_meta['Object{}_domain'.format(tier - 1)]

    keep_label = (prev_domain == product_meta[domain_col]) | (prev_domain == 0)
    product_meta[label_col] = np.where(keep_label, product_meta[label_col], 0)

    # Carry the preceding object's domain forward for the next iteration's comparison
    product_meta[domain_col] = prev_domain

In [27]:
## Clean up the dataframe by dropping the per-object domain columns (Object0..Object6)
domain_columns = ['Object{}_domain'.format(i) for i in range(7)]
product_meta.drop(columns=domain_columns, inplace=True)

In [28]:
## Classification logic: ensure that all the labels fall into their respective sub domains.
## An object's label is kept only when its restriction column equals the preceding object's
## label or is 0 (the missing-value placeholder); otherwise the label is replaced with 0.
for tier in range(1, 7):
    label_col = 'Object{}'.format(tier)
    restriction = product_meta[label_col + '_restriction']
    # Read after the previous iteration, so a label already set to 0 there is what gets compared
    preceding_label = product_meta['Object{}'.format(tier - 1)]

    allowed = (restriction == preceding_label) | (restriction == 0)
    product_meta[label_col] = np.where(allowed, product_meta[label_col], 0)

In [29]:
## Clean up the dataframe by dropping the per-object restriction columns (Object0..Object6)
restriction_columns = ['Object{}_restriction'.format(i) for i in range(7)]
product_meta.drop(columns=restriction_columns, inplace=True)

In [32]:
## Write the classified table to disk
product_meta.to_csv(path_or_buf='product_final.csv')

In [None]:
# Manual step: open 'product_final.csv' in Excel and remove all the 0's from the Object columns only. Be sure not to replace the 0's in the Image ID column.