# Preprocess: Create FashionGen metadata for Mmt.

# Define paths and load the raw CSV metadata files from Fashion-BERT and Kaleido-BERT.

In [39]:
import pandas as pd
import tensorflow as tf

# Raw test-pair files. i2t = image-to-text, t2i = text-to-image.
i2t_path = '/bigstore/mmt/raw_data/fashion_gen/fashion_gen_i2t_test_pairs.csv'
t2i_path = '/bigstore/mmt/raw_data/fashion_gen/fashion_gen_t2i_test_pairs.csv'

# Destinations of the processed metadata files.
t2i_output_path = '/bigstore/mmt/fashion_gen/metadata/fashion_bert_t2i_test.csv'
i2t_output_path = '/bigstore/mmt/fashion_gen/metadata/fashion_bert_i2t_test.csv'

# The id columns are read as strings: they are concatenated and compared as
# text further down, not used as numbers.
dtype = dict(image_prod_id=str, prod_img_id=str, text_prod_id=str)

# Load both pair tables through GFile.
with tf.io.gfile.GFile(i2t_path, 'r') as csv_file:
  i2t_df = pd.read_csv(csv_file, dtype=dtype)

with tf.io.gfile.GFile(t2i_path, 'r') as csv_file:
  t2i_df = pd.read_csv(csv_file, dtype=dtype)

In [40]:
# Preview the raw image-to-text pairs loaded from i2t_path.
i2t_df

Unnamed: 0,text_prod_id,image_prod_id,prod_img_id,desc
0,1877243,1903663,1,"""Wide-leg jeans in navy. High-rise. Raw edge a..."
1,1168593,1903663,1,"Long sleeve shirt in tones of red, blue, yello..."
2,1222813,1903663,1,Short sleeve t-shirt in black. Ribbed crewneck...
3,2294297,1903663,1,Long sleeve pressed wool coat in black. Raw ed...
4,2417117,1903663,1,"""Skinny-fit jeans in 'dirty' blue. Distressing..."
...,...,...,...,...
100984,1743413,1797193,4,"Short sleeve virgin wool, cotton & mohair-blen..."
100985,2218867,1797193,4,Long sleeve wool sweater in red. Rib knit crew...
100986,1797193,1797193,4,Long sleeve jacket in 'darkest' black. Stand c...
100987,1903663,1903663,1,Short sleeve cotton jersey 'coachella' tie-dye...


In [41]:
# Preview the raw text-to-image pairs loaded from t2i_path.
t2i_df

Unnamed: 0,text_prod_id,image_prod_id,prod_img_id,desc
0,875903,2104647,0,"High-waist stretch pencil skirt in red, white ..."
1,1198303,2104647,0,Long sleeve quilted down jacket in black. Two-...
2,2281297,2104647,0,Long sleeve rib knit virgin wool sweater in bl...
3,2293607,2104647,0,Long sleeve water-repellant coat in dark navy....
4,1597583,2104647,0,Patent leather heel in black. Pointed toe. Cov...
...,...,...,...,...
100984,110763,2104647,0,"Long sleeve flannel shirt in red, navy, and wh..."
100985,1772593,2104647,0,Short sleeve 'mistral' silk-blend crepe t-shir...
100986,91397,2104647,0,Short sleeve relaxed-fit t-shirt colorblocked ...
100987,2328747,2104647,0,Pair of stud earrings in antiqued gold-tone br...


In [42]:
def add_columns(df):
  """Returns a copy of `df` with retrieval metadata columns added.

  New columns: image_id, text_index, image_index, and gt.

  A product can have multiple images (files), taken from different angles of
  view. Thus `image_prod_id` is the main product id and `prod_img_id` is the
  id of the angle. One product has one text description.

  image_id: `image_prod_id` and `prod_img_id` joined by '_'; identifies one
    image file of a product.
  text_index: integer code, unique per distinct `text_prod_id`.
  image_index: integer code, unique per distinct `image_id`.
  gt: 1 if the row is a ground-truth pair (image and text share the same
    product id), otherwise 0.

  The integer codes are pandas category codes, so they are only meaningful
  within the dataframe they were computed from.

  Args:
    df: a pd.DataFrame with string columns `image_prod_id`, `prod_img_id` and
      `text_prod_id`. Each row is an image-text pair. It is not modified.

  Returns:
    A new pd.DataFrame holding the columns of `df` plus the four new columns.
  """
  # A product can have several image_id's (one per image file).
  image_id = df['image_prod_id'] + '_' + df['prod_img_id']
  # `assign` builds a new frame, so the caller's dataframe is left untouched.
  return df.assign(
      image_id=image_id,
      # Gives each text_prod_id a unique index.
      text_index=df['text_prod_id'].astype('category').cat.codes,
      # Gives each image_id (an image file) a unique index.
      image_index=image_id.astype('category').cat.codes,
      # Image and text with the same product id form a ground-truth pair.
      gt=(df['image_prod_id'] == df['text_prod_id']).astype(int),
  )

# i2t: add columns


In [43]:
i2t_df = add_columns(i2t_df)

# Collect the ground-truth pairs (gt == 1), keeping only the two index
# columns; image_index is renamed so it can later be joined back as the
# ground-truth image of each text.
gt_df = (
    i2t_df.loc[i2t_df['gt'] == 1, ['text_index', 'image_index']]
    .rename(columns={'image_index': 'gt_image_index'})
)
gt_df

Unnamed: 0,text_index,gt_image_index
101,3924,535
200,4858,650
302,6056,799
402,6057,800
504,6658,876
...,...,...
100682,2792,384
100684,2967,409
100885,3033,422
100986,3359,471


In [44]:
# Give each text_index its ground-truth image_index, if one exists.
# FashionGen queries do not share the same retrieval pool (the text pool for
# i2t), so some text_index values have no ground-truth row in gt_df.
# validate='many_to_one' makes the merge raise if a text_index appears more
# than once in gt_df, instead of silently duplicating rows of i2t_df.
i2t_df = i2t_df.merge(
    gt_df, how='left', on='text_index', validate='many_to_one')

# Missing ground truths become -1. Only gt_image_index is filled: a frame-wide
# fillna(-1) would also overwrite missing values in unrelated columns such as
# `desc`. The NaNs from the left merge make the column float, so cast it back
# to int afterwards.
i2t_df['gt_image_index'] = i2t_df['gt_image_index'].fillna(-1).astype(int)

with tf.io.gfile.GFile(i2t_output_path, 'w') as f:
  i2t_df.to_csv(f, index=False)

i2t_df

Unnamed: 0,text_prod_id,image_prod_id,prod_img_id,desc,image_id,text_index,image_index,gt,gt_image_index
0,1877243,1903663,1,"""Wide-leg jeans in navy. High-rise. Raw edge a...",1903663_1,3784,528,0,520
1,1168593,1903663,1,"Long sleeve shirt in tones of red, blue, yello...",1903663_1,776,528,0,-1
2,1222813,1903663,1,Short sleeve t-shirt in black. Ribbed crewneck...,1903663_1,991,528,0,-1
3,2294297,1903663,1,Long sleeve pressed wool coat in black. Raw ed...,1903663_1,5444,528,0,-1
4,2417117,1903663,1,"""Skinny-fit jeans in 'dirty' blue. Distressing...",1903663_1,5902,528,0,-1
...,...,...,...,...,...,...,...,...,...
100984,1743413,1797193,4,"Short sleeve virgin wool, cotton & mohair-blen...",1797193_4,3139,471,0,-1
100985,2218867,1797193,4,Long sleeve wool sweater in red. Rib knit crew...,1797193_4,5125,471,0,-1
100986,1797193,1797193,4,Long sleeve jacket in 'darkest' black. Stand c...,1797193_4,3359,471,1,471
100987,1903663,1903663,1,Short sleeve cotton jersey 'coachella' tie-dye...,1903663_1,3889,528,1,528


# t2i: add columns

In [45]:
t2i_df = add_columns(t2i_df)

# Collect the ground-truth pairs (gt == 1), keeping only the two index
# columns; image_index is renamed so it can later be joined back as the
# ground-truth image of each text.
gt_df = (
    t2i_df.loc[t2i_df['gt'] == 1, ['text_index', 'image_index']]
    .rename(columns={'image_index': 'gt_image_index'})
)

gt_df

Unnamed: 0,text_index,gt_image_index
183,724,5466
296,758,5772
362,790,5992
465,824,6259
579,849,6452
...,...,...
100711,574,4230
100746,591,4405
100788,598,4441
100808,609,4511


In [46]:
# Attach to every row the ground-truth image_index of its text_index.
# Because this is text-to-image, every text is expected to have a
# ground-truth image, so no fill value for missing entries is applied.
t2i_df = pd.merge(t2i_df, gt_df, on='text_index', how='left')

with tf.io.gfile.GFile(t2i_output_path, 'w') as output_file:
  t2i_df.to_csv(output_file, index=False)

t2i_df

Unnamed: 0,text_prod_id,image_prod_id,prod_img_id,desc,image_id,text_index,image_index,gt,gt_image_index
0,875903,2104647,0,"High-waist stretch pencil skirt in red, white ...",2104647_0,932,4701,0,7115
1,1198303,2104647,0,Long sleeve quilted down jacket in black. Two-...,2104647_0,120,4701,0,900
2,2281297,2104647,0,Long sleeve rib knit virgin wool sweater in bl...,2104647_0,713,4701,0,5383
3,2293607,2104647,0,Long sleeve water-repellant coat in dark navy....,2104647_0,720,4701,0,5439
4,1597583,2104647,0,Patent leather heel in black. Pointed toe. Cov...,2104647_0,350,4701,0,2516
...,...,...,...,...,...,...,...,...,...
100984,110763,2104647,0,"Long sleeve flannel shirt in red, navy, and wh...",2104647_0,64,4701,0,430
100985,1772593,2104647,0,Short sleeve 'mistral' silk-blend crepe t-shir...,2104647_0,456,4701,0,3261
100986,91397,2104647,0,Short sleeve relaxed-fit t-shirt colorblocked ...,2104647_0,948,4701,0,7200
100987,2328747,2104647,0,Pair of stud earrings in antiqued gold-tone br...,2104647_0,738,4701,0,5565


In [47]:
# Number of candidate images per text: 989 texts have 101 images and
# 11 texts have 100 images.
images_per_text = t2i_df['text_index'].value_counts()
print('101 images: ', (images_per_text == 101).sum())
print('100 images: ', (images_per_text == 100).sum())

# Number of ground-truth pairs.
print('# ground-truth: ', t2i_df['gt'].sum())

101 images:  989
100 images:  11
# ground-truth:  1000


In [48]:
# Number of candidate texts per image: 989 images have 101 texts and
# 11 images have 100 texts.
texts_per_image = i2t_df['image_index'].value_counts()
# The labels say "texts" because each value counts the texts paired with one
# image.
print('101 texts: ', (texts_per_image == 101).sum())
print('100 texts: ', (texts_per_image == 100).sum())

# Number of ground-truth pairs.
print('# ground-truth: ', i2t_df['gt'].sum())

101 images:  989
100 images:  11
# ground-truth:  1000
