In [1]:
import numpy as np
import pandas as pd
# libraries used for nlp below
import spacy
import Levenshtein as lev
from fuzzywuzzy import fuzz
from fuzzywuzzy import process 

# Helper functions

In [2]:
# remove non-english words
def remove_non_eng(tag):
    try:
        tag.encode('ascii')
    except UnicodeEncodeError:
        return '-'
    else:
        return tag

Lev distance functions:

In [3]:
# get the distance and ratio for Levenshtein Distance
def get_lev(sparse_tag):
    str1 = sparse_tag
    lev_distance = []
    lev_ratio = []
    for tag in true_tags:
        str2 = tag
        distance = lev.distance(str1, str2)
        lev_distance.append((str2, distance))
        ratio = lev.ratio(str1, str2)
        lev_ratio.append((str2, ratio))
    
    return lev_distance, lev_ratio

# get the replacement word
def get_replacement_lev(distance, ratio):
    combine_list = zip(distance, ratio)
    top_list = sorted(sorted(combine_list, key=lambda x: x[1][1], reverse=True),
                      key=lambda x: x[0][1], reverse=False) 
    top_ratio = top_list[0][1][1]
    if top_ratio >= 0.5: # threshold score of 0.5 based on our testing
        replacement_word = top_list[0][0][0]
    else:
        # if the similarity between the words do no pass the threshold,
        # drop the tag by tagging it as 'notags'
        replacement_word = 'notags' 
    
    return replacement_word           

# get the replacement tags for each picture using lev distance
def replace_list_lev(list_tag):
    list_tags = []
    for i in range(len(list_tag)):
        current = list_tag[i].lower()
        if current in sparse_tags:
            replace = change_list_lev[current]
            list_tags.append(replace)
        elif '...' in current:
            continue
        else:
            list_tags.append(current)
            
    return list_tags

Fuzzy functions:

In [4]:
# get the best 1 from FuzzyWuzzy using token_set_ratio, with cut off at 80 out of 100
def get_replacement_fuzzy(str2Match):
    ratios = process.extractBests(str2Match, true_tags,
                                  scorer=fuzz.token_set_ratio,
                                  score_cutoff=80, limit = 1)  
    if not ratios:
        return 'notags'
    
    return ratios[0][0]

# get the replacement tags for each picture using FuzzyWuzzy
def replace_list_fuzzy(list_tag):
    list_tags = []
    for i in range(len(list_tag)):
        current = list_tag[i].lower()
        if current in sparse_tags:
            replace = change_list_fuzzy[current]
            list_tags.append(replace)
        elif '...' in current:
            continue
        else:
            list_tags.append(current)
            
    return list_tags

Spacy functions:

In [5]:
# get the best 1 from spaCy, with cut off at 80%
def get_replacement_spacy(spacy_token):
    doc_similarity = []
    for doc in docs_true:
        similarity = doc.similarity(spacy_token)
        doc_similarity.append((doc.text, similarity))
    doc_similarity = sorted(doc_similarity, key = lambda x: x[1], reverse = True)
    if doc_similarity[0][1] >= 0.8:
        return doc_similarity[0][0]
    
    return 'notags'

# get the replacement tags for each picture using Spacy
def replace_list_spacy(list_tag):
    list_tags = []
    for i in range(len(list_tag)):
        current = list_tag[i].lower()
        if current in sparse_tags:
            replace = change_list_spacy[current]
            list_tags.append(replace)
        elif '...' in current:
            continue
        else:
            list_tags.append(current)
            
    return list_tags

Combined vote:

In [6]:
# get the best 1 from Lev Distance, FuzzyWuzzy and Spacy using majority voting
def get_replacement_combine(lev, fuzzy, spacy):
    if lev == fuzzy and lev != 'notags': #if two tags are the same and is not 'notags', return that tag
        return lev
    elif lev == spacy and lev != 'notags': #if two tags are the same and is not 'notags', return that tag
        return lev
    elif fuzzy == spacy and fuzzy != 'notags': #if two tags are the same and is not 'notags', return that tag
        return fuzzy
    elif fuzzy != 'notags': #if all three are different and is not 'notags', use fuzzy
        return fuzzy
    else:
        return 'notags'

# get the replacement tags for each picture using combination
def replace_list_combine(list_tag):
    list_tags = []
    for i in range(len(list_tag)):
        current = list_tag[i].lower()
        if current in sparse_tags:
            replace = change_list_combine[current]
            list_tags.append(replace)
        elif '...' in current:
            continue
        else:
            list_tags.append(current)
            
    return list_tags

Processing for AutoML:

In [56]:
# drop tags that are sparse, below min_freq
def drop_tags_below_freq_thresh(data, min_freq):
    counts = data.sum().sort_values()
    to_drop = counts[counts < min_freq].index.tolist()
    print("Number of tags dropped: {}".format(len(to_drop)))
    return data.drop(columns=to_drop)

# reformat dataframe to a table that can be used by automl
def format_to_automl_labels(data, index, project_name=None):
    # might need to add for 'None_of_the_above'
    data.index = index.apply(lambda x: project_name + x + '.jpg')
    for col in data.columns:
        data[col] = data[col].replace(1, col).replace(0, '')
        if (data[col].value_counts() < 0).any():
            print(col)
    return data

# Load in tags data

In [7]:
with open('clothes_tags.txt', 'r') as f:
    clothes_tags = f.readlines()
    clothes_tags = [s.replace('\n', '') for s in clothes_tags]

with open('chictopia_tags.txt', 'r') as f:
    true_tags = f.readlines()
    true_tags = [s.replace('\n', '') for s in true_tags]

In [8]:
# get tags from dress info table
dress_info_df = 'dress_info_0to10k.csv'
reference_df = pd.read_csv(dress_info_df)
reference_df.tags = reference_df.tags.fillna('["-"]').apply(eval).apply(list)

In [9]:
# get a list of all the unique tags in lowercase
tags = pd.Series(reference_df.tags.sum())\
            .str.lower()\
            .drop_duplicates()\
            .apply(remove_non_eng)\
            .tolist()

In [10]:
# remove those tags that are truncated with '...' and '-' (i.e. NAs)
tags = list(filter(lambda x: '...' not in x and '-' not in x, tags))

# remove all clothing related words from the list of unique tags
tags = [x for x in tags if x not in clothes_tags]

# list of noisy tags, not provided by Chictopia
sparse_tags = [x for x in tags if x not in true_tags]

In [11]:
len(tags), len(sparse_tags)

(712, 707)

# Compute Similarity Scores

##  1. Levenshtein Distance

In [12]:
# generate the replacement tags for each of the noisy tags
# uses Levenshtein Distance as a measurement of similiarity
change_list_lev = {}
for tag in sparse_tags:
    distance, ratio = get_lev(tag)
    replace_tag = get_replacement_lev(distance, ratio)
    change_list_lev[tag] = replace_tag

In [13]:
# replace all the current tags of each of the instances with only true tags or 'notags'
reference_df['levtags'] = reference_df.tags.apply(lambda x: replace_list_lev(x))

In [48]:
# convert column of tags to features as dummies
df = pd.get_dummies(reference_df.levtags.apply(lambda x:pd.Series(x))\
                                .stack().reset_index(level=1,drop=True))\
        .sum(level=0)
df.head(5)

Unnamed: 0,-,3-4 sleeves,50s,60s,70s,80s,90s,Amusement Park,Androgynous,Anniversary,...,walk a-line,winter cas-print dress,беж и бел,розовы и сини,фиолет и беж,фиолет и красны,オフショル,カジュアル,フレンチ,美女
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# drop columns not in true tags, and create columns of true tags that are not found
df = df.reindex(columns=true_tags, fill_value=0)
df.shape

(10000, 102)

## 2. FuzzyWuzzy

In [16]:
# generate the replacement tags for each of the noisy tags using FuzzyWuzzy as a measurement of similiarity
change_list_fuzzy = {}
for tag in sparse_tags:
    replace_tag = get_replacement_fuzzy(tag)
    change_list_fuzzy[tag] = replace_tag

In [17]:
# replace all the current tags of each of the instances with only true tags or 'notags'
reference_df['fuzzytags'] = reference_df.tags.apply(lambda x: replace_list_fuzzy(x))

In [50]:
df_fuzzy = pd.get_dummies(reference_df.fuzzytags.apply(lambda x:pd.Series(x))\
                                      .stack().reset_index(level=1,drop=True))\
                .sum(level=0)
df_fuzzy.head(5)

Unnamed: 0,-,3-4 sleeves,50s,60s,70s,80s,90s,Amusement Park,Androgynous,Anniversary,...,walk a-line,winter cas-print dress,беж и бел,розовы и сини,фиолет и беж,фиолет и красны,オフショル,カジュアル,フレンチ,美女
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# drop columns not in true tags, and create columns of true tags that are not found
df_fuzzy = df_fuzzy.reindex(columns=true_tags, fill_value=0)
df_fuzzy.shape

(10000, 102)

## 3. spaCy

In [20]:
nlp = spacy.load('en_core_web_lg')

In [21]:
docs_true = [nlp(tag) for tag in true_tags]
docs_sparse = [nlp(tag) for tag in sparse_tags]

In [22]:
# generate the replacement tags for each of the noisy tags using spaCy as a measurement of similiarity
change_list_spacy = {}
for token in docs_sparse:
    replace_tag = get_replacement_spacy(token)
    change_list_spacy[token.text] = replace_tag

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mo

In [23]:
# replace all the current tags of each of the instances with only true tags or 'notags'
reference_df['spacytags'] = reference_df.tags.apply(lambda x: replace_list_spacy(x))

In [52]:
df_spacy = pd.get_dummies(reference_df.spacytags.apply(lambda x:pd.Series(x))\
                                    .stack().reset_index(level=1,drop=True))\
                .sum(level=0)
df_spacy.head(5)

Unnamed: 0,-,3-4 sleeves,50s,60s,70s,80s,90s,Amusement Park,Androgynous,Anniversary,...,walk a-line,winter cas-print dress,беж и бел,розовы и сини,фиолет и беж,фиолет и красны,オフショル,カジュアル,フレンチ,美女
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# drop columns not in true tags, and create columns of true tags that are not found
df_spacy = df_spacy.reindex(columns=true_tags, fill_value=0)
df_spacy.shape

(10000, 102)

## 4. Combination of three types

In [27]:
# generate the replacement tags for each of the noisy tags using majority voting 
change_list_combine = {}
for tag in sparse_tags:
    lev = change_list_lev[tag]
    fuzzy = change_list_fuzzy[tag]
    spacy = change_list_spacy[tag]
    replace_tag = get_replacement_combine(lev, fuzzy, spacy)
    change_list_combine[tag] = replace_tag

In [28]:
# replace all the current tags of each of the instances with only true tags or 'notags'
reference_df['combinetags'] = reference_df.tags.apply(lambda x: replace_list_combine(x))

In [54]:
df_combine = pd.get_dummies(reference_df.combinetags.apply(lambda x:pd.Series(x))\
                                        .stack().reset_index(level=1,drop=True))\
                .sum(level=0)
df_combine.head(5)

Unnamed: 0,-,3-4 sleeves,50s,60s,70s,80s,90s,Amusement Park,Androgynous,Anniversary,...,walk a-line,winter cas-print dress,беж и бел,розовы и сини,фиолет и беж,фиолет и красны,オフショル,カジュアル,フレンチ,美女
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# drop columns not in true tags, and create columns of true tags that are not found
df_combine = df_combine.reindex(columns=true_tags, fill_value=0)
df_combine.shape

(10000, 102)

# Drop those tags with lesser than N instances to fit properly into AutoML

In [57]:
freq_thresh = 10
automl_proj = 'gs://imposing-timer-236204-vcm/cropped/'

# lev dist
df = drop_tags_below_freq_thresh(df, freq_thresh)
df = format_to_automl_labels(df, reference_df.name,
                             project_name=automl_proj)
df.to_csv('test_lev_{}.csv'.format(freq_thresh), header=False)

# fuzzy
df_fuzzy = drop_tags_below_freq_thresh(df_fuzzy, freq_thresh)
df_fuzzy = format_to_automl_labels(df_fuzzy, reference_df.name, 
                                   project_name=automl_proj)
df_fuzzy.to_csv('test_fuzzy_{}.csv'.format(freq_thresh), header=False)

# spacy
df_spacy = drop_tags_below_freq_thresh(df_spacy, freq_thresh)
df_spacy = format_to_automl_labels(df_spacy, reference_df.name, 
                                   project_name=automl_proj)
df_spacy.to_csv('test_spacy_{}.csv'.format(freq_thresh), header=False)

# combined
df_combine = drop_tags_below_freq_thresh(df_combine, freq_thresh)
df_combine = format_to_automl_labels(df_combine, reference_df.name, 
                                     project_name=automl_proj)
df_combine.to_csv('test_combine_{}.csv'.format(freq_thresh), header=False)

Number of tags dropped: 9
Number of tags dropped: 13
Number of tags dropped: 9
Number of tags dropped: 9
