# TAG CORPUS CONSOLIDATION

## Imports and function definition

In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import brown
from nltk.data import find
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

import os, sys, time
import numpy as np
import re
import string
import random
import itertools
from itertools import chain
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
import collections
from collections import defaultdict
from __future__ import print_function
from __future__ import division
from scipy import stats, optimize
from utils import util, vocabulary
from google.cloud import bigquery
import pandas as pd
from gensim import models


util.require_package("tqdm")  # for nice progress bars
from tqdm import tqdm as ProgressBar

# # Bokeh for plotting.
util.require_package("bokeh")
import bokeh.plotting as bp
from bokeh.models import HoverTool
bp.output_notebook()

#set global number of suggestions for testing all models
num_sugs = 20
subset_size = 1000


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ejhaselden/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ejhaselden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ejhaselden/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import w2v_model
get_stats = {}
get_stats['number_of_suggestions'] = num_sugs

## PART 1: SIMPLE EMBEDDING

We define functions to retrieve tag data, preprocess the results, find similar embeddings, and introduce a functional suggestion test (inspired by masked-language-model training)

In [3]:
nb_start = time.time()

lemma_list = []
word_list = []

lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stop_words = set(stopwords.words('english'))


def word_preprocessing(word):
    lower = word.lower()
    punct_replacer = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    rem_punct = lower.translate(punct_replacer)
    lemma = [lemmatizer.lemmatize(w) for w in nltk.word_tokenize(rem_punct)]
    rem_stop = [w for w in lemma if not w in stop_words]
    rem_digits = [re.sub('\d', '<dig>', i) for i in rem_stop]
    lemma_list.append(rem_digits)
    word_list.append(word)

In [4]:
def get_top_matches(model, test_list, num_sugs):
    """
    Given an embedding model and list of tags, gets most similar results based on Word2Vec embeddings 
    (model constructed in w2v_model.py).
    Runs on one row (asset) at a time.
    """
#     ref_list = []
    matches = {}
    for lstring in test_list:
        tagset = []
        try:
            match = model.wv.most_similar(lstring, topn=num_sugs)

            for tag in range(len(match)):
                tagset.append(match[tag][0])

            matches[lstring] = tagset
        except KeyError:
            pass
    return matches

# search_list = ['blue']
# get_top_matches(vec_model, search_list, num_sugs)

In [5]:
def valid_prediction(test_dictionary):
    """
    Selects one random key from dictionary and determines if any values for that key
    match any other keys in the dictionary (in other words, whether the model's
    suggestion for a given tag matches any existing tags for the same asset).
    """
    
    rand = random.randint(0, len(test_dictionary) - 1)
    keylist = list(test_dictionary.keys())
    key = keylist[rand]
    suggestions = test_dictionary[key]
#     print(key, sugestions)
    matches = 0
    for suggestion in suggestions:
        for key in keylist:
            if suggestion == key:
                print("MATCH!", suggestion,  key, len(keylist))
                return("MATCH")
    return("NO MATCH")
    

We get an embedding model and compute loss (for use in hyperparameter tuning)

In [6]:
#get embedding model and compute loss (for use in hyperparameter tuning)
num_epochs = 2
get_stats['w2v_epochs'] = num_epochs
for i in range(1,num_epochs+1):
    print('epoch:', i)
    start = time.time()
    epochs = i
    vec_size = 10
    window = 5
    test_df = w2v_model.retrieve_expanded_query()
#     vec_model = w2v_model.retrieve_model_no_id(epochs, vec_size, window)
        
    vec_model = models.Word2Vec(test_df.values.tolist(), vector_size=vec_size, window=window, min_count=1, workers=4, compute_loss = True, epochs = epochs)
#     end = time.time()
    loss = vec_model.get_latest_training_loss()
    # perplexity = 2**loss
    print('loss:', loss)

epoch: 1
loss: 4423419.5
epoch: 2
loss: 8269571.5


We compile the set of tags for each asset. For each of those tags, we then get  a list of the most similar tags based on the W2V model.

In [7]:
##to do: Make this a function with a parameter for each type of model

# get lemmatized tag df with 1 row per asset and each tag in a separate column, covert to list of lists
test_df = w2v_model.retrieve_expanded_query()
# test_df = df_for_model
test_vals = test_df.values[0:subset_size]
# test_vals = test_df[0:1000]


#use top_matches method to create a dictionary of related tags suggested by embedding model
asset_dicts = []
start = time.time()
for i in range(len(test_vals)):
#     print("remaining:", len(test_vals) - i)
    test_list = test_vals[i][test_vals[i] != None]
#     rate.append(test_list)
# #     print(test_list)
    top_matches = get_top_matches(vec_model, test_list, num_sugs)
    asset_dicts.append(top_matches)
# #     str(test_list)
end = time.time()
print("elapsed:", end - start)
# asset_dicts

elapsed: 7.329533338546753


Now we have a list of dictionaries where each key is a tag for that asset and each set of values is a list of potential suggestions based on the W2V embeddings.


### Functional Suggestion Test    
We test the effectiveness of this suggestion set by selecting a random tag from each asset and seeing if it matches any other tag assigned to that asset. In other words, if one key matches one of another key's values.    
    
to do: (Consider averaging this over a few iterations)

In [10]:
#use valid_prediction method to determine useful suggestions
asset_results = []
for i in asset_dicts:
#     print(valid_prediction(i), i.keys())
    asset_results.append(valid_prediction(i))
fst_rate = asset_results.count("MATCH")/len(asset_results)
# print("rate of valid suggestions:", fst_rate)
get_stats['w2v'] = fst_rate

MATCH! television program television program 25
MATCH! facial expression facial expression 12
MATCH! jakarta fashion week jakarta fashion week 9
MATCH! technology technology 15
MATCH! knee knee 28
MATCH! <dig><dig><dig><dig>s <dig><dig><dig><dig>s 19
MATCH! performance art performance art 19
MATCH! musician musician 17
MATCH! art culture entertainment art culture entertainment 24
MATCH! stage stage 7
MATCH!   4
MATCH! singing singing 7
MATCH! <dig><dig><dig><dig><dig><dig><dig><dig><dig> <dig><dig><dig><dig><dig><dig><dig><dig><dig> 7
MATCH! monochrome monochrome 19
MATCH! violet violet 20
MATCH! curtain curtain 11
MATCH! performing art performing art 21
MATCH! lighting lighting 20
MATCH! building building 30
MATCH! property property 13
MATCH! horizontal horizontal 25
MATCH! jaw jaw 15
MATCH! mountain classic mountain classic 5
MATCH! art culture entertainment art culture entertainment 4
MATCH! entertainment entertainment 39
MATCH! smile smile 18
MATCH! art culture entertainment art cu

MATCH! music music 15
MATCH! art culture entertainment art culture entertainment 19
MATCH! puj puj 6
MATCH! fashion fashion 22
MATCH! art culture entertainment art culture entertainment 22
MATCH! formal wear formal wear 16
MATCH! premium series shelving premium series shelving 22
MATCH! formal wear formal wear 16
MATCH! music music 18
MATCH! dress shirt dress shirt 19
MATCH! musician musician 19
MATCH! music music 14
MATCH! film industry film industry 21
MATCH! art culture entertainment art culture entertainment 9
MATCH! fashion fashion 10
MATCH! suit suit 10
MATCH! art culture entertainment art culture entertainment 10
MATCH! <dig><dig><dig><dig> <dig><dig> <dig><dig> isa <dig><dig><dig><dig> <dig><dig><dig><dig> <dig><dig> <dig><dig> isa <dig><dig><dig><dig> 21
MATCH! <dig><dig><dig><dig><dig><dig><dig><dig><dig> <dig><dig><dig><dig><dig><dig><dig><dig><dig> 11
MATCH! jean jean 17
MATCH! art culture entertainment art culture entertainment 3
MATCH! nbcu photo bank nbcu photo bank 26
M

MATCH! thoroughfare thoroughfare 23
MATCH! fashion design fashion design 22
MATCH! suit suit 16
MATCH! art culture entertainment art culture entertainment 17
MATCH! entertainment entertainment 18
MATCH! musical instrument accessory musical instrument accessory 28
MATCH! holiday holiday 10
MATCH! entertainment entertainment 19
MATCH! clothing clothing 12
MATCH! art culture entertainment art culture entertainment 15
MATCH! entertainment entertainment 11
MATCH! celebrity celebrity 10
MATCH! event event 15
MATCH!   6
MATCH! condominium condominium 13
MATCH! sleeve sleeve 26
MATCH! fashion fashion 10
MATCH! art culture entertainment art culture entertainment 24
MATCH!   11
MATCH! cheek cheek 25
MATCH! social group social group 27
MATCH! fashion design fashion design 26
MATCH! gesture gesture 18
MATCH!   16
MATCH! coat coat 18
MATCH! music venue music venue 33
MATCH! <dig><dig><dig><dig><dig><dig><dig><dig><dig> <dig><dig><dig><dig><dig><dig><dig><dig><dig> 22


## PART 2: LANGUAGE MODEL    
     
code credit: https://github.com/datasci-w266/2021-summer-main/tree/master/materials/simple_lm   

We use a simple trigram model to see if that offers increased suggestion quality, on the assumption that tags will frequently appear in close proximity to similar tags (ie, attached to the same asset).

In reality, we found a much lower rate of useful suggestions (0.45) as compared to the simple W2V embedding model (0.75). We attempted to improve our trigram model by alphabetizing each tag list prior to training, in the hope that this would further emphasize relationships between related tags. This approach yielded an even lower valid suggestion rate (0.05). {why?}

In [12]:
get_query = w2v_model.lm_retrieve_query()
wordlist = get_query['cn'].tolist()

In [13]:
def normalize_counter(c):
    """Given a dictionary of <item, counts>, return <item, fraction>."""
    total = sum(c.values())
    return {w:float(c[w])/total for w in c}

class SimpleTrigramLM(object):
    def __init__(self, words):
        """Build our simple trigram model."""
        # Raw trigram counts over the corpus. 
        # c(w | w_1 w_2) = self.counts[(w_2,w_1)][w]
        self.counts = defaultdict(lambda: defaultdict(lambda: 0.0))
    
        # Iterate through the word stream once.
        w_1, w_2 = None, None
        for word in words:
            if w_1 is not None and w_2 is not None:
                # Increment trigram count.
                self.counts[(w_2,w_1)][word] += 1
            # Shift context along the stream of words.
            w_2 = w_1
            w_1 = word
            
        # Normalize so that for each context we have a valid probability
        # distribution (i.e. adds up to 1.0) of possible next tokens.
        self.probas = defaultdict(lambda: defaultdict(lambda: 0.0))
        for context, ctr in self.counts.items():
            self.probas[context] = normalize_counter(ctr)
            
    def next_word_proba(self, word, seq):
        """Compute p(word | seq)"""
        context = tuple(seq[-2:])  # last two words
        return self.probas[context].get(word, 0.0)
    
    def predict_next(self, seq):
        """Sample a word from the conditional distribution."""
        context = tuple(seq[-2:])  # last two words
        pc = self.probas[context]  # conditional distribution
        words, probs = zip(*pc.items())  # convert to list
        return np.random.choice(words, p=probs)
    
    def score_seq(self, seq, verbose=False):
        """Compute log probability (base 2) of the given sequence."""
        score = 0.0
        count = 0
        # Start at third word, since we need a full context.
        for i in range(2, len(seq)):
            if (seq[i] == "<s>" or seq[i] == "</s>"):
                continue  # Don't count special tokens in score.
            s = np.log2(self.next_word_proba(seq[i], seq[i-2:i]))
            score += s
            count += 1
            # DEBUG
            if verbose:
                print("log P({:s} | {:s}) = {.03f}".format(seq[i], " ".join(seq[i-2:i]), s))
        return score, count

In [14]:
import re
# Word processing functions
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    word = word.lower()
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset):
        return word
    else:
        return constants.UNK_TOKEN

def canonicalize_words(words, **kw):
    return [canonicalize_word(word, **kw) for word in words]

In [15]:
get_query = w2v_model.lm_retrieve_query()
get_query

Unnamed: 0,asset_id,cn
0,assetnum45248253,"film industry, arts culture and entertainment,..."
1,assetnum49642909,"775637829, Basketball, Sport, Basketball Moves..."
2,assetnum46930785,"775635494, Audio Equipment, Spokesperson, Publ..."
3,assetnum31904237,"775578330, 10.21.2020 CMT Awards Fan Viewing P..."
4,assetnum50434474,"775641976, Green, Eyelash, Hair, Font, Dress, ..."
...,...,...
779397,assetnum40965793,BFfulltakes_FTP
779398,assetnum36318105,BFfulltakes_FTP
779399,assetnum33632646,BFfulltakes_FTP
779400,assetnum36318332,BFfulltakes_FTP


In [16]:
#alphabetized version
# import pandas as pd
# alpha_get_query = get_query[0:1000]
# v = np.sort(alpha_get_query.cn.str.split(',', expand=True).fillna(''), axis=1)
# df = pd.DataFrame(v).agg(','.join, 1).str.strip(',').str.lstrip()

# wordlist = df.tolist()


In [17]:
split=0.8
sentences = np.array(list(wordlist), dtype=object)
fmt = (len(sentences), sum(map(len, sentences)))
print("Loaded {:,} sentences ({:g} tokens)".format(*fmt))


rng = np.random.RandomState()
rng.shuffle(sentences)  # in-place
split_idx = int(split * len(sentences))
train_sents = sentences[:split_idx]
test_sents = sentences[split_idx:]

for l in range(len(train_sents)):
    train_sents[l] = train_sents[l].split(", ")
for l in range(len(test_sents)):
    test_sents[l] = test_sents[l].split(", ")
# train_sents = train_sents.split(",")
# test_sents = test_sents.split(",")

fmt = (len(train_sents), sum(map(len, train_sents)))
print("Training set: {:,} sentences ({:,} tokens)".format(*fmt))
fmt = (len(test_sents), sum(map(len, test_sents)))
print("Test set: {:,} sentences ({:,} tokens)".format(*fmt))


Loaded 779,402 sentences (1.43624e+08 tokens)
Training set: 623,521 sentences (9,231,443 tokens)
Test set: 155,881 sentences (2,304,756 tokens)


In [19]:
vocab = vocabulary.Vocabulary(canonicalize_word(w) for w in ProgressBar(util.flatten(train_sents)))
print("Train set vocabulary: %d words" % vocab.size)

100%|██████████| 9231443/9231443 [00:29<00:00, 310894.22it/s]


Train set vocabulary: 39691 words


In [20]:
def sents_to_tokens(sents):
    """Returns an flattened list of the words in the sentences, with padding for a trigram model."""
    padded_sentences = (["<s>", "<s>"] + s + ["</s>"] for s in sents)
    # This will canonicalize words, and replace anything not in vocab with <unk>
    return np.array([util.canonicalize_word(w, wordset=vocab.wordset) 
                     for w in ProgressBar(util.flatten(padded_sentences))], dtype=object)

train_tokens = sents_to_tokens(train_sents)
test_tokens = sents_to_tokens(test_sents)

t0 = time.time()
print("Building trigram LM...",)
lm = SimpleTrigramLM(train_tokens)
print("done in %.02f s" % (time.time() - t0))

100%|██████████| 11102006/11102006 [00:10<00:00, 1026542.43it/s]
100%|██████████| 2772399/2772399 [00:02<00:00, 1040682.57it/s]


Building trigram LM...
done in 25.12 s


### Generating Sample Predictions    
When we task our model with generating predictions, we do see some relevance in the results. We quantify this later using our Functional Suggestion Test.

In [22]:
def lm_predictions(l_model, tag, max_length):
        seq = ["<s>", tag]
        for i in range(max_length):
            try:
                seq.append(l_model.predict_next(seq))
            except ValueError:
                seq.append('nodata_nodata')
                
        ## dedupe list of suggested tags
        seq = set(seq)
        seq = list(seq)
        seq = [i for i in seq if i not in ['<s>','</s>', tag, 'nodata_nodata']]
        
        ## n prevents an infinite loop in the next section
        n=0
        
        ## take length of deduped list and use it to return 15 suggestions
        while len(seq) < max_length+2 and n < 50:
            try:
                seq.append(l_model.predict_next(seq))
            except ValueError:
                seq.append('nodata_nodata')
            seq = set(seq)
            seq = list(seq)
            seq = [i for i in seq if i not in ['<s>','</s>', tag, 'nodata_nodata']]
            n+=1
        seq = seq[2:]
        return seq

# lm_predictions(lm, 'blue', num_sugs)

### Scoring  
We check the perplexity and then employ the same functional suggestion test that we used for the W2V embeddings.    
We see that the rate of valid predictions for the LM is actually much lower than that of the simple embeddings.

In [23]:
log_p_data, num_real_tokens = lm.score_seq(train_tokens)
print("Train perplexity: {:.02f}".format(2**(-1*log_p_data/num_real_tokens)))

Train perplexity: 21.27


In [24]:
lengths = []
for i in asset_dicts:
    lengths.append(len(i.values()))
np.average(lengths)

14.98

In [25]:
def lm_get_top_matches(l_model, test_list, num_sugs):
    """
    Given a language model and list of tags, gets most similar results.
    Runs on one row (asset) at a time.
    """
#     ref_list = []
    matches = {}
#     not_found = 0
    for lstring in test_list:
        tagset = []
        try:
            match = lm_predictions(lm, lstring, num_sugs)
            for tag in range(len(match)):
#             for tag in range(2):
                tagset.append(match[tag])

# #             ref_list.append(tagset)
            matches[lstring] = tagset
        except KeyError:
    #         print(ls, ' : ','NOT_FOUND')
            pass
#     print("Not found", not_found)
    return matches

In [26]:
##to do: Make this a function with a parameter for each type of model

# get lemmatized tag df with 1 row per asset and each tag in a separate column, covert to list of lists
test_df = w2v_model.retrieve_expanded_query()
lm_test_vals = test_df.values[0:subset_size]
  
#use top_matches method to create a dictionary of related tags suggested by embedding model
lm_asset_dicts = []
start = time.time()
for i in range(len(lm_test_vals)):
#     print("remaining:", len(test_vals) - i)
    lm_test_list = lm_test_vals[i][lm_test_vals[i] != None]
#     rate.append(test_list)
# #     print(test_list)
    lm_top_matches = lm_get_top_matches(lm, lm_test_list, num_sugs)
    lm_asset_dicts.append(lm_top_matches)
# #     str(test_list)
end = time.time()
print("elapsed:", end - start)
# lm_asset_dicts

elapsed: 485.1953749656677


In [27]:
#use valid_prediction method to determine useful suggestions
lm_asset_results = []
for i in lm_asset_dicts:
#     print(valid_prediction(i), i.keys())
    lm_asset_results.append(valid_prediction(i))
    lm_fst_rate = lm_asset_results.count("MATCH")/len(lm_asset_results)
print("Rate of valid suggestions:", lm_fst_rate)
get_stats['lm'] = lm_fst_rate
# asset_results.count("MATCH")

MATCH! dress dress 19
MATCH! crowd crowd 19
MATCH! musical instrument accessory musical instrument accessory 20
MATCH! electric blue electric blue 24
MATCH! music music 20
MATCH! performance art performance art 21
MATCH! shoulder shoulder 26
MATCH! fashion fashion 30
MATCH! sky sky 25
MATCH! musical instrument musical instrument 39
MATCH! fashion fashion 18
MATCH! lighting lighting 17
MATCH! music artist music artist 28
MATCH! black hair black hair 18
MATCH! indoors indoors 10
MATCH! musician musician 24
MATCH! sleeve sleeve 21
MATCH! puj puj 7
MATCH! performance performance 23
MATCH! happy happy 24
MATCH! fashion fashion 22
MATCH! runway runway 8
MATCH! microphone microphone 17
MATCH! spec sheet spec sheet 13
MATCH! microphone microphone 20
MATCH! green green 9
MATCH! dancer dancer 9
MATCH! team sport team sport 21
MATCH! entertainment entertainment 16
MATCH! music venue music venue 8
MATCH! entertainment entertainment 18
MATCH! smile smile 14
MATCH! music music 13
MATCH! music music 

## PART 3. BERT ATTEMPT  
Given that our corpus is full of unusual terms and that our "sentences" are order-agnostic, BERT's pre-trained bi-directional nature makes it a counterintuitive choice. We propose a novel application, however, in which BERT is fine-tuned on our tag corpus. As in our LM test, we would order tags in our corpus alphabetically to impose a sense of word order significance. For now, BERT fine-tuning has proven to be beyond the scope of this project.
        


In [28]:
#code credit: https://gist.github.com/yuchenlin/a2f42d3c4378ed7b83de65c7a2222eb2
# !pip install torchvision 


Vanilla BERT 

In [29]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bmodel = BertForMaskedLM.from_pretrained('bert-base-uncased')
def predict_masked_sent(text, top_k=5):
    # Tokenize input
    text = "[CLS] %s [SEP]"%text
    tokenized_text = tokenizer.tokenize(text)
    masked_index = tokenized_text.index("[MASK]")
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    # tokens_tensor = tokens_tensor.to('cuda')    # if you have gpu

    # Predict all tokens
    with torch.no_grad():
        outputs = bmodel(tokens_tensor)
        predictions = outputs[0]

    probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1)
    top_k_weights, top_k_indices = torch.topk(probs, top_k, sorted=True)
    preds = []
    
    for i, pred_idx in enumerate(top_k_indices):
        predicted_token = tokenizer.convert_ids_to_tokens([pred_idx])[0]
        token_weight = top_k_weights[i]
        preds.append(predicted_token)
#         print("[MASK]: '%s'"%predicted_token, " | weights:", float(token_weight))

    return preds
        

predict_masked_sent("'white', '[MASK]'", 5)    
# predict_masked_sent("'Lighting', 'Arts Culture and Entertainment', [MASK], 'Water', 'Chandelier', 'Ceiling Fixture', 'Ceiling'", 5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['black', 'white', 'red', '.', 'blue']

Vanilla BERT given a test tag sequence

In [30]:
w2v_model.is_not_rare('bffulltakes ftp')

False

In [31]:
# start = time.time()
bert_df = w2v_model.retrieve_expanded_query()
bert_lite = bert_df[0:subset_size]
bert_values = bert_lite.values


In [35]:
def bert_get_top_matches(test_list, num_sugs):
    """
    Given a bert model and list of tags, gets most similar results.
    Runs on one row (asset) at a time.
    """
    matches = {}
    test_list = test_list[test_list != None]
    for intag in test_list:
        lstring = [intag, ', [MASK]']
#         print(itag, lstring)
        tagset = []
        try:
            match = predict_masked_sent(lstring, num_sugs)
#             print(match)

            for outtag in range(len(match)):
                tagset.append(match[outtag])
            
                    ## dedupe list of suggested tags
                # initializing punctuations string
#             punc = [lstring, '&', ';', ',', '.', '"', "'", ':', '/', '[UNK]', 'and', 'the', '<', '>', '[', ']', '!', '?']
#             tagset = set(tagset)
#             tagset = list(tagset)
#             tagset = [i for i in tagset if i not in punc]
#             tagset = tagset[0:15]
            
            matches[intag] = tagset
        except KeyError:
            pass
    return matches
bert_get_top_matches(bert_values[6], num_sugs)

{'<dig><dig><dig><dig><dig><dig><dig><dig><dig>': ["'",
  'and',
  ',',
  '...',
  'or',
  '&',
  '`',
  'etc',
  '.',
  '*',
  '/',
  '-',
  'x',
  ';',
  '?',
  '+',
  's',
  '<',
  '\\',
  'but'],
 'electronics': ["'",
  'etc',
  'and',
  'music',
  '...',
  'computer',
  'radio',
  '.',
  ',',
  'electronics',
  'sound',
  '&',
  'software',
  'sounds',
  'people',
  'video',
  'computers',
  'time',
  'voice',
  'instruments'],
 'light': ["'",
  'light',
  'water',
  'and',
  '...',
  'sound',
  'dark',
  'colour',
  'heat',
  'fire',
  'darkness',
  'white',
  ',',
  'moon',
  'time',
  'blue',
  'night',
  'sun',
  'air',
  'etc'],
 'music': ["'",
  'music',
  'art',
  'etc',
  'song',
  'poetry',
  'and',
  'words',
  '...',
  'songs',
  'people',
  'dance',
  ',',
  'time',
  '.',
  'love',
  'lyrics',
  'literature',
  'culture',
  'man'],
 'fashion': ['style',
  'art',
  "'",
  'fashion',
  'culture',
  'beauty',
  'design',
  'colour',
  'etc',
  'lifestyle',
  'music',
  '

In [37]:
#use top_matches method to create a dictionary of related tags suggested by BERT model
bert_asset_dicts = []
start = time.time()
for i in range(len(bert_values)):
#     print("remaining:", len(test_vals) - i)
    bert_test_list = bert_values[i]
#     rate.append(test_list)
# #     print(test_list)
    bert_top_matches = bert_get_top_matches(bert_test_list, num_sugs)
    bert_asset_dicts.append(bert_top_matches)
# #     str(test_list)
end = time.time()
print("elapsed:", end - start)

elapsed: 529.9097473621368


In [38]:
# bert_asset_dicts

In [39]:
#use valid_prediction method to determine useful suggestions
bert_asset_results = []
for i in bert_asset_dicts:
#     print(valid_prediction(i), i.keys())
    bert_asset_results.append(valid_prediction(i))
    bert_fst_rate = bert_asset_results.count("MATCH")/len(bert_asset_results)
print("Rate of valid suggestions:", bert_fst_rate)
get_stats['vanilla_bert'] = bert_fst_rate
# asset_results.count("MATCH")

MATCH! font font 25
MATCH! neck neck 28
MATCH! music music 16
MATCH! celebrity celebrity 19
MATCH! concert concert 20
MATCH! stage stage 17
MATCH! blue blue 24
MATCH! stage stage 7
MATCH! event event 7
MATCH! winter winter 7
MATCH! art art 19
MATCH! jewellery jewellery 20
MATCH! curtain curtain 11
MATCH! light light 21
MATCH! stage stage 20
MATCH! shoulder shoulder 26
MATCH! world world 9
MATCH! fashion fashion 30
MATCH! property property 13
MATCH! city city 25
MATCH! neck neck 15
MATCH! zoo zoo 5
MATCH! drum drum 39
MATCH! trophy trophy 18
MATCH! photography photography 18
MATCH! art art 17
MATCH! stage stage 9
MATCH! transport transport 5
MATCH! carpet carpet 5
MATCH! performance performance 28
MATCH! lip lip 18
MATCH! music music 23
MATCH! bench bench 10
MATCH! music music 24
MATCH! crowd crowd 22
MATCH! comfort comfort 21
MATCH! inclusive inclusive 7
MATCH! stage stage 18
MATCH! music music 25
MATCH! photograph photograph 24
MATCH! thigh thigh 22
MATCH! music music 38
MATCH! music 

MATCH! entertainment entertainment 15
MATCH! spring spring 18
MATCH! keep keep 13
MATCH! short short 29
MATCH! song song 20
MATCH! chin chin 9
MATCH! city city 12
MATCH! short short 20
MATCH! thumb thumb 12
MATCH! dress dress 13
MATCH! fashion fashion 24
MATCH! zoo zoo 9
MATCH! artist artist 32
MATCH! sport sport 12
MATCH! sky sky 21
MATCH! light light 12
MATCH! entertainment entertainment 11
MATCH! jacket jacket 12
MATCH! beauty beauty 29
MATCH! car car 15
MATCH! hair hair 14
MATCH! celebrity celebrity 20
MATCH! concert concert 28
MATCH! cooking cooking 7
MATCH! fashion fashion 8
MATCH! americana americana 5
MATCH! computer computer 26
MATCH! art art 22
MATCH! sport sport 10
MATCH! people people 9
MATCH! music music 25
MATCH! tool tool 5
MATCH! grass grass 18
MATCH! fashion fashion 13
MATCH! suit suit 14
MATCH! ball ball 21
MATCH! crowd crowd 20
MATCH! furniture furniture 5
MATCH! wheel wheel 17
MATCH! knee knee 24
MATCH! entertainment entertainment 11
MATCH! guitar guitar 41
MATCH! c

## PART 4. SUGGESTION FUNCTION IMPLEMENTATION   
We loop through the list of assets and attempt to offer alternative tags for any rare tags that we enconter.

Using the simple W2V embeddings for now

In [40]:
to_check = w2v_model.lm_retrieve_expanded_query()
# to_check

In [41]:
#given a list of tags, identify those that are rare and provide suggested replacements
def get_candidates(tag_list):
    start = time.time()
    lemma_dict = w2v_model.lemma_map()
    lemma_common_tags = []
    lemma_rare_tags = []
    candidates = []
#     cand_list = []
    for tag in tag_list:
        try:
            if w2v_model.is_not_rare(lemma_dict[tag]):
                lemma_common_tags.append(lemma_dict[tag])
                continue
            else:
                lemma_rare_tags.append(lemma_dict[tag])
        except KeyError:
            if w2v_model.is_not_rare(tag):
                continue
        try:    
            if tag is not None:
                candidate = get_top_matches(vec_model, ['', lemma_dict[tag]], num_sugs)
    #             print('tag: ', tag, '\nsuggestions:', candidates.values(),'\n')
#                 cand_list.append(candidate)
                candidates.append(candidate.values())
        except KeyError:
            pass
    flat = list(chain(*candidates))
    flatter = list(chain(*flat)) 
#     print('lemma common tags: ', lemma_common_tags, '\nlemma rare tags: ', lemma_rare_tags)      
    return flatter

def suggest_better_tags(list_of_tags):
    tag_candidates = get_candidates(list_of_tags)
    tag_candidates = [w2v_model.delete_rare(tc) for tc in tag_candidates]
    tag_candidates = [tag for tag in tag_candidates if tag != ""]
    
    return tag_candidates


We gather suggestions for every tag and concatenate those into a
single suggestion list.    
Then we discard uncommon (rare) suggestions and identify suggestions that were already tags for the given asset (duplicates).     
The final list of suggested tags then consists only of common tags that are not already applied to the asset.   
We also track the duplicates so they can be used to validate the usability of our suggestions.

In [42]:
'''
For each asset (row), gather suggestions for every tag and concatenate those into a
single suggestion list. Then discard uncommon (rare) suggestions and identify
suggestions that were already tags for the given asset (duplicates). The final list of suggested tags
is then only common tags that are not already applied to the asset.

This function also tracks the duplicates so they can be used to validate the usability of our suggestions.
'''

def get_real_suggestions(existing_tags):
    #GIVE THIS FUNCTION AT LEAST TWO LISTS OF TAGS OR A LIST WRAPPED IN AN EMPTY LIST
    start = time.time()
    count = len(existing_tags)
    
    all_live_suggestions = []
    all_new_suggestions = []
    all_dupe_suggestions = []
    all_weighted = []

    lemma_dict = w2v_model.lemma_map()
    for i in range(count):
        live_tags = existing_tags[i]
#         print("tags in \n", live_tags, '\n')
        live_suggestions = suggest_better_tags(live_tags)
        all_live_suggestions.append(live_suggestions)
    #     print("all suggestions \n", live_suggestions, '\n')
        new_suggestions = []
        dupe_suggestions = []
        for sug in live_suggestions:
            for tag in live_tags:
                if sug == lemma_dict.get(tag):
                    if sug not in dupe_suggestions:
                        dupe_suggestions.append(sug)
                    continue              
            if sug not in dupe_suggestions:
                new_suggestions.append(sug)
        counts = collections.Counter(new_suggestions)
        weighted = counts.most_common()
        all_weighted.append(weighted)
        
        all_dupe_suggestions.append(dupe_suggestions)
        all_new_suggestions.append(new_suggestions)
#         counts = collections.Counter(all_new_suggestions)
#         weigthed = counts.most_common()
    return [all_live_suggestions, all_dupe_suggestions, all_new_suggestions, all_weighted]
test_check = to_check.values[0:100]

start = time.time()
real_suggestions = get_real_suggestions(test_check)
end = time.time()
print(end - start)

252.20378851890564


Our function returns a batch of new legitimate suggested tags for each asset (ideally), sorted by frequency.  

In [43]:
for i in range(len(real_suggestions[0])):
    print('\n\n Asset', i, '\n\nORIGINAL TAG LIST: \n', test_check[i], '\nSUGGESTIONS: \n', real_suggestions[0][i], '\nDUPLICATES: \n', real_suggestions[1][i], '\nLEGITIMATE SUGGESTIONS:  \n', real_suggestions[2][i],  '\nBY WEIGHT:  \n', real_suggestions[3][i])



 Asset 0 

ORIGINAL TAG LIST: 
 ['film industry' 'arts culture and entertainment' 'Choreography' 'Dance'
 'celebrities' '775621438' 'awards ceremony' 'BFselects_FTP'
 'Entertainment' 'Fashion Design' 'Performing Arts' None None None None
 None None None None None None None None None None None None None None
 None None None None None None None None None None None None None None
 None None None None None None None None] 
SUGGESTIONS: 
 ['fashion', 'music', 'entertainment', 'purple', 'event', 'film industry', 'magenta', 'blue', 'red', 'pink', 'fashion', 'film industry', 'music', 'entertainment', 'blue', 'red', 'purple', 'fashion', 'music', 'entertainment', 'purple', 'event', 'film industry', 'magenta', 'blue', 'red', 'pink', 'pink', 'purple', 'event', 'red', 'magenta', 'entertainment', 'fashion', 'fashion', 'music', 'entertainment', 'purple', 'event', 'film industry', 'magenta', 'blue', 'red', 'pink', 'pink', 'event', 'purple', 'entertainment', 'red', 'magenta', 'fashion', 'music', 'ent

To do: scoring/stats, etc   
We see at least one valid, common suggestion (defined as a tag that was actually attached by a user in our initial data set) for approx 80% of assets. (based on 1000 assets)

In [44]:
dupes = real_suggestions[1]
# def condition(x): return len(x) == 0
output = [idx for idx, element in enumerate(dupes) if len(element) > 0]
len(output)/len(real_suggestions[1])


0.86

Our function suggests an average of approx 9 common tags per asset.

In [45]:
new_unique = real_suggestions[3]
new_unique_lengths = []
for nu in new_unique:
    l = len(nu)
    new_unique_lengths.append(l)
sum(new_unique_lengths)/len(new_unique_lengths)


17.62

In [46]:
#random list of common tags

rand_tags = w2v_model.retrieve_rare(3)
rand_tags

['Font', 'Font', 'Fur']

In [47]:
nb_end = time.time()
print(nb_end - nb_start)

1455.2216267585754


In [48]:
get_stats['time'] = (nb_end - nb_start)

In [49]:
get_stats

{'number_of_suggestions': 20,
 'w2v_epochs': 2,
 'w2v': 0.799,
 'lm': 0.388,
 'vanilla_bert': 0.583,
 'time': 1455.2216267585754}

# Model comparisons 


{'number_of_suggestions': 5,
 'w2v_epochs': 2,
 'w2v': 0.644,
 'lm': 0.21,
 'vanilla_bert': 0.43}    
 {'number_of_suggestions': 10,
 'w2v_epochs': 2,
 'w2v': 0.747,
 'lm': 0.32,
 'vanilla_bert': 0.51}    
 {'number_of_suggestions': 15,
 'w2v_epochs': 2,
 'w2v': 0.779,
 'lm': 0.38,
 'vanilla_bert': 0.52}    
 {'number_of_suggestions': 20,
 'w2v_epochs': 2,
 'w2v': 0.804,
 'lm': 0.43,
 'vanilla_bert': 0.53}    
 {'number_of_suggestions': 25,
 'w2v_epochs': 2,
 'w2v': 0.84,
 'lm': 0.5,
 'vanilla_bert': 0.65}
 {'number_of_suggestions': 30,
 'w2v_epochs': 2,
 'w2v': 0.825,
 'lm': 0.36,
 'vanilla_bert': 0.61}    
 {'number_of_suggestions': 35,
 'w2v_epochs': 2,
 'w2v': 0.858,
 'lm': 0.57,
 'vanilla_bert': 0.71}    
 {'number_of_suggestions': 40,
 'w2v_epochs': 2,
 'w2v': 0.838,
 'lm': 0.42,
 'vanilla_bert': 0.65}      
