# TAG CORPUS CONSOLIDATION

## Imports and function definition

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#Import from custom w2v model file
import w2v_model
import os, sys, time
import numpy as np
import re
import string
import random
import itertools
from itertools import chainimport torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
import collections
from collections import defaultdict
from __future__ import print_function
from __future__ import division
from scipy import stats, optimize
import utils, vocabulary

utils.require_package("tqdm")  # for nice progress bars
from tqdm import tqdm as ProgressBar

# # Bokeh for plotting.
utils.require_package("bokeh")
import bokeh.plotting as bp
from bokeh.models import HoverTool
bp.output_notebook()

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

## PART 1: SIMPLE EMBEDDING

to do:   
use loss and test function to fine-tune embeddings 
add preprocessing notebook or keep it separate?

We define functions to retrieve tag data, preprocess the results, find similar embeddings, and introduce a functional test (inspired by masked-language-model training)

In [1]:
nb_start = time.time()

lemma_list = []
word_list = []

lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stop_words = set(stopwords.words('english'))


def word_preprocessing(word):
    lower = word.lower()
    punct_replacer = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    rem_punct = lower.translate(punct_replacer)
    lemma = [lemmatizer.lemmatize(w) for w in nltk.word_tokenize(rem_punct)]
    rem_stop = [w for w in lemma if not w in stop_words]
    rem_digits = [re.sub('\d', '<dig>', i) for i in rem_stop]
    lemma_list.append(rem_digits)
    word_list.append(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ejhaselden/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ejhaselden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ejhaselden/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def get_top_matches(model, test_list):
    """
    Given an embedding model and list of tags, gets most similar results based on Word2Vec embeddings 
    (model constructed in w2v_model.py).
    Runs on one row (asset) at a time.
    """
#     ref_list = []
    matches = {}
#     not_found = 0
    for lstring in test_list:
        tagset = []
        try:
            match = model.wv.most_similar(lstring)
    #         print(ls, ' : ', match)
#             ref_list.append(lstring)
            for tag in range(len(match)):
                tagset.append(match[tag][0])

#             ref_list.append(tagset)
            matches[lstring] = tagset
        except KeyError:
    #         print(ls, ' : ','NOT_FOUND')
            pass
#     print("Not found", not_found)
    return matches

# search_list = ['blue']
# get_top_matches(model, search_list)

In [4]:
def valid_prediction(test_dictionary):
    """
    Selects one random key from dictionary and determines if any values for that key
    match any other keys in the dictionary (in other words, whether the model's
    suggestion for a given tag matches any existing tags for the same asset).
    """
    
    rand = random.randint(0, len(test_dictionary) - 1)
    keylist = list(test_dictionary.keys())
    key = keylist[rand]
    suggestions = test_dictionary[key]
#     print(key, sugestions)
    matches = 0
    for suggestion in suggestions:
        for key in keylist:
            if suggestion == key:
#                 print("MATCH!", suggestion,  key)
                return("MATCH")
    return("NO MATCH")
    

We get embedding model and compute loss (for use in hyperparameter tuning)

In [6]:
#get embedding model and compute loss (for use in hyperparameter tuning)
for i in range(1,2):
    print('epoch:', i)
    start = time.time()
    epochs = i
    vec_size = 10
    window = 5
    vec_model = w2v_model.retrieve_model_no_id(epochs, vec_size, window)
    end = time.time()
    print('elapsed:', end - start)
    loss = vec_model.get_latest_training_loss()
    # perplexity = 2**loss
    print('loss:', loss)



epoch: 1
elapsed: 31.088252782821655
loss: 4402973.0


We compile the set of tags for each asset. For each of those tags, we then get  a list of the most similar tags based on the W2V model.

In [7]:
##to do: Make this a function with a parameter for each type of model

# get lemmatized tag df with 1 row per asset and each tag in a separate column, covert to list of lists
test_df = w2v_model.retrieve_expanded_query()
test_vals = test_df.values[0:100]


#use top_matches method to create a dictionary of related tags suggested by embedding model
asset_dicts = []
start = time.time()
for i in range(len(test_vals)):
#     print("remaining:", len(test_vals) - i)
    test_list = test_vals[i][test_vals[i] != None]
#     rate.append(test_list)
# #     print(test_list)
    top_matches = get_top_matches(vec_model, test_list)
    asset_dicts.append(top_matches)
# #     str(test_list)
end = time.time()
print("elapsed:", end - start)
# asset_dicts

elapsed: 0.8590764999389648


Now we have a list of dictionaries where each key is a tag for that asset and each set of values is a list of potential suggestions based on the W2V embeddings.


In [11]:
# asset_dicts[0].keys()

In [12]:
# asset_dicts[0].values()

We test the effectiveness of this suggestion set by selecting a random tag from each asset and seeing if it matches any other tag assigned to that asset. In other words, if one key matches one of another key's values.    
    
to do: (Consider averaging this over a few iterations)

In [13]:
#use valid_prediction method to determine useful suggestions
asset_results = []
for i in asset_dicts:
#     print(valid_prediction(i), i.keys())
    asset_results.append(valid_prediction(i))
print("rate of valid suggestions:", asset_results.count("MATCH")/len(asset_results))
# asset_results.count("MATCH")

rate of valid suggestions: 0.74


## PART 2: LANGUAGE MODEL    
     
code credit: https://github.com/datasci-w266/2021-summer-main/tree/master/materials/simple_lm   
to do: bring in smooting, more scoring from lm1.ipynb?  


In [16]:
get_query = w2v_model.lm_retrieve_query()
wordlist = get_query['cn'].tolist()

In [17]:
def normalize_counter(c):
    """Given a dictionary of <item, counts>, return <item, fraction>."""
    total = sum(c.values())
    return {w:float(c[w])/total for w in c}

class SimpleTrigramLM(object):
    def __init__(self, words):
        """Build our simple trigram model."""
        # Raw trigram counts over the corpus. 
        # c(w | w_1 w_2) = self.counts[(w_2,w_1)][w]
        self.counts = defaultdict(lambda: defaultdict(lambda: 0.0))
    
        # Iterate through the word stream once.
        w_1, w_2 = None, None
        for word in words:
            if w_1 is not None and w_2 is not None:
                # Increment trigram count.
                self.counts[(w_2,w_1)][word] += 1
            # Shift context along the stream of words.
            w_2 = w_1
            w_1 = word
            
        # Normalize so that for each context we have a valid probability
        # distribution (i.e. adds up to 1.0) of possible next tokens.
        self.probas = defaultdict(lambda: defaultdict(lambda: 0.0))
        for context, ctr in self.counts.items():
            self.probas[context] = normalize_counter(ctr)
            
    def next_word_proba(self, word, seq):
        """Compute p(word | seq)"""
        context = tuple(seq[-2:])  # last two words
        return self.probas[context].get(word, 0.0)
    
    def predict_next(self, seq):
        """Sample a word from the conditional distribution."""
        context = tuple(seq[-2:])  # last two words
        pc = self.probas[context]  # conditional distribution
        words, probs = zip(*pc.items())  # convert to list
        return np.random.choice(words, p=probs)
    
    def score_seq(self, seq, verbose=False):
        """Compute log probability (base 2) of the given sequence."""
        score = 0.0
        count = 0
        # Start at third word, since we need a full context.
        for i in range(2, len(seq)):
            if (seq[i] == "<s>" or seq[i] == "</s>"):
                continue  # Don't count special tokens in score.
            s = np.log2(self.next_word_proba(seq[i], seq[i-2:i]))
            score += s
            count += 1
            # DEBUG
            if verbose:
                print("log P({:s} | {:s}) = {.03f}".format(seq[i], " ".join(seq[i-2:i]), s))
        return score, count

In [18]:
split=0.8
sentences = np.array(list(wordlist), dtype=object)
fmt = (len(sentences), sum(map(len, sentences)))
print("Loaded {:,} sentences ({:g} tokens)".format(*fmt))


rng = np.random.RandomState()
rng.shuffle(sentences)  # in-place
split_idx = int(split * len(sentences))
train_sents = sentences[:split_idx]
test_sents = sentences[split_idx:]

for l in range(len(train_sents)):
    train_sents[l] = train_sents[l].split(", ")
for l in range(len(test_sents)):
    test_sents[l] = test_sents[l].split(", ")
# train_sents = train_sents.split(",")
# test_sents = test_sents.split(",")

fmt = (len(train_sents), sum(map(len, train_sents)))
print("Training set: {:,} sentences ({:,} tokens)".format(*fmt))
fmt = (len(test_sents), sum(map(len, test_sents)))
print("Test set: {:,} sentences ({:,} tokens)".format(*fmt))


Loaded 779,402 sentences (1.43624e+08 tokens)
Training set: 623,521 sentences (9,232,391 tokens)
Test set: 155,881 sentences (2,303,808 tokens)


In [19]:
train_sents

array([list(['Performance', 'Public Event', 'Entertainment', 'Performing Arts', 'Violet', 'Event', 'Stage', '775550602', 'Pop Music', 'Song', 'Fashion', 'Music Artist', 'Performance Art']),
       list(['Celebrities', 'Music Artist', 'Public Event', 'Performing Arts', 'Arts Culture and Entertainment', '775589786', 'Musical Theatre', 'Event', 'Music', 'Fashion', 'Musician', 'Performance', 'Entertainment', 'BFfulltakes_FTP', 'Performance Art']),
       list(['Product', 'MasterList']), ...,
       list(['Musician', 'Microphone', 'Celebrities', 'Music Artist', 'Awards Ceremony', 'BFfulltakes_FTP', 'Performing Arts', 'Arts Culture and Entertainment', 'Arm', 'Fun', 'Music', '775621471', 'Film Industry', 'Purple', 'Entertainment']),
       list(['Sport', '775639767', 'Fan', 'Baseball Cap', 'Baseball Player', 'Glasses', 'Recreation', 'Sunglasses', 'Player', 'Gesture', 'Competition Event', 'Cap', 'Crowd', 'Championship', 'Horse Racing', 'T Shirt', 'Eyewear', 'Baseball Field', 'Hat', 'BFfulltake

In [20]:
# Word processing functions
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    word = word.lower()
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset):
        return word
    else:
        return constants.UNK_TOKEN

def canonicalize_words(words, **kw):
    return [canonicalize_word(word, **kw) for word in words]

In [21]:
vocab = vocabulary.Vocabulary(canonicalize_word(w) for w in ProgressBar(utils.flatten(train_sents)))
print("Train set vocabulary: %d words" % vocab.size)

100%|██████████| 9232391/9232391 [00:28<00:00, 325106.14it/s]

Train set vocabulary: 39563 words





In [22]:
def sents_to_tokens(sents):
    """Returns an flattened list of the words in the sentences, with padding for a trigram model."""
    padded_sentences = (["<s>", "<s>"] + s + ["</s>"] for s in sents)
    # This will canonicalize words, and replace anything not in vocab with <unk>
    return np.array([utils.canonicalize_word(w, wordset=vocab.wordset) 
                     for w in ProgressBar(utils.flatten(padded_sentences))], dtype=object)

train_tokens = sents_to_tokens(train_sents)]z_
test_tokens = sents_to_tokens(test_sents)

t0 = time.time()
print("Building trigram LM...",)
lm = SimpleTrigramLM(train_tokens)
print("done in %.02f s" % (time.time() - t0))

100%|██████████| 11102954/11102954 [00:10<00:00, 1016869.43it/s]
100%|██████████| 2771451/2771451 [00:02<00:00, 1016028.42it/s]


Building trigram LM...
done in 23.15 s


### Sampling Sentences    
Before we run scoring, let's look at some generated sentences from our model. We'll generate them sequentially, one token at a time:

In [23]:
def lm_predictions(l_model, tag, max_length):
        seq = ["<s>", tag]
        for i in range(max_length):
            try:
                seq.append(l_model.predict_next(seq))
            # Stop at end-of-sentence
    #         if seq[-1] == "</s>": break
            except ValueError:
                seq.append('nodata_nodata')
#         print(" ".join(seq))
#         print("[{1:d} tokens; log P(seq): {0:.02f}]".format(*lm.score_seq(seq)))
#         print("")
        seq = seq[2:]
        return seq

lm_predictions(lm, 'blue', 15)

['beer company',
 'ufo',
 'logo',
 'vertical',
 '</s>',
 '<s>',
 '<s>',
 'arts culture and entertainment',
 'DGDGDGDGDGDGDGDGDG',
 'film industry',
 'wheelchair',
 'DGDGDGDGDGDGDGDGDG',
 'community',
 'face',
 'arts culture and entertainment']

### Scoring  

In [24]:
log_p_data, num_real_tokens = lm.score_seq(train_tokens)
print("Train perplexity: {:.02f}".format(2**(-1*log_p_data/num_real_tokens)))

log_p_data, num_real_tokens = lm.score_seq(test_tokens)
print("Test perplexity: {:.02f}".format(2**(-1*log_p_data/num_real_tokens)))

Train perplexity: 21.27


  s = np.log2(self.next_word_proba(seq[i], seq[i-2:i]))


Test perplexity: inf


In [25]:
lengths = []
for i in asset_dicts:
    lengths.append(len(i.values()))
np.average(lengths)

15.28

In [26]:
def lm_get_top_matches(l_model, test_list):
    """
    Given a language model and list of tags, gets most similar results.
    Runs on one row (asset) at a time.
    """
#     ref_list = []
    matches = {}
#     not_found = 0
    for lstring in test_list:
#         print(lstring)
        tagset = []
        try:
            match = lm_predictions(lm, lstring, 15)
            for tag in range(len(match)):
#             for tag in range(2):
                tagset.append(match[tag])

# #             ref_list.append(tagset)
            matches[lstring] = tagset
        except KeyError:
    #         print(ls, ' : ','NOT_FOUND')
            pass
#     print("Not found", not_found)
    return matches



In [None]:
##to do: Make this a function with a parameter for each type of model

# get lemmatized tag df with 1 row per asset and each tag in a separate column, covert to list of lists
test_df = w2v_model.retrieve_expanded_query()
lm_test_vals = test_df.values[0:100]
  
#use top_matches method to create a dictionary of related tags suggested by embedding model
lm_asset_dicts = []
start = time.time()
for i in range(len(lm_test_vals)):
#     print("remaining:", len(test_vals) - i)
    lm_test_list = lm_test_vals[i][lm_test_vals[i] != None]
#     rate.append(test_list)
# #     print(test_list)
    lm_top_matches = lm_get_top_matches(lm, lm_test_list)
    lm_asset_dicts.append(lm_top_matches)
# #     str(test_list)
end = time.time()
print("elapsed:", end - start)
lm_asset_dicts

In [28]:
#use valid_prediction method to determine useful suggestions
lm_asset_results = []
for i in lm_asset_dicts:
#     print(valid_prediction(i), i.keys())
    lm_asset_results.append(valid_prediction(i))
print("Rate of valid suggestions:", lm_asset_results.count("MATCH")/len(lm_asset_results))
# asset_results.count("MATCH")

Rate of valid suggestions: 0.51


## BERT  
Given that our corpus is full of unusual terms and that our "sentences" are order-agnostic, BERT's pre-trained bi-directional nature makes it a counterintuitive choice. We propose a novel application, however, in which BERT is fine-tuned on our tag corpus (onto which we impose the arbitrary order of alphabeticization).   
    
        
BERT plan:    
1. create version of reconstructed_assets with rare tags removed, each set of tags alphabetized   
2. fine-tune BERT on that <<---- this is where I'm stuck :( 
3. give BERT an unedited tag list for a given asset, with rare tags changed to [MASK]



In [29]:
#code credit: https://gist.github.com/yuchenlin/a2f42d3c4378ed7b83de65c7a2222eb2
# !pip install torchvision 


Vanilla BERT 

In [30]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
def predict_masked_sent(text, top_k=5):
    # Tokenize input
    text = "[CLS] %s [SEP]"%text
    tokenized_text = tokenizer.tokenize(text)
    masked_index = tokenized_text.index("[MASK]")
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    # tokens_tensor = tokens_tensor.to('cuda')    # if you have gpu

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1)
    top_k_weights, top_k_indices = torch.topk(probs, top_k, sorted=True)

    for i, pred_idx in enumerate(top_k_indices):
        predicted_token = tokenizer.convert_ids_to_tokens([pred_idx])[0]
        token_weight = top_k_weights[i]
        print("[MASK]: '%s'"%predicted_token, " | weights:", float(token_weight))

        
predict_masked_sent("BERT is [MASK] at this.", top_k=5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[MASK]: 'surprised'  | weights: 0.1910516619682312
[MASK]: 'shocked'  | weights: 0.16232925653457642
[MASK]: 'amused'  | weights: 0.08754459768533707
[MASK]: 'stunned'  | weights: 0.07133634388446808
[MASK]: 'horrified'  | weights: 0.0690688043832779


Vanilla BERT given a test tag sequence

In [31]:
predict_masked_sent("[MASK], 'Lighting', 'Arts Culture and Entertainment', 'Light Fixture', 'Water', 'Chandelier', 'Ceiling Fixture', 'Ceiling', 'BFfulltakes_FTP'", top_k=5)

[MASK]: 'lighting'  | weights: 0.07484932988882065
[MASK]: 'including'  | weights: 0.02368135191500187
[MASK]: 'music'  | weights: 0.013157987967133522
[MASK]: 'architecture'  | weights: 0.00914605800062418
[MASK]: 'website'  | weights: 0.00885855220258236


In [32]:
##  TEST WITH FUNCTION

## ACTUAL SUGGESTION FUNCTION    
We loop through the list of assets and attempt to offer alternative tags for any rare tags that we enconter.

Using the simple W2V embeddings for now

In [33]:
to_check = w2v_model.lm_retrieve_expanded_query()
to_check

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,film industry,arts culture and entertainment,Choreography,Dance,celebrities,775621438,awards ceremony,BFselects_FTP,Entertainment,Fashion Design,...,,,,,,,,,,
1,775637829,Basketball,Sport,Basketball Moves,Sports Uniform,Footwear,Contact Sport,ozteam,Justinian Jessup,Active Shorts,...,,,,,,,,,,
2,775635494,Audio Equipment,Spokesperson,Public Address System,Keyboard,Microphone,Music Artist,Entertainment,BFfulltakes_FTP,Microphone Stand,...,,,,,,,,,,
3,775578330,10.21.2020 CMT Awards Fan Viewing Party_Kempin,Darkness,Music,Performance,Sky,Fashion,Celebrities,BFfulltakes_FTP,Arts Culture and Entertainment,...,,,,,,,,,,
4,775641976,Green,Eyelash,Hair,Font,Dress,Arm,BFfulltakes_FTP,Arts Culture and Entertainment,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779397,BFfulltakes_FTP,,,,,,,,,,...,,,,,,,,,,
779398,BFfulltakes_FTP,,,,,,,,,,...,,,,,,,,,,
779399,BFfulltakes_FTP,,,,,,,,,,...,,,,,,,,,,
779400,BFfulltakes_FTP,,,,,,,,,,...,,,,,,,,,,


TO DO:  
add more iterations to feed result list of tags back into function for further refinement 

In [34]:
#given a list of tags, identify those that are rare and provide suggested replacements
def get_candidates(tag_list):
    start = time.time()
    lemma_dict = w2v_model.lemma_map()
    lemma_common_tags = []
    lemma_rare_tags = []
    candidates = []
#     cand_list = []
    for tag in tag_list:
        try:
            if w2v_model.is_not_rare(lemma_dict[tag]):
                lemma_common_tags.append(lemma_dict[tag])
                continue
            else:
                lemma_rare_tags.append(lemma_dict[tag])
        except KeyError:
            if w2v_model.is_not_rare(tag):
                continue
        try:    
            if tag is not None:
                candidate = get_top_matches(vec_model, ['', lemma_dict[tag]])
    #             print('tag: ', tag, '\nsuggestions:', candidates.values(),'\n')
#                 cand_list.append(candidate)
                candidates.append(candidate.values())
        except KeyError:
            pass
    flat = list(chain(*candidates))
    flatter = list(chain(*flat)) 
#     print('lemma common tags: ', lemma_common_tags, '\nlemma rare tags: ', lemma_rare_tags)      
    return flatter

def suggest_better_tags(list_of_tags):
    tag_candidates = get_candidates(list_of_tags)
    tag_candidates = [w2v_model.delete_rare(tc) for tc in tag_candidates]
    tag_candidates = [tag for tag in tag_candidates if tag != ""]
    
    return tag_candidates
end = time.time()
print(end - start)
# list_of_tags = to_check.values[2]
# list_of_tags = ["Saw"]
# zzz = suggest_better_tags(list_of_tags, 1)
# zzz
# # print(lemma_dict(list_of_tags), zzz)
# # yyy = get_candidates(list_of_tags)
# # yyy
# list_of_tags

41.64086318016052


We gather suggestions for every tag and concatenate those into a
single suggestion list.    
Then we discard uncommon (rare) suggestions and identify suggestions that were already tags for the given asset (duplicates).     
The final list of suggested tags then consists only of common tags that are not already applied to the asset.   
We also track the duplicates so they can be used to validate the usability of our suggestions.

In [35]:
'''
For each asset (row), gather suggestions for every tag and concatenate those into a
single suggestion list. Then discard uncommon (rare) suggestions and identify
suggestions that were already tags for the given asset (duplicates). The final list of suggested tags
is then only common tags that are not already applied to the asset.

This function also tracks the duplicates so they can be used to validate the usability of our suggestions.
'''

def get_real_suggestions(existing_tags):
    #GIVE THIS FUNCTION AT LEAST TWO LISTS OF TAGS OR A LIST WRAPPED IN AN EMPTY LIST
    start = time.time()
    count = len(existing_tags)
    
    all_live_suggestions = []
    all_new_suggestions = []
    all_dupe_suggestions = []
    all_weighted = []

    lemma_dict = w2v_model.lemma_map()
    for i in range(count):
        live_tags = existing_tags[i]
#         print("tags in \n", live_tags, '\n')
        live_suggestions = suggest_better_tags(live_tags)
        all_live_suggestions.append(live_suggestions)
    #     print("all suggestions \n", live_suggestions, '\n')
        new_suggestions = []
        dupe_suggestions = []
        for sug in live_suggestions:
            for tag in live_tags:
                if sug == lemma_dict.get(tag):
                    if sug not in dupe_suggestions:
                        dupe_suggestions.append(sug)
                    continue              
            if sug not in dupe_suggestions:
                new_suggestions.append(sug)
        counts = collections.Counter(new_suggestions)
        weighted = counts.most_common()
        all_weighted.append(weighted)
        
        all_dupe_suggestions.append(dupe_suggestions)
        all_new_suggestions.append(new_suggestions)
#         counts = collections.Counter(all_new_suggestions)
#         weigthed = counts.most_common()
    return [all_live_suggestions, all_dupe_suggestions, all_new_suggestions, all_weighted]
test_check = to_check.values[0:100]

start = time.time()
real_suggestions = get_real_suggestions(test_check)
end = time.time()
print(end - start)

72.75653457641602


Our function returns a batch of new legitimate suggested tags for each asset (ideally), sorted by frequency.  

In [36]:
for i in range(len(real_suggestions[0])):
    print('\n\n Asset', i, '\n\nORIGINAL TAG LIST: \n', test_check[i], '\nSUGGESTIONS: \n', real_suggestions[0][i], '\nDUPLICATES: \n', real_suggestions[1][i], '\nLEGITIMATE SUGGESTIONS:  \n', real_suggestions[2][i],  '\nBY WEIGHT:  \n', real_suggestions[3][i])



 Asset 0 

ORIGINAL TAG LIST: 
 ['film industry' 'arts culture and entertainment' 'Choreography' 'Dance'
 'celebrities' '775621438' 'awards ceremony' 'BFselects_FTP'
 'Entertainment' 'Fashion Design' 'Performing Arts' None None None None
 None None None None None None None None None None None None None None
 None None None None None None None None None None None None None None
 None None None None None None None None] 
SUGGESTIONS: 
 ['fashion', 'film industry', 'music', 'pink', 'fashion', 'music', 'pink', 'music', 'red', 'purple', 'fashion', 'music', 'event', 'entertainment', 'purple', 'film industry', 'red', 'fashion', 'music', 'fashion', 'event', 'entertainment', 'purple', 'bestof', 'color image', 'topix', 'film industry', 'horizontal', 'vertical', 'fashion', 'musician', 'entertainment', 'music'] 
DUPLICATES: 
 ['film industry', 'entertainment'] 
LEGITIMATE SUGGESTIONS:  
 ['fashion', 'music', 'pink', 'fashion', 'music', 'pink', 'music', 'red', 'purple', 'fashion', 'music', 'event

To do: scoring/stats, etc   
We see at least one valid, common suggestion (defined as a tag that was actually attached by a user in our initial data set) for approx 80% of assets. (based on 1000 assets)

In [38]:
dupes = real_suggestions[1]
# def condition(x): return len(x) == 0
output = [idx for idx, element in enumerate(dupes) if len(element) > 0]
len(output)/len(real_suggestions[1])


0.84

Our function suggests an average of approx 10 common tags per asset.

In [39]:
new_unique = real_suggestions[3]
new_unique_lengths = []
for nu in new_unique:
    l = len(nu)
    new_unique_lengths.append(l)
sum(new_unique_lengths)/len(new_unique_lengths)


9.59

In [None]:
#random list of common tags

rand_tags = w2v_model.retrieve_rare(3)
rand_tags

In [None]:
nb_end = time.time()
print(nb_end - nb_start)