Goal: *Try to get the vocabulary as close to the embeddings as possible.* 


Notes
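+ The token `url` appears 3907 times in the `processed_text_deep` column
+ `url` does not appear in the embeddings vocabulary
+ Added the column `processed_text_deep_without_url`, which is `processed_text_deep` with `url` removed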


Results:  
+ Found embeddings for 70.81% of vocab  
+ Found embeddings for  92.91% of all text

In [None]:
import pandas as pd
from ast import literal_eval
import re

In [None]:
def vocabulary(tweets):
    """
    Build a word-frequency table from the given tweets.

    Every tweet is split on whitespace and each resulting token is counted.

    Parameters:
        tweets (list): strings whose whitespace-separated words are counted
    
    Returns:
        (dictionary): maps each word to the number of times it occurs across all tweets.
    """
    counts = {}
    for text in tweets:
        for token in text.split():
            # start at 0 for a token seen for the first time
            counts[token] = counts.get(token, 0) + 1
    return counts

In [None]:
def differ(voc, emb):
    """
    Check the difference of the vocabulary and the embeddings.

    Prints two coverage figures: the share of distinct vocabulary words that
    have an embedding, and the share of all word occurrences they account for.
    An empty vocabulary (or one whose counts sum to zero) is reported as
    0.00% coverage instead of raising ZeroDivisionError.

    Parameters:
        voc (dictionary): vocabulary used in tuning of the deep learning algorithm,
            mapping each word to its number of occurrences.
        emb (set): embeddings used in pretraining; only membership (`in`) is used.

    Returns:
        (dictionary): words in vocabulary not found in the embeddings, mapped to
            their occurrence counts. Entries keep the iteration order of `voc`;
            they are NOT sorted, so sort the items if a ranking is needed.
    """
    notfound = {}
    found_words = 0        # number of distinct words that have an embedding
    count_found = 0        # total number of occurrences of words found
    count_not_found = 0    # total number of occurrences of words not found
    for word, occurrences in voc.items():
        if word in emb:
            found_words += 1
            count_found += occurrences
        else:
            notfound[word] = occurrences
            count_not_found += occurrences

    # guard both ratios: an empty vocabulary would otherwise divide by zero
    total_occurrences = count_found + count_not_found
    vocab_share = found_words / len(voc) if voc else 0.0
    text_share = count_found / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    return notfound

In [None]:
# read the vocabulary/embeddings used in pretraining:
# every line of the file, stripped of surrounding whitespace, becomes one set entry
with open('../dictionaries/vocab.txt') as f:
    emb = set(map(str.strip, f))

In [None]:
# load the training dataset used for tuning into a DataFrame
tweets_df = pd.read_csv('../dataset/train_dropduplicates.csv')
# bare expression as the last line of the cell so the notebook renders the frame
tweets_df

In [None]:
# create the vocabulary based on a processed column from the dataset.
# Missing values are replaced by '' first: str(NaN) would otherwise inject a
# bogus 'nan' token into the vocabulary and skew the coverage figures.
tweets = [
    str(tweet).lower()
    for tweet in tweets_df['processed_text_deep_without_url'].fillna('')
]
voc = vocabulary(tweets)

In [None]:
# words of the vocabulary that have no embedding, with their occurrence counts
d = differ(voc, emb)
# show them from most to least frequent (last expression is displayed by the notebook)
sorted(d.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# load the processed test dataset and add a copy of the text with 'url' removed
tweets_df = pd.read_csv('../dataset/test_processed.csv')
# Vectorised literal replacement (regex=False) instead of a per-row loop:
# missing values stay missing rather than raising AttributeError, and the
# column is created even when the frame has no rows.
# NOTE(review): as before, every occurrence of the substring 'url' is removed,
# including inside longer words (e.g. 'curl' -> 'c'). Kept as-is; confirm
# whether whole-token removal is what is actually wanted.
tweets_df['processed_text_deep_without_url'] = (
    tweets_df['processed_text_deep'].str.replace('url', '', regex=False)
)