In [1]:
import re
import pandas as pd
import collections
import math
import pickle
import timeit
from random import *
import random
from dateutil import parser

# Start With Existing Data Here [Acronym, Meaning, Content]

In [17]:
full_df = pd.read_pickle('DataSets/custom_acr_mean_cont.pkl')

# Remove people in the following dataset:
This dataset was created by training an LSTM classifier to classify people in the wikipedia dataset

In [18]:
ppl_remove = pd.read_pickle('DataSets/people_to_remove.pkl')

In [19]:
print(len(full_df))
full_df = full_df[~full_df['Meaning'].isin(list(ppl_remove['Meaning']))]
print(len(full_df))

510519
410954


# Clean Data

In [20]:
# To Lower Case
full_df = full_df.apply(lambda x: x.astype(str).str.lower())

### Remove rows with empty aconym, meaning, or content

In [21]:
def check_empty(row):
    mean = row['Meaning']
    acr = row['Acronym']
    cont = row['Content']
    
    if len(cont) < 2:
        return 1
    else:
        return 0

In [22]:
full_df['Remove'] = full_df.apply(check_empty, axis=1)

In [23]:
# Drop rows
print(len(full_df))
full_df = full_df[full_df['Remove'] == 0]
print(len(full_df))

# Delete Remove Column
del full_df['Remove']

410954
410915


### Remove duplicate rows

In [24]:
print(len(full_df))
full_df = full_df.drop_duplicates(subset=['Acronym', 'Meaning', 'Content'], keep='first')
print(len(full_df))

410915
409709


### Remove Rows With Dates
These mainly contain people, songs, tv shows, and other non-relevant content

In [25]:
def contains_date(content):
    try:
        dates_str = content[content.find("(")+1:content.find(")")]
        date_strs = re.split(' – | -', dates_str)
        dts = [parser.parse(d) for d in date_strs]
        return True
    except:
        return False

In [26]:
full_df['Contains_Date'] = full_df['Content'].apply(contains_date)

In [27]:
print(len(full_df))
full_df = full_df[full_df['Contains_Date'] == False]
del full_df['Contains_Date']
print(len(full_df))

409709
376919


### Remove names if content contains (born

In [28]:
# Remove names if content contains '(born'
def remove_names(row):
    c = row["Content"]
    if "(born" in c:
        return True
    else:
        return False

In [29]:
print(len(full_df))
full_df["Remove_Born"] = full_df.apply(remove_names, axis=1)

376919


In [30]:
# As a tangent, might as well save the people as a separate database
people = full_df[full_df['Remove_Born'] == True]
people.to_pickle('DataSets/people.pkl')

full_df = full_df[full_df['Remove_Born'] == False]
del full_df['Remove_Born']

print(len(full_df))

261879


### Lemmatize Content

In [31]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_cont(content):
    return ' '.join([lemmatizer.lemmatize(w) for w in content.split()])

cnt = collections.Counter(' '.join(full_df['Content']).split())

print(len(cnt))

start_time = timeit.default_timer()

full_df['Content'] = full_df['Content'].apply(lemmatize_cont)

print(timeit.default_timer() - start_time)

cnt = collections.Counter(' '.join(full_df['Content']).split())
print(len(cnt))

4055941
901.0211881713005
4032756


In [32]:
def lemmatize_cont(mean):
    return ' '.join([lemmatizer.lemmatize(w) for w in mean.split()])

cnt = collections.Counter(' '.join(full_df['Meaning']).split())

len(cnt)

start_time = timeit.default_timer()

full_df['Meaning'] = full_df['Meaning'].apply(lemmatize_cont)

print(timeit.default_timer() - start_time)

cnt = collections.Counter(' '.join(full_df['Meaning']).split())
len(cnt)

4.65458959529451


151304

### Remove Special Characters

In [33]:
# In Meanings, Replace hyphons with spaces
def rep_hyp_with_sp(row):
    mean = row['Meaning']
    return mean.replace('-', ' ')

In [34]:
full_df['Meaning'] = full_df.apply(rep_hyp_with_sp, axis=1)

In [35]:
# Remove special characters from content
def remove_special_char(row):
    content = row['Content']

    return re.sub('[^A-Za-z0-9\s]+', '', content)

In [36]:
start_time = timeit.default_timer()

full_df['Content'] = full_df.apply(remove_special_char, axis=1)

print(timeit.default_timer() - start_time)

42.98787504476286


### Remove meaningless words

In [37]:
ignore = ['1','2','3','4','5','6','7','8','9','0',' ','or','is','the','be','to','of','and','in','that','have','it','for','not','on','with',
         'a','an','as','do','at','this','but','by']

def remove_meaningless(content):
    return ' '.join([w for w in content.split() if w not in ignore])

start_time = timeit.default_timer()

full_df['Content'] = full_df['Content'].apply(remove_meaningless)

print(timeit.default_timer() - start_time)

101.90124757459944


# Remove Names

### Find names by frequency of seeing his, her, he, she, etc.

In [38]:
person_words = "him his her he she".split()

def person_in(content):
    top = [r[0] for r in collections.Counter(content.split()).most_common()[:10]]
    for w in top:
        if w in person_words:
            return True
    return False
    
people['Person_In'] = people['Content'].apply(person_in)

In [39]:
len(people)

115040

In [40]:
print(len(full_df))
full_df['Person_In'] = full_df['Content'].apply(person_in)

people = people.append(full_df[full_df['Person_In'] == True])
people.to_pickle('DataSets/people.pkl')

full_df = full_df[full_df['Person_In'] == False]
del full_df['Person_In']
print(len(full_df))

261879
223950


In [41]:
len(people)

152969

### Find names by comparing meaning against name database

In [42]:
# Get Names
f = open("DataSets/names.txt", "r")
first_names = []
for r in f:
    first_names += [r.split()[0].lower()]
    
f = open("DataSets/lastnames.txt", "r")
last_names = []
for r in f:
    last_names += [r.split()[0].lower()]

In [43]:
def meaning_is_name(mean):
    return all([True if w in first_names or w in last_names else False for w in mean.split()])

In [44]:
start_time = timeit.default_timer()

print(len(full_df))
full_df['Person_In'] = full_df['Meaning'].apply(meaning_is_name)

people = people.append(full_df[full_df['Person_In'] == True])
people.to_pickle('DataSets/people.pkl')

full_df = full_df[full_df['Person_In'] == False]
del full_df['Person_In']
print(len(full_df))

print(timeit.default_timer() - start_time)

223950
196774
1455.2639543919354


In [45]:
len(people)

180145

In [46]:
# Use this to find new rows that shouldn't be in the data

# idx = randint(0, len(full_df))
# print(full_df['Acronym'].iloc[idx])
# print(full_df['Meaning'].iloc[idx])
# print()
# print(full_df['Content'].iloc[idx])

### Remove rows where each word in Meaning is not common in the corpus

In [47]:
corpus_counts = collections.Counter(' '.join(full_df['Content'].values.tolist()).split())

In [48]:
thresh = 0.000015
total_cnts = sum(corpus_counts.values())

def mean_not_common(mean):
    return all([True if corpus_counts[w]/total_cnts < thresh else False for w in mean.split()])

In [49]:
start_time = timeit.default_timer()

full_df['Not_Common'] = full_df['Meaning'].apply(mean_not_common)
full_df = full_df[full_df['Not_Common'] == False]
del full_df['Not_Common']

print(timeit.default_timer() - start_time)

0.8719645920764378


In [50]:
len(full_df)

153880

In [51]:
# Save Data
full_df.to_pickle('DataSets/before_lemmatize.pkl')

In [72]:
full_df = pd.read_pickle('DataSets/before_lemmatize.pkl')

### Remove words from Content that are infrequent

In [52]:
corpus_counts = collections.Counter(' '.join(full_df['Content'].values.tolist()).split())

total_cnts = sum(corpus_counts.values())
word_tf = {w: corpus_counts[w]/total_cnts for w in corpus_counts.keys()}
filtered_vocab = {w: v for w, v in word_tf.items() if v > 0.000005}
print(len(word_tf))
print(len(filtered_vocab))
collections.Counter(filtered_vocab).most_common()[-20:]

1370778
15433


[('cradle', 5.014356516872947e-06),
 ('nyu', 5.014356516872947e-06),
 ('attributes', 5.014356516872947e-06),
 ('corey', 5.014356516872947e-06),
 ('ideally', 5.014356516872947e-06),
 ('towing', 5.014356516872947e-06),
 ('intramural', 5.014356516872947e-06),
 ('printers', 5.014356516872947e-06),
 ('monochrome', 5.014356516872947e-06),
 ('deportation', 5.014356516872947e-06),
 ('codified', 5.014356516872947e-06),
 ('economies', 5.014356516872947e-06),
 ('reunification', 5.014356516872947e-06),
 ('importing', 5.014356516872947e-06),
 ('estimates', 5.014356516872947e-06),
 ('olivier', 5.014356516872947e-06),
 ('hungry', 5.014356516872947e-06),
 ('viktor', 5.014356516872947e-06),
 ('twentyfour', 5.014356516872947e-06),
 ('confusing', 5.014356516872947e-06)]

In [53]:
len(' '.join(full_df['Content']).split())

67605883

In [54]:
def remove_infrequent(content):
    return ' '.join([w for w in content.split() if w in filtered_vocab.keys()])

start_time = timeit.default_timer()

full_df['Content'] = full_df['Content'].apply(remove_infrequent)

print(timeit.default_timer() - start_time)

29.309187675103203


In [55]:
len(' '.join(full_df['Content']).split())

58388288

In [56]:
# Save Data
full_df.to_pickle('DataSets/custom_clean_acronym_intermediate.pkl')

# Start With Existing Data Here

In [57]:
full_df = pd.read_pickle('DataSets/custom_clean_acronym_intermediate.pkl')

### Remove rows where letters in Acronym are not letters or numbers

In [58]:
proper_acr = 'a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9 0'.split()

def remove_by_acr(acr):
    for l in acr:
        if l not in proper_acr:
            return True
    return False

In [59]:
print(len(full_df))
full_df['Remove'] = full_df['Acronym'].apply(remove_by_acr)
full_df = full_df[full_df['Remove'] == False]
del full_df['Remove']
print(len(full_df))

153880
153342


### Get Term Frequency

In [60]:
N = len(full_df)
def get_tf(row):
    content = row['Content']
    counts = collections.Counter(content.split())
    
    cont_len = len(content.split())
    tf_dict = {w:counts[w]/cont_len for w in counts}
    
    return tf_dict

In [61]:
start_time = timeit.default_timer()

full_df['TF'] = full_df.apply(get_tf, axis=1)

print(timeit.default_timer() - start_time)

45.66344976816799


### Remove Movies, Albums, Songs w/ TF

In [62]:
remove_words = 'film movie band song singer album imdb actress actor sitcom university school'.split()

In [63]:
def remove_with_tf(row_tf, rem_words):
    tf = collections.Counter(row_tf)
    mc = [x[0] for x in tf.most_common()[:10]]
    for w in mc:
        if w in rem_words:
            return True
        else:
            return False

In [64]:
start_time = timeit.default_timer()

print(len(full_df))
full_df['Remove'] = full_df['TF'].apply(remove_with_tf, args=(remove_words,))
full_df = full_df[full_df['Remove'] == False]
del full_df['Remove']
print(len(full_df))

print(timeit.default_timer() - start_time)

153342
144357
24.14344504024575


### Remove Words that occur too often in corpus

In [65]:
num_words_remove = 50
corpus_counts = collections.Counter(' '.join(full_df['Content'].values.tolist()).split())

total_cnts = sum(corpus_counts.values())
word_tf = {w: corpus_counts[w]/total_cnts for w in corpus_counts.keys()}
corpus_tf = collections.Counter(word_tf)
remove_words = [w for w, v in corpus_tf.most_common()[:num_words_remove]]

In [66]:
pct = sum([v for w, v in corpus_tf.most_common()[:num_words_remove]])*100
print("These words account for " + '{0:.1f}'.format(pct) + "% of the corpus.")

These words account for 13.7% of the corpus.


In [67]:
def remove_frequent(content):
    return ' '.join([w for w in content.split() if w not in remove_words])

start_time = timeit.default_timer()

full_df['Content'] = full_df['Content'].apply(remove_frequent)

print(timeit.default_timer() - start_time)

61.26941607821209


In [68]:
corpus_counts = collections.Counter(' '.join(full_df['Content'].values.tolist()).split())
print('There were ' + str(total_cnts) + ' in the corpus, now there are ' + str(sum(corpus_counts.values())))

There were 54833169 in the corpus, now there are 47332943


In [69]:
# Remove unnecessary columns
# del full_df['TF']
# del full_df['Title']

### Remove acr/meanings from content

In [70]:
# Remove acronyms/meanings from content
def rem_meanings_from_content(row):
    acr = row['Acronym']
    mean = row['Meaning']
    content = row['Content']
    
    for w in mean.split():
        content = content.replace(w, '')
    
    content = content.replace(acr, '')
    
    return content

In [71]:
full_df['Content'] = full_df.apply(rem_meanings_from_content, axis=1)

In [72]:
# Save Data
full_df.to_pickle('DataSets/custom_clean_acronym_with_content.pkl')

# Start With Existing Clean Data Here [Acronym, Meaning, Content]

In [17]:
full_df = pd.read_pickle('DataSets/custom_clean_acronym_with_content.pkl')

# Get TFIDF for Content

### Corpus Counts

In [18]:
corpus_counts = collections.Counter(' '.join(full_df['Content'].values.tolist()).split())

### Calculate TFIDFs for Each Row

In [19]:
N = len(full_df)
def get_tfidf(row):
    content = row['Content']
    counts = collections.Counter(content.split())
    
#     content_dict = {}
#     for w in counts:
#         tf = counts[w]/len(content.split())
#         idf = math.log(N/corpus_counts[w])
#         content_dict[w] = tf*idf

    cont_len = len(content.split())   

    # content_dict = {w: tf*idf}  
    # tf = counts[w]/cont_len
    # idf = math.log(N/corpus_counts[w]) 
                   
    content_dict = {w:(counts[w]/cont_len)*math.log(N/corpus_counts[w]) for w in counts}
    
    return content_dict

In [20]:
start_time = timeit.default_timer()

full_df['TFIDF'] = full_df.apply(get_tfidf, axis=1)

print(timeit.default_timer() - start_time)

43.519573773139314


# Get Tags

In [27]:
num_tags = 50 # Number of tags for each acronym
def get_tags(row):
    tfidf_row = row['TFIDF']
    tfidf = collections.Counter(tfidf_row)
    tags = list(dict(tfidf.most_common()[:num_tags]).keys())
    tags = [tag for tag in tags if len(tag) < 15] # Remove rediculously long tags
    
    min_rand_len = 6
    max_rand_len = len(tags)
    if len(tags) < min_rand_len:
        max_rand_len = min_rand_len
    rand_len = randint(min_rand_len, max_rand_len)
    tags = tags[:rand_len]
    
    return tags

In [28]:
full_df['Tags'] = full_df.apply(get_tags, axis=1)

In [29]:
#del full_df['TFIDF']

### Remove Rows if Len of TFIDF is below thresh

In [30]:
full_df['Length'] = full_df['TFIDF'].apply(lambda x: len(x))

In [31]:
full_df = full_df[full_df['Length'] >= 5]
del full_df['Length']

### Create 'Vocab To Int'/'Int To Vocab' Dictionaries

In [34]:
from itertools import count
vocab_to_int = {'<PAD>': 0, '<UNK>': 1, '</ACR>': 2, '<GO>': 3, '</TAG>': 4, '</MEAN>': 5}

cnt = collections.Counter(' '.join(full_df['Meaning']).split())
cnt.update([t for tags in full_df['Tags'] for t in tags])
cnt.update('a b c d e f g h i j k l m n o p q r s t u v w x y z'.split())

# Keep vocabulary with counts greater than threshold
thresh = 0
c = count(len(vocab_to_int))
vocab_to_int.update({w: next(c) for (w, val) in cnt.items() if val > thresh}) 
int_to_vocab = {i: w for w, i in vocab_to_int.items()}

In [35]:
len(vocab_to_int)

113082

In [36]:
# Save Dictionaries to Pickle files
pickle.dump(vocab_to_int, open("DataSets/custom_vocab_to_int.p", "wb"))
pickle.dump(int_to_vocab, open("DataSets/custom_int_to_vocab.p", "wb"))

### Create duplicate rows with different tag sets

Example:
- acs	access control service	[web, factored, swt, draft, federation]
- acs	access control service	[platform, customizable, python, windows, brow...
- acs	access control service	[programmatic, portal, integration, google, ac...

In [37]:
start_time = timeit.default_timer()

keys = set(vocab_to_int.keys())

num_tags = 50
#interval = 10
num_tag_sets = 1
len_tag_set = 20
df_exp = pd.DataFrame(columns=['Acronym', 'Meaning', 'Tags'])

sliced_tags = []
acronyms = []
meanings = []

for idx, row in full_df.iterrows():
    mc = [r[0] for r in collections.Counter(row['TFIDF']).most_common() if r[0] in keys]
    tags = mc[:num_tags]
    
    
    if len(tags) <= 3:
        continue
    
    #trunc_num_tags = (len(tags)//interval)*interval
    
    #sliced = [tags[i:i+interval] for i in range(0, trunc_num_tags, interval)]
    #sliced_tags += sliced
    
    # Take random values from full num_tags and get a subset of length len_tag_set
    # This will create multiple lists of tags with similar tag sets, but not exactly the same    
    slice_len = len_tag_set
    if slice_len > len(tags):
        slice_len = len(tags)
        
    if len(tags) > 1.5*len_tag_set:  
        tag_set_cnt = num_tag_sets
        for i in range(num_tag_sets):
            sliced_tags += [[tags[i] for i in random.sample(range(len(tags)), slice_len)]]
    else:
        tag_set_cnt = 1
        sliced_tags += [tags]
        
    #list_len = len(sliced)
    
    acronyms += [row['Acronym']]*tag_set_cnt
    meanings += [row['Meaning']]*tag_set_cnt
    
print(timeit.default_timer() - start_time)

KeyboardInterrupt: 

In [86]:
df_exp = df_exp.append(pd.DataFrame({'Acronym': acronyms,'Meaning': meanings,'Tags': sliced_tags}))
full_df = df_exp

In [38]:
# Save Data
full_df.to_pickle('DataSets/custom_acronyms_with_tags.pkl')

# Start with Existing Data Here [Acronym, Meaning, Content, Tags]

In [39]:
df = pd.read_pickle('DataSets/custom_acronyms_with_tags.pkl')
vocab_to_int = pickle.load(open("DataSets/custom_vocab_to_int.p", "rb"))
int_to_vocab = pickle.load(open("DataSets/custom_int_to_vocab.p", "rb"))

In [40]:
len(df)

143150

# Convert Text To IDs

### Data To IDs

In [90]:
# TODO: Probably don't need <ACR> at the beginning of encoder inputs. Might want to try without.

# <ACR>,a,c,r,o,n,y,m,</ACR>,<TAG>,these,are,the,tags</TAG>

# Example:
a = 'acronym'
b = ['these', 'are', 'the', 'tags']
c = ('<ACR> ' + ' '.join(list(a)) + ' </ACR> ' + '<TAG> ' + ' '.join(b) + ' </TAG>').split()
e = 'this is the meaning'
f = ('<MEAN> ' + e + ' </MEAN>').split()
print(c)
print(f)

['<ACR>', 'a', 'c', 'r', 'o', 'n', 'y', 'm', '</ACR>', '<TAG>', 'these', 'are', 'the', 'tags', '</TAG>']
['<MEAN>', 'this', 'is', 'the', 'meaning', '</MEAN>']


In [41]:
max_acr_len = max([len(a) for a in list(df['Acronym'])])
#trunc_thresh = 20 # If There are more than this many tags, than truncate the rest off
# keys = set(vocab_to_int.keys())

def data_2_ids(row):
    acr = row['Acronym']
    mean = row['Meaning']
    tags = row['Tags']
    
    # Get a list of tags that occur in the vocab
#     valid_tags = []
#     for tag in tags:
#         if tag in set(vocab_to_int.keys()):
#             valid_tags += [tag]
#     valid_tags = [tag for tag in tags if tag in keys]
    
    # Trim tags if there are more than thresh
#     if len(valid_tags) > trunc_thresh:
#         valid_tags = valid_tags[:trunc_thresh-1]
    
    # Removed '<ACR> ' and '<TAG>' from beginning of source_text. Don't think I need them.
    # Source text shouldn't need a start delimeter and having </ACR> & <TAG> is redundant.
    #source_text = '<ACR> ' + ' '.join(list(acr)) + ' </ACR> ' + '<TAG> ' + ' '.join(tags) + ' </TAG>'
    #source_text = ' '.join(list(acr)) + ' </ACR> ' + ' '.join(valid_tags) + ' </TAG>'
    source_text = list(acr) + ['</ACR>'] + tags + ['</TAG>']

    # The decoder needs a <GO> tag to know when to start generating output. It also needs an <EOS>
    #     to know when a sentence ends or indicate it as output.
    #target_text = mean + ' </MEAN>'
    target_text = mean.split() + ['</MEAN>']
    
#     # Add <PAD> to end of source
#     for i in range(len(source_text.split()), max_source + 4):
#         source_text += ' <PAD>'
    
#     # Add <PAD> to end of target meaning so all targets have the same length
#     for i in range(len(target_text.split()), max_target + 2):
#         target_text += ' <PAD>'
    
    source_id_text = [vocab_to_int.get(w, vocab_to_int['<UNK>']) for w in source_text if w in vocab_to_int]
    target_id_text = [vocab_to_int.get(w, vocab_to_int['<UNK>']) for w in target_text]
    
    return pd.Series([row['Acronym'], row['Meaning'], row['Tags'], source_id_text, target_id_text])

In [42]:
start_time = timeit.default_timer()

df2 = df.apply(data_2_ids, axis=1)
df2.columns = ['Acronym', 'Meaning', 'Tags', 'Source', 'Target']

print(timeit.default_timer() - start_time)

45.78163758136361


In [43]:
# df2 = df2[df2['Remove'] == False]

In [44]:
# Save Data
df2.to_pickle('DataSets/custom_acronyms_with_tags_src_trgt.pkl')

# Start With Existing Data Here

In [95]:
df2 = pd.read_pickle('DataSets/custom_acronyms_with_tags_src_trgt.pkl')

### Remove Unkown Meanings

In [45]:
# Remove rows where target is all unknowns <UNK>
thresh = 0.50

def mostly_unk(tar):
    unk_cnt = 0
    known_cnt = 0
    
    for w in tar:
        if w == vocab_to_int['<UNK>']:
            unk_cnt += 1
        elif w == vocab_to_int['</MEAN>']:
            pass
        else:
            known_cnt += 1
            
    total = known_cnt + unk_cnt
    pct = unk_cnt/total
    
    if tar[len(tar)-1] == vocab_to_int['</TAG>'] and tar[len(tar)-2] == vocab_to_int['</ACR>']:
        return True
    
    if pct > thresh:
        return True
    else:
        return False

In [46]:
len(df2)

143150

In [47]:
df2['Remove'] = df2['Target'].apply(mostly_unk)
df2 = df2[df2['Remove'] == False]
del df2['Remove']

In [48]:
df2['Remove'] = df2['Source'].apply(mostly_unk)
df2 = df2[df2['Remove'] == False]
del df2['Remove']

In [49]:
len(df2)

143150

In [50]:
# Shuffle data
from sklearn.utils import shuffle
df2 = shuffle(df2)

In [51]:
train_split_percent = 0.9
train_split_size = round(len(df2)*train_split_percent)
validation_set = df2[train_split_size:]
df2 = df2[:train_split_size]

In [52]:
# Save data set
validation_set.to_pickle('DataSets/custom_acr_with_src_trgt_validation.pkl')

### Sort By Source Length

In [53]:
# def get_len(col):
#     return len(col)

# df2['Source_Len'] = df2['Source'].apply(get_len)
# df2['Target_Len'] = df2['Target'].apply(get_len)
# df3 = df2.sort_values(['Source_Len', 'Target_Len'])

In [54]:
# # Save data set
# df3.to_pickle('DataSets/custom_acr_with_src_trgt.pkl')

In [55]:
# Save data set
df2.to_pickle('DataSets/custom_acr_with_src_trgt.pkl')

In [56]:
len(df2)

128835

# Load Data Set From Here with Source/Target

In [57]:
df = pd.read_pickle('DataSets/custom_acr_with_src_trgt.pkl')

In [58]:
df[df['Acronym'] == 'bjt']

Unnamed: 0,Acronym,Meaning,Tags,Source,Target
0,bjt,bipolar junction transistor,"[charge, mode, direction, bias, diffusion, flo...","[57427, 69588, 21816, 2, 25858, 27383, 91557, ...","[73997, 614, 58590, 5]"
0,bjt,bhandasar jain temple,"[rajasthan, pillar, gallery, yellowstone, retr...","[57427, 69588, 21816, 2, 68150, 13344, 6852, 6...","[30466, 9828, 505, 5]"
0,bjt,business jet traveler,"[editorial, publishers, marketed, countrys, ma...","[57427, 69588, 21816, 2, 75158, 41040, 74842, ...","[66572, 9710, 6189, 5]"
0,bjt,brockley jack theatre,"[plus, workshop, play, visiting, venue, regist...","[57427, 69588, 21816, 2, 104354, 77036, 2004, ...","[58221, 18291, 12384, 5]"
0,bjt,belgrade jazz trio,"[festival, clarinet, yugoslavia, then, guitar,...","[57427, 69588, 21816, 2, 70767, 19634, 108836,...","[87705, 33823, 26231, 5]"
0,bjt,beijing jiaotong tai,"[xm, citys, fm, 80, traffic, am, junction, fif...","[57427, 69588, 21816, 2, 94592, 50379, 7285, 3...","[87184, 42559, 27152, 5]"
0,bjt,breitling jet team,"[engagement, 2002, lasting, swiss, rhode, fran...","[57427, 69588, 21816, 2, 108806, 103518, 11131...","[56974, 9710, 20125, 5]"


# Create Embeddings

### Using ConceptNet Embeddings

In [None]:
# Load Conceptnet Numberbatch's (CN) embeddings, similar to GloVe, but probably better 
# (https://github.com/commonsense/conceptnet-numberbatch)
embeddings_index = {}
with open('DataSets/numberbatch.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

In [None]:
pickle.dump(embeddings_index, open("DataSets/embeddings_index.p", "wb"))