## data parsing

In [1]:
# dependencies
import json
from functools import reduce
from nltk import word_tokenize

In [2]:
#constants
DATA_PATH="data.json"

In [3]:
# parsing  and preprocessing functions

'''
returns tag as array of words (no ':').
add_category determines if the tag category is added to the array.
'''
def tag_to_words(tag,add_category=False):
    category, text = tag.split(":")
    if add_category:
        return word_tokenize("{} {}".format(category,text).lower())
    return word_tokenize(text.lower())
    
'''
transforms array of tags into array of words.
add_category determines if the tag categories are added to the text.
'''
def tag_array_to_word_array(tags,add_category=False):
    aux_tags = list(tags)
    aux_tags[0] = tag_to_words(tags[0],add_category)
    return reduce(lambda x,y : x + tag_to_words(y,add_category),aux_tags)
    
'''
returns array of tag arrays from original json.
'''
def data_to_tags(data):
    return [instance["tags"] for instance in data]
    
'''
returns arrays of word arrays from tags
'''    
def preprocess(data):
    return [ tag_array_to_word_array(instance,False) for instance in data_to_tags(data)]


In [4]:
data = json.load(open(DATA_PATH))
documents = preprocess(data)

## Text embedding generation

First approach: documents will be turned into 300 dimensional embeddings by using a weighted average of pretrained Glove Vectors. The weights of the average will come from a previously computed TF-IDF model.



In [5]:
# dependencies
import numpy as np
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models.keyedvectors import KeyedVectors
from gensim.sklearn_api import TfIdfTransformer

In [6]:
# constants
GLOVE_PATH= "glove-sbwc.i25.vec"
vectors = 855380

In [7]:
word_dict = corpora.Dictionary(documents,prune_at=None)
bows = [word_dict.doc2bow(doc) for doc in documents]

In [8]:
tfidf = TfidfModel(bows,normalize=True)
tfidf.eps = -1

In [9]:
tfidf.idfs.get(12, 0.0)

0.0

In [10]:
score_hash =  { tup[0]:tup[1] for tup in tfidf.__getitem__(bows[0:2],-1)} # threshold
token2id = word_dict.token2id

In [11]:
score_hash

{(0, 0.22624513776304528): (1, 0.12613006942179017),
 (5, 0.05267112345173021): (7, 0.12057601186869804)}

[(5, 0.05267112345173021),
 (7, 0.12057601186869804),
 (10, 0.10534224690346042),
 (11, 0.05267112345173021),
 (15, 0.2162825767178983),
 (16, 0.37989402998406635),
 (17, 0.4325651534357966),
 (18, 0.12057601186869804),
 (19, 0.37989402998406635),
 (20, 0.37989402998406635),
 (21, 0.05267112345173021),
 (22, 0.37989402998406635),
 (23, 0.2162825767178983),
 (24, 0.2162825767178983),
 (25, 0.2162825767178983)]

In [26]:
tfidf[bows[0]]

[(0, 0.22624513776304528),
 (1, 0.12613006942179017),
 (2, 0.12613006942179017),
 (3, 0.39739297752684344),
 (4, 0.39739297752684344),
 (5, 0.055097297999247105),
 (6, 0.39739297752684344),
 (7, 0.12613006942179017),
 (8, 0.39739297752684344),
 (9, 0.39739297752684344),
 (10, 0.055097297999247105),
 (11, 0.055097297999247105),
 (13, 0.22624513776304528),
 (14, 0.22624513776304528)]

In [43]:
#load glove vectors
model=KeyedVectors.load_word2vec_format(GLOVE_PATH,limit=vectors)

KeyboardInterrupt: 

In [64]:
doc = documents[0]

# weighted words
weighted_embeddings = np.array([np.dot(model[word],score_hash[token2id[word]]) for word in filter(lambda word : token2id[word] in score_hash,doc)])
# vector sum
np.sum(weighted_embeddings, axis=0)
