In [22]:
import json
import numpy as np
import nltk
from pprint import pprint
from nltk.corpus import wordnet
from string import punctuation
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 
import gensim 
import gensim.downloader as api
from gensim.models import Word2Vec, KeyedVectors
import os

nltk.download("wordnet")
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


[nltk_data] Downloading package wordnet to /home/vatsal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/vatsal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
from glove import Corpus, Glove
corpus = Corpus() 

In [4]:
glob_corpora = []
top100 = ['#travel', '#wanderlust', '#nature', '#travelling', '#traveling', '#traveller', '#photography', '#traveler', '#trip', '#travels', '#vacation', '#love', '#travelers', '#adventure', '#tourist', '#landscape', '#travellers', '#holiday', '#explore', '#beautiful', '#tourism', '#hiking', '#beach', '#photo', '#sunset', '#photographer', '#mountains', '#globetrotter', '#summer', '#art', '#sky', '#treking', '#europe', '#view', '#architecture', '#sea', '#fun', '#happy', '#city', '#sun', '#amazing', '#lifestyle', '#backpacking', '#wanderer', '#italy', '#follow', '#life', '#visiting', '#fashion', '#autumn', '#ocean', '#outdoors', '#explorer', '#world', '#india', '#beauty', '#mountain', '#spain', '#style', '#backpacker', '#like', '#clouds', '#france', '#exploring', '#trekking', '#asia', '#me', '#friends', '#canon', '#usa', '#blogger', '#ig', '#happiness', '#sunrise', '#smile', '#holidays', '#girl', '#wander', '#germany', '#island', '#paradise', '#Travel', '#flowers', '#discover', '#voyage', '#turkey', '#sightseeing', '#landscapes', '#italia', '#outdoor', '#cute', '#indonesia', '#history', '#food', '#pic', '#forest', '#beaches', '#inspiration', '#green', '#memories']
top100 = [r.replace('#', '') for r in top100]

In [5]:
top100_stemmed = [ps.stem(word) for word in top100]
top100_mapping = {}
for i in range(0,100):
    top100_mapping[top100_stemmed[i]]=top100[i]

In [6]:
def strip_punc(s):
    new_str = ""
    for c in s:
        if c in punctuation:
            new_str += " "
        else:
            new_str += c
    return new_str

In [7]:
for file in os.listdir("./jsons/"):
    print("./jsons/" + file)
    file_ptr = open("./jsons/" + file, "r")
    dic = json.load(file_ptr)
    for post in dic:
        text = strip_punc(dic[post]['text_des'].lower())
        toks = word_tokenize(text)
        toks_ = []
        for tok in toks:
            if not wordnet.synsets(tok) or len(tok) < 3:
                continue
            tok = ps.stem(tok)
            if tok not in stop_words:
                toks_.append(tok)
        if toks_:
            glob_corpora.append(toks_)


./jsons/Luxarytravel.json
./jsons/travelbook.json
./jsons/Traveldeeper.json
./jsons/Hiking.json
./jsons/travelquotes.json
./jsons/travelstoke.json
./jsons/travel.json
./jsons/traveladdict.json
./jsons/travellersnotebook.json
./jsons/travelguide.json
./jsons/Travelabout.json
./jsons/trip.json
./jsons/solotravel.json
./jsons/Travelphotography.json
./jsons/travelbug.json
./jsons/travelpic.json
./jsons/travelgram.json
./jsons/tourist.json
./jsons/travelislife.json
./jsons/beachvibes.json
./jsons/treking.json
./jsons/Travelawesome.json
./jsons/traveltheglobe.json
./jsons/travelworld.json
./jsons/worldtraveller.json
./jsons/Travellove.json
./jsons/nature.json
./jsons/travelcaptures.json
./jsons/Citytravel.json


In [8]:
print(len(glob_corpora))

25481


In [9]:
# Using word2vec for training

glob_model = Word2Vec(glob_corpora, min_count = 2, size = 300)

In [10]:
print(glob_model.wv.most_similar(positive = 'grass', topn = 20))

[('mist', 0.9922024011611938), ('bamboo', 0.990009069442749), ('foggi', 0.9859453439712524), ('dawn', 0.9804486036300659), ('fog', 0.9797745943069458), ('wood', 0.9797183275222778), ('pomegran', 0.9796435832977295), ('lush', 0.9788281321525574), ('clad', 0.9786802530288696), ('kazakh', 0.9773837924003601), ('wasteland', 0.9763006567955017), ('woodland', 0.9761473536491394), ('hoop', 0.9753665924072266), ('hors', 0.9752497673034668), ('leaf', 0.9750267863273621), ('rainbow', 0.9740670919418335), ('inconceiv', 0.9730759859085083), ('syrup', 0.9726815819740295), ('mane', 0.9717535376548767), ('montana', 0.9717276096343994)]


In [11]:
# Using Glove for training

corpus.fit(glob_corpora, window = 10)
glove = Glove(no_components = 300, learning_rate = 0.05)
glove.fit(corpus.matrix, epochs = 30, no_threads = 4, verbose = True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')


Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [12]:
# Preprocessing of input text

input_sentence = "city life enjoyment metro car"
text = strip_punc(input_sentence.lower())
toks = word_tokenize(text)
toks_ = []
for tok in toks:
    if wordnet.synsets(tok):
        tok = ps.stem(tok)
        if tok not in stop_words and len(tok)>2:
            toks_.append(tok)

input_words = toks_
print(input_words)

['citi', 'life', 'enjoy', 'metro', 'car']


In [13]:
# Calculating sentence embedding using word2vec

sent = []
for word in input_words:
    if word in glob_model.wv.vocab:
        sent.append(glob_model.wv[word])
    
print(len(sent))
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

5
(5, 300)
(300,)


In [14]:
# Finding similarity of top 100 hashtags using word2vec

ans = []
for hasht in top100_mapping:
    try:
        v1 = glob_model.wv[hasht]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans.append((simi, top100_mapping[hasht]))
    except:
        pass

ans = sorted(ans, reverse = True)
print(ans[:20])

[(0.8402681, 'life'), (0.83269423, 'memories'), (0.76508474, 'cute'), (0.7484694, 'smile'), (0.7464194, 'ig'), (0.7383746, 'beauty'), (0.7346288, 'inspiration'), (0.73444617, 'love'), (0.70989937, 'happiness'), (0.6989758, 'girl'), (0.69524664, 'fun'), (0.6729203, 'like'), (0.66195184, 'style'), (0.6618949, 'amazing'), (0.6576407, 'friends'), (0.6469303, 'city'), (0.64483446, 'food'), (0.62304217, 'sightseeing'), (0.60987926, 'fashion'), (0.6062473, 'summer')]


In [15]:
# Finally top k for word2vec

k = 10
topk_word2vec = [h[1] for h in ans[:k]]
print(topk_word2vec)

['life', 'memories', 'cute', 'smile', 'ig', 'beauty', 'inspiration', 'love', 'happiness', 'girl']


In [16]:
# Calculating sentence embedding using glove

sent = []
print(input_words)
for word in input_words:
    if word in glove.dictionary:
        sent.append(glove.word_vectors[glove.dictionary[word]])
    
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

['citi', 'life', 'enjoy', 'metro', 'car']
(5, 300)
(300,)


In [17]:
# Finding similarity using glove embeddings

ans2 = []
for hasht in top100_mapping:
    try:
        v1 = glove.word_vectors[glove.dictionary[hasht]]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans2.append((simi, top100_mapping[hasht]))
    except:
        pass

ans2 = sorted(ans2, reverse = True)
print(ans2[:20])

[(0.7850062647418138, 'life'), (0.6989680952474028, 'city'), (0.6901631371832204, 'memories'), (0.6113891536112518, 'lifestyle'), (0.6095009959491364, 'trip'), (0.5776050934044664, 'exploring'), (0.5730859192811777, 'visiting'), (0.5718418631289597, 'beauty'), (0.5648133073923786, 'fun'), (0.5558073084679296, 'history'), (0.5515839359251011, 'amazing'), (0.5461834018353884, 'adventure'), (0.5426864478934118, 'photography'), (0.5347772889125532, 'inspiration'), (0.5319193704254476, 'smile'), (0.5239102348067891, 'love'), (0.5228078829499611, 'vacation'), (0.5195034870389083, 'happiness'), (0.5155602216077344, 'holidays'), (0.5149448796976712, 'friends')]


In [18]:
# Finally topk for glove

k = 10

topk_glove = [h[1] for h in ans2[:k]]
print(topk_glove)

['life', 'city', 'memories', 'lifestyle', 'trip', 'exploring', 'visiting', 'beauty', 'fun', 'history']


In [19]:
print(topk_glove, topk_word2vec)

['life', 'city', 'memories', 'lifestyle', 'trip', 'exploring', 'visiting', 'beauty', 'fun', 'history'] ['life', 'memories', 'cute', 'smile', 'ig', 'beauty', 'inspiration', 'love', 'happiness', 'girl']


In [20]:
# Training on top of pre-trained 100-D word embeddings

model_2 = Word2Vec(size = 300, min_count = 2)
model_2.build_vocab(glob_corpora)
total_examples = model_2.corpus_count
model = KeyedVectors.load_word2vec_format("./glove.6B/glove.6B.300d.w2vformat.txt", binary=False)

25481


FileNotFoundError: [Errno 2] No such file or directory: './glove.6B/glove.6B.300d.w2vformat.txt'

In [122]:
model_2.build_vocab([list(model.vocab.keys())], update=True)
model_2.intersect_word2vec_format("./glove.6B/glove.6B.300d.w2vformat.txt", binary=False, lockf=1.0)
model_2.train(glob_corpora, total_examples=total_examples, epochs=model_2.iter)

  This is separate from the ipykernel package so we can avoid doing imports until


(1608376, 1942600)

In [55]:
# Calculating sentence embedding using word2vec built on top

sent = []
for word in input_words:
    if word in model_2.wv.vocab:
        sent.append(model_2.wv[word])
    
print(len(sent))
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

5
(5, 100)
(100,)


In [127]:
# Finding similarity of top 100 hashtags using word2vec built on top

ans3 = []
for hasht in top100_mapping:
    try:
        v1 = model_2.wv[hasht]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans3.append((simi, top100_mapping[hasht]))
    except:
        pass

ans3 = sorted(ans3, reverse = True)
print(ans3[:20])

[(0.1590738238310768, 'life'), (0.1568674105766756, 'inspiration'), (0.14864744117986625, 'lifestyle'), (0.14443303061448606, 'green'), (0.1438923174633605, 'happiness'), (0.13621081185388623, 'architecture'), (0.12903651775479635, 'style'), (0.12686728826314567, 'smile'), (0.12567769092664607, 'beauty'), (0.12381021717285125, 'art'), (0.12063842743665494, 'autumn'), (0.1157567075987542, 'history'), (0.11364085842518182, 'spain'), (0.11207138762507324, 'love'), (0.11107323926624592, 'food'), (0.11090056420187287, 'fashion'), (0.1086669006549339, 'landscapes'), (0.09683028480330677, 'cute'), (0.08633663624716623, 'forest'), (0.08347055978678607, 'summer')]


In [128]:
topk_word2vec_on_top = [h[1] for h in ans3[:k]] 
print(topk_word2vec_on_top, topk_word2vec)

['life', 'inspiration', 'lifestyle', 'green', 'happiness', 'architecture', 'style', 'smile', 'beauty', 'art'] ['life', 'memories', 'cute', 'smile', 'beauty', 'love', 'inspiration', 'ig', 'happiness', 'fun']


## Vatsal Part

In [None]:
# Using Glove for training

corpus.fit(glob_corpora, window = 10)
glove = Glove(no_components = 300, learning_rate = 0.05)
glove.fit(corpus.matrix, epochs = 30, no_threads = 4, verbose = True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')


## Vatsal Part

In [130]:
print("Input ", input_sentence)
print("Glove output\n ", topk_glove)
print("Word2Vec output\n ", topk_word2vec)
print("Word2Vec output with training on top of pre-trained\n ", topk_word2vec_on_top)

Input  city life enjoyment metro car
Glove output
  ['life', 'city', 'memories', 'visiting', 'trip', 'lifestyle', 'fun', 'history', 'inspiration', 'exploring']
Word2Vec output
  ['life', 'memories', 'cute', 'smile', 'beauty', 'love', 'inspiration', 'ig', 'happiness', 'fun']
Word2Vec output with training on top of pre-trained
  ['life', 'inspiration', 'lifestyle', 'green', 'happiness', 'architecture', 'style', 'smile', 'beauty', 'art']


In [136]:
test = "fun"
# test = "grass"
print(glob_model.wv.most_similar(positive = test, topn = 10))
print(model_2.wv.most_similar(positive = test, topn = 10))

[('sequoia', 0.993638813495636), ('confetti', 0.9873069524765015), ('dawn', 0.9858814477920532), ('mist', 0.9847056865692139), ('wildflow', 0.984110414981842), ('hue', 0.9838014841079712), ('cowgirl', 0.9802432060241699), ('textur', 0.9802113175392151), ('windi', 0.9799920320510864), ('stream', 0.9789674282073975)]
[('lawn', 0.5653282403945923), ('greeneri', 0.546645998954773), ('wildflow', 0.539504885673523), ('ferocactu', 0.5318676829338074), ('inflat', 0.5228531360626221), ('tourmalin', 0.5141727924346924), ('horticultur', 0.5055184960365295), ('agricultur', 0.5021069049835205), ('mojav', 0.5011772513389587), ('shorelin', 0.49985647201538086)]


In [98]:
temp = Word2Vec(size = 100, min_count = 1)
temp.build_vocab(sentences)
# print(temp.vocab.keys())

In [99]:
temp2 = KeyedVectors.load_word2vec_format("./glove.6B/glove.6B.100d.w2vformat.txt", binary=False)

In [100]:
print(temp2.wv.vocab['world'])

Vocab(count:399915, index:85)


  """Entry point for launching an IPython kernel.


In [101]:
temp.build_vocab([list(temp2.vocab.keys())], update=True)
# model_2.intersect_word2vec_format("./glove.6B/glove.6B.100d.w2vformat.txt", binary=False, lockf=1.0)
# model_2.train(glob_corpora, total_examples=total_examples, epochs=model_2.iter)

In [102]:
print(len(list(temp.wv.vocab.keys())))

400002


In [103]:
print(len(list(model_2.wv.vocab.keys())))

8604
