In [8]:
import json
import numpy as np
import nltk
from pprint import pprint
from nltk.corpus import wordnet
from string import punctuation
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 
import gensim 
import gensim.downloader as api
from gensim.models import Word2Vec, KeyedVectors
import os
import re

nltk.download("wordnet")
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


[nltk_data] Downloading package wordnet to /home/vatsal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/vatsal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def entry(line): 
    w, c = line.split("\t", 2)
    return (w, int(c))

dict_path = "./dict.txt"
dictionary = dict(entry(line) for line in open(dict_path))
max_word_length = max(map(len, dictionary))
total = float(sum(dictionary.values()))
cleanup = re.compile(r'[^a-z0-9]')

def word_prob(word): 
    return dictionary.get(word, 0) / total

def segment(text): 
    text = re.sub(cleanup, ' ', text)
    probs, lasts = [1.0], [0]
    for i in range(1,len(text) + 1):
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    words = []
    i = len(text)
    while i > 0:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words

In [10]:
from glove import Corpus, Glove
corpus = Corpus() 

In [11]:
glob_corpora = []
top100 = ['#travel', '#wanderlust', '#nature', '#travelling', '#traveling', '#traveller', '#photography', '#traveler', '#trip', '#travels', '#vacation', '#love', '#travelers', '#adventure', '#tourist', '#landscape', '#travellers', '#holiday', '#explore', '#beautiful', '#tourism', '#hiking', '#beach', '#photo', '#sunset', '#photographer', '#mountains', '#globetrotter', '#summer', '#art', '#sky', '#treking', '#europe', '#view', '#architecture', '#sea', '#fun', '#happy', '#city', '#sun', '#amazing', '#lifestyle', '#backpacking', '#wanderer', '#italy', '#follow', '#life', '#visiting', '#fashion', '#autumn', '#ocean', '#outdoors', '#explorer', '#world', '#india', '#beauty', '#mountain', '#spain', '#style', '#backpacker', '#like', '#clouds', '#france', '#exploring', '#trekking', '#asia', '#me', '#friends', '#canon', '#usa', '#blogger', '#ig', '#happiness', '#sunrise', '#smile', '#holidays', '#girl', '#wander', '#germany', '#island', '#paradise', '#Travel', '#flowers', '#discover', '#voyage', '#turkey', '#sightseeing', '#landscapes', '#italia', '#outdoor', '#cute', '#indonesia', '#history', '#food', '#pic', '#forest', '#beaches', '#inspiration', '#green', '#memories']
top100 = [r.replace('#', '') for r in top100]

In [12]:
top100_stemmed = [ps.stem(word) for word in top100]
top100_mapping = {}
for i in range(0,100):
    top100_mapping[top100_stemmed[i]]=top100[i]

In [13]:
def strip_punc(s):
    new_str = ""
    for c in s:
        if c in punctuation:
            new_str += " "
        else:
            new_str += c
    return new_str

In [14]:
for file in os.listdir("./jsons/travel"):
    print("./jsons/travel/" + file)
    file_ptr = open("./jsons/travel/" + file, "r")
    dic = json.load(file_ptr)
    for post in dic:
        text = strip_punc(dic[post]['text_des'].lower())
        toks = word_tokenize(text)
        toks_ = []
        for tok in toks:
            tok = segment(tok)
            for t in tok:
                if not wordnet.synsets(t) or len(t) < 3:
                    continue

                t = ps.stem(t)
                if t not in stop_words:
                    toks_.append(t)
            
        if toks_:
            glob_corpora.append(toks_)


./jsons/travel/Luxarytravel.json
./jsons/travel/travelbook.json
./jsons/travel/Traveldeeper.json
./jsons/travel/Hiking.json
./jsons/travel/travelquotes.json
./jsons/travel/travelstoke.json
./jsons/travel/travel.json
./jsons/travel/traveladdict.json
./jsons/travel/travellersnotebook.json
./jsons/travel/travelguide.json
./jsons/travel/Travelabout.json
./jsons/travel/trip.json
./jsons/travel/solotravel.json
./jsons/travel/Travelphotography.json
./jsons/travel/travelbug.json
./jsons/travel/travelpic.json
./jsons/travel/travelgram.json
./jsons/travel/tourist.json
./jsons/travel/travelislife.json
./jsons/travel/beachvibes.json
./jsons/travel/treking.json
./jsons/travel/Travelawesome.json
./jsons/travel/traveltheglobe.json
./jsons/travel/travelworld.json
./jsons/travel/worldtraveller.json
./jsons/travel/Travellove.json
./jsons/travel/nature.json
./jsons/travel/travelcaptures.json
./jsons/travel/Citytravel.json


In [15]:
print(len(glob_corpora))

25764


In [16]:
# Using word2vec for training

glob_model = Word2Vec(glob_corpora, min_count = 2, size = 300)

In [17]:
print(glob_model.wv.most_similar(positive = 'grass', topn = 20))

[('foggi', 0.9027673602104187), ('pine', 0.9022570252418518), ('poni', 0.8992195725440979), ('snowi', 0.8893986940383911), ('leaf', 0.888798177242279), ('wood', 0.886253297328949), ('meadow', 0.8788093328475952), ('botani', 0.8753212690353394), ('classifi', 0.8740636110305786), ('protector', 0.8727027773857117), ('dusk', 0.8701772689819336), ('woodland', 0.8608576655387878), ('frosti', 0.853971004486084), ('rainbow', 0.8525438904762268), ('trailhead', 0.8525169491767883), ('glisten', 0.8508540391921997), ('mist', 0.8495299220085144), ('plant', 0.8428325653076172), ('starri', 0.842252254486084), ('vortex', 0.8400585651397705)]


In [18]:
# Using Glove for training

corpus.fit(glob_corpora, window = 10)
glove = Glove(no_components = 300, learning_rate = 0.05)
glove.fit(corpus.matrix, epochs = 30, no_threads = 4, verbose = True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')


Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [48]:
# Preprocessing of input text
# input_sentence = "Magestic Mountains The views which i got during my roadtrip to Heaven of Earth Kasmir still give a pleasure to my soul @holidaycompass"
# input_sentence = "Straight out of The Sound of Music scenery is what you can wake up to, hiking and mountain biking your days away and enjoying Tyrolean hospitality.⁠"
input_sentence = "Enjoy the little things in life Because someday you will look back and realize they were the BIG THINGS..."
# input_sentence = "#life #enjoy #nevergiveup #bigthings #travel #travelblogger #worldtraveler #instagood #relax #fitnessmodel #beautiful #view #india #nature #instadaily #fit #modellife #king #waterfall #bodytransformation #bodypositive #cool #followforfollowback #follow"
text = strip_punc(input_sentence.lower())
toks = word_tokenize(text)
toks_ = []
for tok in toks:
    if wordnet.synsets(tok):
        tok = ps.stem(tok)
        if tok not in stop_words and len(tok)>2:
            toks_.append(tok)

input_words = toks_
print(input_words)

['life', 'enjoy', 'travel', 'relax', 'beauti', 'view', 'india', 'natur', 'fit', 'king', 'waterfal', 'cool', 'follow']


In [45]:
# Calculating sentence embedding using word2vec

sent = []
for word in input_words:
    if word in glob_model.wv.vocab:
        sent.append(glob_model.wv[word])
    
print(len(sent))
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

10
(10, 300)
(300,)


In [46]:
# Finding similarity of top 100 hashtags using word2vec

ans = []
for hasht in top100_mapping:
    try:
        v1 = glob_model.wv[hasht]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans.append((simi, top100_mapping[hasht]))
    except:
        pass

ans = sorted(ans, reverse = True)
print(ans[:20])

[(0.6766943, 'memories'), (0.5459038, 'friends'), (0.5297315, 'happiness'), (0.5202689, 'smile'), (0.4357107, 'fun'), (0.42792997, 'love'), (0.40455705, 'life'), (0.40052202, 'cute'), (0.37394595, 'amazing'), (0.36331025, 'like'), (0.3442025, 'ig'), (0.32219133, 'summer'), (0.3100821, 'view'), (0.28454116, 'beauty'), (0.2819438, 'city'), (0.2796521, 'vacation'), (0.27281737, 'inspiration'), (0.26252228, 'paradise'), (0.25469896, 'sightseeing'), (0.25130293, 'exploring')]


In [47]:
# Finally top k for word2vec

k = 10
topk_word2vec = [h[1] for h in ans[:k]]
print(topk_word2vec)

['memories', 'friends', 'happiness', 'smile', 'fun', 'love', 'life', 'cute', 'amazing', 'like']


In [23]:
# Calculating sentence embedding using glove

sent = []
print(input_words)
for word in input_words:
    if word in glove.dictionary:
        sent.append(glove.word_vectors[glove.dictionary[word]])
    
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

['mountain', 'view', 'got', 'heaven', 'earth', 'still', 'give', 'pleasur', 'soul']
(9, 300)
(300,)


In [24]:
# Finding similarity using glove embeddings

ans2 = []
for hasht in top100_mapping:
    try:
        v1 = glove.word_vectors[glove.dictionary[hasht]]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans2.append((simi, top100_mapping[hasht]))
    except:
        pass

ans2 = sorted(ans2, reverse = True)
print(ans2[:20])

[(0.6697867535959405, 'mountain'), (0.627505397864887, 'view'), (0.6051651429506182, 'amazing'), (0.5091753441777954, 'happiness'), (0.4994858372706846, 'clouds'), (0.4982898338430924, 'beauty'), (0.4957473564542348, 'wanderlust'), (0.49444455483083094, 'love'), (0.4927563704821279, 'outdoor'), (0.4884201657783221, 'wander'), (0.4874135342249487, 'exploring'), (0.48211441994794046, 'hiking'), (0.4809235225271503, 'landscapes'), (0.48001779337101474, 'pic'), (0.4753250472503025, 'adventure'), (0.4697627488815335, 'life'), (0.464225100417928, 'discover'), (0.46414031178342136, 'green'), (0.4599611057199374, 'nature'), (0.4571901429257149, 'world')]


In [25]:
# Finally topk for glove

k = 10

topk_glove = [h[1] for h in ans2[:k]]
print(topk_glove)

['mountain', 'view', 'amazing', 'happiness', 'clouds', 'beauty', 'wanderlust', 'love', 'outdoor', 'wander']


In [26]:
print(topk_glove, topk_word2vec)

['mountain', 'view', 'amazing', 'happiness', 'clouds', 'beauty', 'wanderlust', 'love', 'outdoor', 'wander'] ['mountain', 'ig', 'amazing', 'forest', 'view', 'green', 'outdoor', 'hiking', 'landscapes', 'clouds']


In [28]:
# Training on top of pre-trained 300-D word embeddings

model_2 = Word2Vec(size = 300, min_count = 2)
model_2.build_vocab(glob_corpora)
total_examples = model_2.corpus_count
model = KeyedVectors.load_word2vec_format("./glove.6B/glove.6B.300d.w2vformat.txt", binary=False)

In [29]:
model_2.build_vocab([list(model.vocab.keys())], update=True)
model_2.intersect_word2vec_format("./glove.6B/glove.6B.300d.w2vformat.txt", binary=False, lockf=1.0)
model_2.train(glob_corpora, total_examples=total_examples, epochs=model_2.iter)

  This is separate from the ipykernel package so we can avoid doing imports until


(3678141, 4997515)

In [30]:
# Calculating sentence embedding using word2vec built on top

sent = []
for word in input_words:
    if word in model_2.wv.vocab:
        sent.append(model_2.wv[word])
    
print(len(sent))
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

9
(9, 300)
(300,)


In [31]:
# Finding similarity of top 100 hashtags using word2vec built on top

ans3 = []
for hasht in top100_mapping:
    try:
        v1 = model_2.wv[hasht]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans3.append((simi, top100_mapping[hasht]))
    except:
        pass

ans3 = sorted(ans3, reverse = True)
print(ans3[:20])

[(0.56321794, 'mountain'), (0.49063486, 'view'), (0.4691943, 'amazing'), (0.41959488, 'landscapes'), (0.41020262, 'clouds'), (0.40212187, 'love'), (0.40068957, 'beauty'), (0.39671206, 'exploring'), (0.39055562, 'nature'), (0.37776214, 'adventure'), (0.3502084, 'life'), (0.33659765, 'sky'), (0.33638433, 'hiking'), (0.33566836, 'forest'), (0.335485, 'discover'), (0.32776678, 'green'), (0.3153249, 'sunrise'), (0.31120405, 'world'), (0.3103188, 'inspiration'), (0.29969543, 'india')]


In [32]:
topk_word2vec_on_top = [h[1] for h in ans3[:k]] 
print(topk_word2vec_on_top, topk_word2vec)

['mountain', 'view', 'amazing', 'landscapes', 'clouds', 'love', 'beauty', 'exploring', 'nature', 'adventure'] ['mountain', 'ig', 'amazing', 'forest', 'view', 'green', 'outdoor', 'hiking', 'landscapes', 'clouds']


In [33]:
print("Input ", input_sentence)
print("Glove output\n ", topk_glove)
print("Word2Vec output\n ", topk_word2vec)
print("Word2Vec output with training on top of pre-trained\n ", topk_word2vec_on_top)

Input  Magestic Mountains The views which i got during my roadtrip to Heaven of Earth Kasmir still give a pleasure to my soul @holidaycompass
Glove output
  ['mountain', 'view', 'amazing', 'happiness', 'clouds', 'beauty', 'wanderlust', 'love', 'outdoor', 'wander']
Word2Vec output
  ['mountain', 'ig', 'amazing', 'forest', 'view', 'green', 'outdoor', 'hiking', 'landscapes', 'clouds']
Word2Vec output with training on top of pre-trained
  ['mountain', 'view', 'amazing', 'landscapes', 'clouds', 'love', 'beauty', 'exploring', 'nature', 'adventure']


In [34]:
test = "fun"
# test = "grass"
print(glob_model.wv.most_similar(positive = test, topn = 10))
print(model_2.wv.most_similar(positive = test, topn = 10))

[('philippin', 0.6941967606544495), ('holiday', 0.6935321092605591), ('summer', 0.6657976508140564), ('wednesday', 0.6657053232192993), ('happi', 0.650692343711853), ('cebu', 0.6470319628715515), ('good', 0.6442487239837646), ('smile', 0.6439228057861328), ('memori', 0.6433659791946411), ('flirt', 0.6238188743591309)]
[('philippin', 0.48417752981185913), ('good', 0.457288920879364), ('colleagu', 0.4232737421989441), ('holiday', 0.4217432737350464), ('love', 0.4049089550971985), ('funni', 0.3766258955001831), ('summer', 0.3744525909423828), ('cute', 0.37307867407798767), ('happi', 0.37076669931411743), ('vacat', 0.3666195571422577)]


In [40]:
print(len(list(model_2.wv.vocab.keys())))

9338


In [50]:
origs = ['life', 'enjoy', 'travel', 'relax', 'beauti', 'view', 'india', 'natur', 'fit', 'king', 'waterfal', 'cool', 'follow']
preds = ['memories', 'friends', 'happiness', 'smile', 'fun', 'love', 'life', 'cute', 'amazing', 'like']


matr = np.empty((len(preds), len(origs)))

for i, pred in enumerate(preds):
    for j, orig in enumerate(origs):
        try:
            similarity = model_2.wv.similarity(ps.stem(orig), ps.stem(pred))
        except:
            similarity = 0
        matr[i][j] = similarity

In [58]:
# print(matr)
accs = []
from prettytable import PrettyTable
tabl = PrettyTable()
tabl.field_names = ['Predicted'] + origs
for ind, row in enumerate(matr):
    tabl.add_row([preds[ind]] + [round(i,2) for i in row])
    accs.append(max(row))
    indi = np.argmax(row)
    print(preds[ind] + "-------------" + origs[indi])
print(tabl)
# print(accs)
# print(np.mean(accs))

ans = 0
for acc in accs:
    if round(acc,1) >= 0.4:
        ans += 1
print(ans / len(preds))

memories-------------life
friends-------------follow
happiness-------------life
smile-------------fit
fun-------------cool
love-------------travel
life-------------life
cute-------------cool
amazing-------------beauti
like-------------follow
+-----------+------+-------+--------+-------+--------+------+-------+-------+------+-------+----------+------+--------+
| Predicted | life | enjoy | travel | relax | beauti | view | india | natur | fit  |  king | waterfal | cool | follow |
+-----------+------+-------+--------+-------+--------+------+-------+-------+------+-------+----------+------+--------+
|  memories | 0.36 |  0.31 |  0.31  |  0.22 |  0.26  | 0.12 |  0.11 |  0.13 | 0.04 | -0.01 |   0.0    | 0.17 |  0.12  |
|  friends  | 0.17 |  0.32 |  0.11  |  0.22 |  0.15  | 0.15 |  0.1  |  0.1  | 0.15 |  0.18 |   0.0    | 0.24 |  0.35  |
| happiness | 0.37 |  0.36 |  0.25  |  0.3  |  0.32  | 0.17 |  0.15 |  0.21 | 0.27 |  0.11 |   0.0    | 0.27 |  0.32  |
|   smile   | 0.14 |  0.24 |  0.05  | 

In [53]:
# !pip3 install prettytable --user

Collecting prettytable
Installing collected packages: prettytable
Successfully installed prettytable-0.7.2
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
