In [5]:
import json
import numpy as np
import nltk
from pprint import pprint
from nltk.corpus import wordnet
from string import punctuation
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 
import gensim 
from gensim.models import Word2Vec 
import os
import re

nltk.download("wordnet")
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


[nltk_data] Downloading package wordnet to /home/vatsal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/vatsal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def entry(line): 
    w, c = line.split("\t", 2)
    return (w, int(c))

dict_path = "./dict.txt"
dictionary = dict(entry(line) for line in open(dict_path))
max_word_length = max(map(len, dictionary))
total = float(sum(dictionary.values()))
cleanup = re.compile(r'[^a-z0-9]')

def word_prob(word): 
    return dictionary.get(word, 0) / total

def segment(text): 
    text = re.sub(cleanup, ' ', text)
    probs, lasts = [1.0], [0]
    for i in range(1,len(text) + 1):
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    words = []
    i = len(text)
    while i > 0:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words

In [7]:
from glove import Corpus, Glove
corpus = Corpus() 

In [8]:
glob_corpora = []
top100 = ['#travel', '#wanderlust', '#nature', '#travelling', '#traveling', '#traveller', '#photography', '#traveler', '#trip', '#travels', '#vacation', '#love', '#travelers', '#adventure', '#tourist', '#landscape', '#travellers', '#holiday', '#explore', '#beautiful', '#tourism', '#hiking', '#beach', '#photo', '#sunset', '#photographer', '#mountains', '#globetrotter', '#summer', '#art', '#sky', '#treking', '#europe', '#view', '#architecture', '#sea', '#fun', '#happy', '#city', '#sun', '#amazing', '#lifestyle', '#backpacking', '#wanderer', '#italy', '#follow', '#life', '#visiting', '#fashion', '#autumn', '#ocean', '#outdoors', '#explorer', '#world', '#india', '#beauty', '#mountain', '#spain', '#style', '#backpacker', '#like', '#clouds', '#france', '#exploring', '#trekking', '#asia', '#me', '#friends', '#canon', '#usa', '#blogger', '#ig', '#happiness', '#sunrise', '#smile', '#holidays', '#girl', '#wander', '#germany', '#island', '#paradise', '#Travel', '#flowers', '#discover', '#voyage', '#turkey', '#sightseeing', '#landscapes', '#italia', '#outdoor', '#cute', '#indonesia', '#history', '#food', '#pic', '#forest', '#beaches', '#inspiration', '#green', '#memories']
top100 = [r.replace('#', '') for r in top100]

In [9]:
top100_stemmed = [ps.stem(word) for word in top100]
top100_mapping = {}
for i in range(0,100):
    top100_mapping[top100_stemmed[i]]=top100[i]

In [10]:
def strip_punc(s):
    new_str = ""
    for c in s:
        if c in punctuation:
            new_str += " "
        else:
            new_str += c
    return new_str

In [39]:
for file in os.listdir("./jsons/travel"):
    print("./jsons/travel/" + file)
    file_ptr = open("./jsons/travel/" + file, "r")
    dic = json.load(file_ptr)
    for post in dic:
        text = strip_punc(dic[post]['text_des'].lower())
        toks = word_tokenize(text)
        toks_ = []
        for tok in toks:
            tok = segment(tok)
            for t in tok:
                if not wordnet.synsets(t) or len(t) < 3:
                    continue

                t = ps.stem(t)
                if t not in stop_words:
                    toks_.append(t)
            
        if toks_:
            glob_corpora.append(toks_)


./jsons/travel/Luxarytravel.json
./jsons/travel/travelbook.json
./jsons/travel/Traveldeeper.json
./jsons/travel/Hiking.json
./jsons/travel/travelquotes.json
./jsons/travel/travelstoke.json
./jsons/travel/travel.json
./jsons/travel/traveladdict.json
./jsons/travel/travellersnotebook.json
./jsons/travel/travelguide.json
./jsons/travel/Travelabout.json
./jsons/travel/trip.json
./jsons/travel/solotravel.json
./jsons/travel/Travelphotography.json
./jsons/travel/travelbug.json
./jsons/travel/travelpic.json
./jsons/travel/travelgram.json
./jsons/travel/tourist.json
./jsons/travel/travelislife.json
./jsons/travel/beachvibes.json
./jsons/travel/treking.json
./jsons/travel/Travelawesome.json
./jsons/travel/traveltheglobe.json
./jsons/travel/travelworld.json
./jsons/travel/worldtraveller.json
./jsons/travel/Travellove.json
./jsons/travel/nature.json
./jsons/travel/travelcaptures.json
./jsons/travel/Citytravel.json


In [40]:
print(len(glob_corpora))

25764


In [41]:
# Using word2vec for training

glob_model = Word2Vec(glob_corpora, min_count = 1)

In [42]:
print(glob_model.wv.most_similar(positive = 'grass', topn = 20))

[('pine', 0.9038450717926025), ('wood', 0.8921689987182617), ('poni', 0.8802863359451294), ('hydrangea', 0.8703752756118774), ('fog', 0.8680858612060547), ('leaf', 0.8678985238075256), ('meadow', 0.8659888505935669), ('foggi', 0.8658663034439087), ('terri', 0.8652238249778748), ('cynic', 0.8553472757339478), ('woodland', 0.8542168736457825), ('orchid', 0.8531167507171631), ('butterfli', 0.8529670238494873), ('shade', 0.8500607013702393), ('rainbow', 0.8481327295303345), ('downhil', 0.8466649055480957), ('majest', 0.8458655476570129), ('snowi', 0.8452242612838745), ('archeri', 0.845212996006012), ('mustang', 0.8440729975700378)]


In [43]:
# Using Glove for training

corpus.fit(glob_corpora, window = 10)
glove = Glove(no_components = 100, learning_rate = 0.05)
glove.fit(corpus.matrix, epochs = 30, no_threads = 4, verbose = True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
glove.add_dictionary(corpus.dictionary)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [11]:
# Preprocessing of input text

# input_sentence = "#forest#green#wanderlust#tour"
# input_sentence = "Straight out of The Sound of Music scenery is what you can wake up to, hiking and mountain biking your days away and enjoying Tyrolean hospitality.⁠
input_sentence = "#Travel #trip #photo #picoftheday #holiday #summer2018# fun #drone #moment #photography #focus #pic #color #igtravel #happyholidays #instapassport #travelgram #tflers #photooftheday #instatravel #travelingram #exposure #capture #snapshot #traveling #instago #photographyislifee"
text = strip_punc(input_sentence.lower())
toks = word_tokenize(text)
toks_ = []
for tok in toks:
    tok = segment(tok)
    for t in tok:
        if wordnet.synsets(t):
            t = ps.stem(t)
            if t not in stop_words and len(t)>2:
                toks_.append(t)

input_words = toks_
print(input_words)

['travel', 'trip', 'photo', 'pic', 'day', 'holiday', 'summer', 'fun', 'drone', 'moment', 'photographi', 'focu', 'pic', 'color', 'travel', 'happi', 'holiday', 'passport', 'travel', 'gram', 'photo', 'day', 'travel', 'travel', 'exposur', 'captur', 'snapshot', 'travel', 'photographi']


In [53]:
# Calculating sentence embedding using word2vec

sent = []
for word in input_words:
    if word in glob_model.wv.vocab:
        sent.append(glob_model.wv[word])
    
print(len(sent))
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

13
(13, 100)
(100,)


In [54]:
# Finding similarity of top 100 hashtags using word2vec

ans = []
for hasht in top100_mapping:
    try:
        v1 = glob_model.wv[hasht]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans.append((simi, top100_mapping[hasht]))
    except:
        pass

ans = sorted(ans, reverse = True)
print(ans[:20])

[(0.79706544, 'hiking'), (0.7241504, 'mountain'), (0.6522338, 'trekking'), (0.6347725, 'forest'), (0.61286086, 'green'), (0.590361, 'outdoor'), (0.5352306, 'clouds'), (0.52261513, 'adventure'), (0.48653263, 'happiness'), (0.46745187, 'landscapes'), (0.46624708, 'nature'), (0.45242748, 'ig'), (0.4503826, 'love'), (0.4496548, 'fun'), (0.44890246, 'sunrise'), (0.44407701, 'sky'), (0.4428573, 'view'), (0.42693922, 'exploring'), (0.42463803, 'backpacker'), (0.42436984, 'autumn')]


In [55]:
# Finally top k for word2vec

k = 10
topk_word2vec = [h[1] for h in ans[:k]]
print(topk_word2vec)

['hiking', 'mountain', 'trekking', 'forest', 'green', 'outdoor', 'clouds', 'adventure', 'happiness', 'landscapes']


In [56]:
# Calculating sentence embedding using glove

sent = []
print(input_words)
for word in input_words:
    if word in glove.dictionary:
        sent.append(glove.word_vectors[glove.dictionary[word]])
    
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

['straight', 'sound', 'music', 'sceneri', 'wake', 'hike', 'mountain', 'bike', 'day', 'away', 'enjoy', 'role', 'hospit']
(13, 100)
(100,)


In [57]:
# Finding similarity using glove embeddings

ans2 = []
for hasht in top100_mapping:
    try:
        v1 = glove.word_vectors[glove.dictionary[hasht]]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans2.append((simi, top100_mapping[hasht]))
    except:
        pass

ans2 = sorted(ans2, reverse = True)
print(ans2[:20])

[(0.7453365819867968, 'mountain'), (0.7298255248162238, 'hiking'), (0.6122607011664623, 'happiness'), (0.6106694595571494, 'life'), (0.5710631251832585, 'summer'), (0.5665188474463855, 'trekking'), (0.5631765135574418, 'love'), (0.5479993013866538, 'outdoor'), (0.5466948142020419, 'friends'), (0.5455864144595995, 'green'), (0.5374060591790857, 'beauty'), (0.5373822325422785, 'adventure'), (0.53635091556927, 'view'), (0.535265329724896, 'fun'), (0.5232745004195499, 'pic'), (0.5168948279325701, 'wanderlust'), (0.5093231041139552, 'nature'), (0.5072057520100826, 'memories'), (0.5071905281011232, 'clouds'), (0.5029636688800222, 'like')]


In [58]:
# Finally topk for glove
k = 10

topk_glove = [h[1] for h in ans2[:k]]
print(topk_glove)

['mountain', 'hiking', 'happiness', 'life', 'summer', 'trekking', 'love', 'outdoor', 'friends', 'green']


In [59]:
print("Glove output",topk_glove)
print("Word2vec output", topk_word2vec)

Glove output ['mountain', 'hiking', 'happiness', 'life', 'summer', 'trekking', 'love', 'outdoor', 'friends', 'green']
Word2vec output ['hiking', 'mountain', 'trekking', 'forest', 'green', 'outdoor', 'clouds', 'adventure', 'happiness', 'landscapes']
