In [None]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from gensim.models import Word2Vec
import gensim.models as models

from statistics import mean, stdev, median

In [None]:
import logging  # Used to surface gensim's log messages in the notebook output
# Each log line is rendered as "LEVEL - HH:MM:SS: message"; INFO and above is shown.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [None]:
# Mount Google Drive.
# Only meaningful on Google Colab: the `google.colab` package does not exist
# elsewhere, so the import is guarded to keep "Run All" working locally.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: nothing to mount, local paths are used as-is.
    drive = None
else:
    drive.mount("/content/gdrive")

In [None]:
# Load the raw data; the path is relative to the notebook's working directory.
# The `titles` column of this frame is what gets tokenized further below.
titles = pd.read_csv('../../data/data.csv')

In [None]:
# Preview the first few rows to check that the file was parsed as expected.
titles.head()

# Remove non-alphanumeric characters and tokenize

In [None]:
def prep(product_name):
    """Clean and tokenize product names.

    Every run of characters other than ASCII letters and digits is replaced
    by a single space, the text is lower-cased and split on whitespace.

    Parameters
    ----------
    product_name : pandas.Series or iterable
        Raw product names. Non-string values are converted with ``str``.
        Missing values (``None`` / ``NaN``) are skipped; previously they were
        stringified and leaked the fake tokens ``'none'`` / ``'nan'`` into the
        vocabulary.

    Returns
    -------
    list of list of str
        One token list per product name that still has at least one token
        after cleaning; names that end up empty are dropped.
    """
    non_alnum = re.compile("[^A-Za-z0-9]+")  # compiled once, reused for every title
    present = pd.Series(product_name).dropna()
    tokenized = (non_alnum.sub(' ', str(name)).lower().split() for name in present)
    return [tokens for tokens in tokenized if tokens]

In [None]:
# Tokenize the `titles` column, then preview the first 20 tokenized titles.
words = prep(titles['titles'])
words[:20]

In [None]:
# Count how often each token occurs across all tokenized titles.
word_freq = defaultdict(int)
for token in (tok for tokens in words for tok in tokens):
    word_freq[token] += 1
len(word_freq)  # number of distinct tokens

In [None]:
# Summary statistics over the per-token occurrence counts.
counts = list(word_freq.values())
print('Statistics of Word Frequencies')
for label, stat in (('Mean : ', mean),
                    ('Median : ', median),
                    ('Stdev : ', stdev),
                    ('Max : ', max),
                    ('Min : ', min)):
    print(label, str(stat(counts)))

In [None]:
# Number of tokens in each tokenized title.
length = [len(tokens) for tokens in words]

In [None]:
# Summary statistics over the number of tokens per title.
print('Statistics of Product Name Length')
for label, stat in (('Mean : ', mean),
                    ('Median : ', median),
                    ('Stdev : ', stdev),
                    ('Max : ', max),
                    ('Min : ', min)):
    print(label, str(stat(length)))

# Word2Vec

In [None]:
# Create the Word2Vec model without a corpus; vocabulary building and training
# happen in the cells below.
# NOTE(review): `size` looks like the gensim 3.x keyword (renamed `vector_size`
# in gensim 4) — confirm the installed gensim version before upgrading.
w2v = Word2Vec(min_count=1,
                     window=3,
                     size=300)

In [None]:
# build_vocab() needs a corpus: calling it with no arguments raises and stops
# "Restart & Run All". The vocabulary is built from `words` in the next cell,
# so this cell only displays the still-untrained model.
w2v

In [None]:
# Build the vocabulary from the tokenized titles and report the elapsed time.
t = time()

w2v.build_vocab(words, progress_per=1000)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))

In [None]:
# number of vocabs
len(w2v.wv.index2word)

In [None]:
# Train the model for 50 epochs on the tokenized titles and report the elapsed
# time (the timer was previously started but never read).
t = time()

w2v.train(words, total_examples=w2v.corpus_count, epochs=50)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# NOTE(review): in gensim 3.x this is understood to precompute normalized vectors,
# with replace=True overwriting the raw ones so the model cannot be trained
# further afterwards — confirm; the model saved below is in this state.
w2v.init_sims(replace=True)

## Checking word2vec result

In [None]:
# Sanity check: list the words whose vectors are closest to 'baju'.
# NOTE(review): presumably fails if 'baju' is not in the vocabulary — verify.
w2v.wv.most_similar(positive=['baju'])

## Save w2v

In [None]:
# Save the model to the current working directory.
# NOTE(review): this saves `w2v` itself, not `w2v.wv`, despite the `.wv` file
# extension — confirm which object downstream loaders expect.
w2v.save('w2v_result.wv')