In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

seed_ = 20200218
np.random.seed(seed_)
tf.random.set_seed(seed_)

from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="whitegrid",
              context="paper",
              font_scale=1.25,
              rc={
                  "figure.figsize": (10.5, 4.5),
                  "figure.dpi": 150,
                  "grid.alpha": 0.1,
                  "grid.color": "#1b262c",
                  "grid.linewidth": 0.5,
                  "font.family": "Operator Mono"
              })

_30k = ["#202f66", "#ff7048", "#7f68d0", "#f3d36e", "#d869ab", "#48ADA9", "#1b262c"]
sns.set_palette(_30k)

import warnings
warnings.filterwarnings('ignore')

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

pio.templates.default = "plotly_dark"

plotly_config = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'custom_image',
    'height': 900,
    'width': 2100,
    'scale': 1 # Multiply title/legend/axis/canvas sizes by this factor
  }
}

In [3]:
import tensorflow_datasets as tfds

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import gensim
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# tensorflow text preprocessing function

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
from bs4 import BeautifulSoup

# Load Data

In [6]:
with open("./outputs/imdb_cleaned_sentences.txt", "r", encoding="utf-8") as f:
    sentences = f.readlines()

In [7]:
sentences = [sentence.strip("\n").split(' ') for sentence in sentences]

In [8]:
sentences[1]

['known',
 'fall',
 'asleep',
 'films',
 'usually',
 'due',
 'combination',
 'things',
 'including',
 'really',
 'tired',
 'warm',
 'comfortable',
 'sette',
 'eaten',
 'lot',
 'however',
 'occasion',
 'fell',
 'asleep',
 'film',
 'rubbish',
 'plot',
 'development',
 'constant',
 'constantly',
 'slow',
 'boring',
 'things',
 'seemed',
 'happen',
 'explanation',
 'causing',
 'admit',
 'may',
 'missed',
 'part',
 'film',
 'watched',
 'majority',
 'everything',
 'seemed',
 'happen',
 'accord',
 'without',
 'real',
 'concern',
 'anything',
 'else',
 'cant',
 'recommend',
 'film']

## Another Preprocess

In [9]:
def substitution_repeated_word(word):
    repeat_pattern = r"(\w)\1{2,}"
    substitution_pattern = r"\1"
    return re.compile(repeat_pattern).sub(substitution_pattern, word)

In [10]:
for i in range(len(sentences)):
    for j in range(len(sentences[i])):
        sentences[i][j] = substitution_repeated_word(sentences[i][j])

# Build the word vectors by gensim

- print loss https://stackoverflow.com/questions/52038651/loss-does-not-decrease-during-training-word2vec-gensim
- loss increasing https://stackoverflow.com/questions/58186670/gensim-word2vec-model-getting-worse-by-increasing-the-number-of-epochs

In [11]:
class MyCallBack(CallbackAny2Vec):
    def __init__(self):
        self.epochs = 0
    
    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epochs != 0 and self.epochs % 5 == 0:
            print(f"{self.epochs}:\t{round(loss - self.prev_loss, 2)}")
        self.epochs += 1
        self.prev_loss = loss

In [12]:
# select only some sentences for train
lim = 100
sentences_ = np.random.choice(sentences, replace=False, size=lim).tolist()

In [13]:
embedding_dims = 128
window_size =  5
max_vocab_size = 2500
min_count = 5

In [14]:
w2v = Word2Vec(sentences=sentences_,
               size=embedding_dims,
               window=window_size,
               max_vocab_size=max_vocab_size,
               min_count=min_count,
               compute_loss=True, callbacks=[MyCallBack()])

In [15]:
w2v.most_similar(positive=['funny'])

[('movie', 0.9751654863357544),
 ('also', 0.9720869064331055),
 ('film', 0.9709030389785767),
 ('one', 0.9698842763900757),
 ('like', 0.9693364500999451),
 ('good', 0.9680224657058716),
 ('family', 0.9677156805992126),
 ('think', 0.9667198061943054),
 ('big', 0.9657655954360962),
 ('movies', 0.9652640223503113)]

In [16]:
w2v.most_similar(positive=['bad'])

[('movie', 0.9770684838294983),
 ('get', 0.9733237028121948),
 ('one', 0.9731695652008057),
 ('also', 0.9730850458145142),
 ('family', 0.9713889956474304),
 ('would', 0.970788836479187),
 ('first', 0.9705204367637634),
 ('film', 0.9704143404960632),
 ('think', 0.9701133966445923),
 ('life', 0.9696893095970154)]

In [17]:
w2v_train_epochs = 20
w2v.train(
    sentences=sentences, 
    total_examples=w2v.corpus_count, 
    epochs=w2v_train_epochs,
    start_alpha=0.001,
    compute_loss=1, 
    callbacks=[MyCallBack()])

5:	1102849.0
10:	1055825.0
15:	1191562.0


(36247415, 112763320)

In [18]:
w2v.most_similar(positive=['funny'])

[('laugh', 0.9997960329055786),
 ('fun', 0.9997955560684204),
 ('felt', 0.9997910857200623),
 ('lot', 0.9997721910476685),
 ('boring', 0.9997698068618774),
 ('interesting', 0.9997669458389282),
 ('really', 0.9997642636299133),
 ('ending', 0.9997607469558716),
 ('quite', 0.9997581243515015),
 ('maybe', 0.9997577667236328)]

In [19]:
w2v.most_similar(positive=['bad'])

[('movie', 0.9997435808181763),
 ('really', 0.9996969103813171),
 ('awful', 0.9996936321258545),
 ('say', 0.9996858835220337),
 ('terrible', 0.9996750354766846),
 ('think', 0.9996659755706787),
 ('better', 0.999651312828064),
 ('acting', 0.9996463656425476),
 ('made', 0.9996272921562195),
 ('thought', 0.9996194839477539)]

In [20]:
w2v_train_epochs = 100
w2v.train(
    sentences=sentences, 
    total_examples=w2v.corpus_count, 
    epochs=w2v_train_epochs,
    start_alpha=0.001,
    compute_loss=1, 
    callbacks=[MyCallBack()])

5:	1243566.5
10:	1039379.0
15:	1185738.0
20:	1160756.0
25:	1133308.0
30:	515208.0
35:	505496.0
40:	519876.0
45:	507544.0
50:	519872.0
55:	500448.0
60:	520376.0
65:	519196.0
70:	520788.0
75:	511612.0
80:	519628.0
85:	528060.0
90:	526144.0
95:	3192.0


(181220130, 563816600)

In [21]:
w2v.most_similar(positive=['funny'])

[('laughs', 0.8263293504714966),
 ('humor', 0.7667121887207031),
 ('comedy', 0.734045684337616),
 ('laugh', 0.7217087745666504),
 ('parts', 0.6550390720367432),
 ('bits', 0.6472859382629395),
 ('really', 0.6034287214279175),
 ('fun', 0.5920424461364746),
 ('lame', 0.5882127285003662),
 ('cool', 0.5822098851203918)]

In [30]:
w2v.most_similar(positive=['bad'])

[('awful', 0.8887099027633667),
 ('terrible', 0.8773662447929382),
 ('lame', 0.8500813245773315),
 ('ok', 0.8014817237854004),
 ('worse', 0.7903109788894653),
 ('mean', 0.7554093599319458),
 ('acting', 0.7259005904197693),
 ('poor', 0.7052242755889893),
 ('really', 0.6994863748550415),
 ('boring', 0.6783396005630493)]

# Save Model

In [23]:
w2v.save("./outputs/imdb-w2v-gensim.model")

In [24]:
words = w2v.wv.vocab.keys()
vecs = w2v.wv.vectors
vecs.shape

(444, 128)

In [25]:
len(words)

444

In [26]:
vecs_df = pd.DataFrame(vecs)
words_df = pd.DataFrame(words)

In [27]:
vecs_df.to_csv("./outputs/imdb-vecs.tsv", sep='\t', header=False, index=False)
words_df.to_csv("./outputs/imdb-words.tsv", sep='\t', header=False, index=False)

# Explore Result

In [64]:
w2v.most_similar(positive=['romance'])

[('romantic', 0.9145389795303345),
 ('drama', 0.88703453540802),
 ('typical', 0.8631957769393921),
 ('charming', 0.8135483860969543),
 ('subtle', 0.8103457689285278),
 ('adds', 0.7703721523284912),
 ('light', 0.7587342858314514),
 ('adventure', 0.7583597898483276),
 ('dramatic', 0.7565268874168396),
 ('strong', 0.7531163692474365)]

In [63]:
w2v.most_similar(positive=['good'])

[('pretty', 0.7499832510948181),
 ('ok', 0.7430225610733032),
 ('cool', 0.7426775097846985),
 ('overall', 0.713636577129364),
 ('really', 0.7088854312896729),
 ('acting', 0.6756110191345215),
 ('bad', 0.6623809933662415),
 ('laughs', 0.648076057434082),
 ('great', 0.6459245681762695),
 ('rest', 0.6380834579467773)]

In [62]:
w2v.most_similar(positive=["actor"])

[('actress', 0.9585381150245667),
 ('role', 0.9257036447525024),
 ('performance', 0.9061633348464966),
 ('roles', 0.8646007776260376),
 ('talented', 0.8518898487091064),
 ('career', 0.8168497085571289),
 ('voice', 0.8128608465194702),
 ('scott', 0.7891513705253601),
 ('plays', 0.7793706655502319),
 ('john', 0.7726655602455139)]

In [60]:
# actor // father
# ? // mother

# actor - father = ? - mother
# ? = (actor + mother) - father

w2v.most_similar(positive=["actor", "mother"], negative=["father"])

[('actress', 0.9597461223602295),
 ('role', 0.9129691123962402),
 ('performance', 0.8907528519630432),
 ('roles', 0.8451204895973206),
 ('talented', 0.8351970911026001),
 ('career', 0.7961633801460266),
 ('voice', 0.7956693172454834),
 ('plays', 0.7613451480865479),
 ('scott', 0.7585989832878113),
 ('john', 0.7364364266395569)]

In [66]:
w2v.most_similar(positive=["man", "girl"], negative=["boy"])

[('woman', 0.9423226118087769),
 ('hand', 0.8159111738204956),
 ('hero', 0.8105056285858154),
 ('turns', 0.8059001564979553),
 ('meets', 0.7977406978607178),
 ('tough', 0.7710468173027039),
 ('evil', 0.7701187133789062),
 ('becomes', 0.7620344161987305),
 ('young', 0.7597670555114746),
 ('whose', 0.756074070930481)]