In [1]:
import gensim.models
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

from tqdm import tqdm

import pandas as pd
import re
import plotly

import multiprocessing; cores = multiprocessing.cpu_count() # Count the number of cores in a computer

from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling

# Gensim test

In [2]:

# Pre-trained Google News word2vec vectors; queried later (gg.most_similar)
# as a general-English baseline next to the trope-specific models.
# NOTE(review): api.load presumably downloads and caches the model on first
# use — confirm. Consider moving this import up to the imports cell.
import gensim.downloader as api
gg = api.load('word2vec-google-news-300')

# Clean the text

In [3]:
# Read file
# Load the TV Tropes table. The preview further down shows the columns
# TropeID, Trope and Description plus two leftover index columns
# ("Unnamed: 0.1" and "Unnamed: 0").
df = pd.read_csv("../TVTropesData/tropes.csv")

In [4]:
# Keep only the tropes that have a description, then renumber the rows from 0
df = df[df['Description'].notna()].reset_index(drop=True)

In [5]:
# Number of tropes left after removing rows without a description
len(df)

30960

In [6]:
# Peek at the first rows to check the column layout
df.head()

Unnamed: 0.1,Unnamed: 0,TropeID,Trope,Description
0,0,t00001,AbandonedArea,\nAbandoned places make good settings for fict...
1,1,t00002,AbandonedCatchphrase,Catchphrases are a great and simple way to hel...
2,2,t00003,AbandonedHospital,The creepy abandoned hospital/mental instituti...
3,3,t00004,AbandonedHospitalAwakening,"An Abandoned Hospital Awakening is, as the nam..."
4,4,t00005,AbandonedInfoPage,When a work is getting more and more complicat...


In [7]:
# split Trope
# re.findall('[A-Z][a-z]*',df.Trope[0])

In [8]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The input is coerced with ``str()`` once, so non-string values are
    accepted. The typographic apostrophe (U+2019) is folded into the ASCII
    apostrophe first, so that "don’t" is expanded exactly like "don't".

    Args:
        phrase: Text (or any object convertible with ``str``) to expand.

    Returns:
        str: The text with contractions replaced by their expanded forms.

    Note:
        The expansion is purely pattern based: every "'s" becomes " is"
        (possessives included) and every "'d" becomes " would".
    """
    # Coerce once and normalise the curly apostrophe so the patterns below match.
    phrase = str(phrase).replace("\u2019", "'")

    # specific: irregular forms. The first letter is captured so that
    # "Won't"/"Can't" keep their case instead of falling through to the
    # generic n't rule (which would yield "Wo not"/"Ca not").
    phrase = re.sub(r"([Ww])on't", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an't", r"\1an not", phrase)

    # general: applied in order — "n't" has to run before the bare "'t" rule.
    general_rules = (
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in general_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [9]:
# Normalise every description into lowercase alphanumeric words separated by
# single spaces, and store the result in a new "clean_text" column.
preprocessed_text = []
# tqdm draws a progress bar while the descriptions are processed
for description in tqdm(df['Description'].values):
    sent = decontracted(description)
    # Blank out literal backslash escapes (the two characters "\" + "r", etc.)
    for escaped in ('\\r', '\\"', '\\n'):
        sent = sent.replace(escaped, ' ')
    # Every run of non-alphanumeric characters becomes a single space
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    # Re-join on whitespace so that no double spaces remain
    sent = ' '.join(sent.split())
    preprocessed_text.append(sent.lower().strip())

df["clean_text"] = preprocessed_text

100%|██████████████████████████████████| 30960/30960 [00:02<00:00, 11517.96it/s]


In [10]:
# One token list per description (whitespace split of the cleaned text)
sentences = list(map(str.split, df["clean_text"]))

In [11]:
# Source: https://radimrehurek.com/gensim/models/phrases.html

# Each sentence must be a list of string tokens:
# sanity-check the format by printing the first 20 tokens of the first description.
first_sentence = next(iter(sentences))
print(first_sentence[:20])

['abandoned', 'places', 'make', 'good', 'settings', 'for', 'fiction', 'normally', 'seen', 'in', 'fiction', 'that', 'evokes', 'types', 'of', 'horror', 'the', 'concept', 'of', 'a']


In [12]:
# Train the phrase (collocation) model on the full cleaned corpus of trope
# descriptions, not a toy sample.
# connector_words=ENGLISH_CONNECTOR_WORDS presumably lets a phrase span words
# like "in" (cf. 'dragon_in_chief' in the output further down) — confirm
# against the gensim Phrases docs.
phrase_model = Phrases(sentences, min_count=2, connector_words=ENGLISH_CONNECTOR_WORDS)

In [13]:
# text = "Look around the room you're in. Notice anyone with particularly prominent, exaggerated, or high cheekbones? Yes? Chances are, he or she is the one out to get you. Villains seem to have a tendency toward quite noticeable cheekbones. It's far more common in animated works, as facial features are often exaggerated. This trope is quite often seen on The Baroness or the Dragon Lady. It's perhaps because sharper cheekbones lend a face a sharper, more angular look, which can convey intense emotion more easily sometimes, or (as with Lean and Mean) because of the unsettlingly skeletal, deathly appearance it can give you. It's also the opposite of the kind of smoother, softer face known as 'baby-faced'—and since a baby-faced person looks like an innocent child, it follows that the prominent bone structure will look like the opposite of this.Compare Lean and Mean and Evil Is Sexy, if you so desire. Sister Trope to Sinister Schnoz and Thin Chin of Sin. Contrast with the Lantern Jaw of Justice for the good guys."

In [14]:
text = "Just because you're officially in charge, that doesn't always mean you're really in charge. The Dragon-in-Chief is a version of The Dragon who serves as the de facto Big Bad of the story, even if they're supposedly not the one in charge. They're nominally subordinate to another villain, but typically so much smarter, stronger or more skilful, and just as evil if not more so (and almost always scarier) that it's clear who's really the bigger menace. This character tends to have almost no respect for the Big Bad due to their comparative lack of vision, courage or common sense. The main villain, for their part, is relegated to Big Bad Wannabe as they seriously or even fatally overestimate The Dragon's loyalty, or are just too afraid of them to be able to keep them in line. The Dragon-in-Chief is not simply the main villainous driving force behind the plot, even if they did not initiate it, but they are such to the point that the Big Bad is pushed aside or even endangered by them, and rendered less important by comparison. The Hero treats, or comes to treat, The Dragon-In-Chief as the actual main villain of the story, and very often It's Personal with the Dragon. The Dragon-In-Chief will typically they think the Big Bad either lacks ambition, or is just an idiot. They may start off as junior partners in the Big Bad's business: after years of hard (but fun) living as a dangerous felon, they have found themselves steady employment with the Big Bad and hope to take over the business some day or retire on the fortune made from their latest Master Plan. This is when they start to complain about their master’s unambitious and/or just plain incompetent way of running things, though the Big Bad might retort that their way is from experience and The Dragon's ways will ultimately lead to ruin. Occasionally, their warnings turn out to be right. 
It is occasional that the Dragon-In-Chief truly is loyal to the ostensible Big Bad, but in occasions like these their superior is liable to be done in either before the Dragon-in-Chief is, or afterwards in a Post-Climax Confrontation. The Man in Front of the Man is a related, but different trope where the position of the The Dragon as the actual Big Bad is kept hidden until The Reveal, thus The Dragon is less likely to act like a true Dragon-in-Chief in order to keep the jig up from the audience as well as the in-universe characters, including the supposed Big Bad, who might not be aware that they are being manipulated. The Dragon-in-Chief doesn't need this theatre, making their position as the true Big Bad clear from the start. When a character fills this role because the Big Bad is merely physically absent from the main story, and/or not as important to the hero, then the superior is a Greater-Scope Villain (if the Dragon is operating entirely or almost entirely on their own). See also Hyper-Competent Sidekick, Dragon Ascendant, The Starscream and especially the Big Bad Wannabe, whom Dragons-in-Chiefs usually work for. Compare/Contrast The Heavy, which is usually simply The Dragon to an off-screen Greater-Scope Villain, but not necessarily a Dragon-In-Chief in themselves. Important Note: this does not simply refer to any Dragon that is physically superior to a Non-Action Big Bad or Greater-Scope Villain who can be dispatched with relative ease. Plenty of physically weak villains present far greater moral, psychological or other challenges to the hero and their actions drive the character arcs of the protagonists. It is these roles that must be supplanted for a Dragon to truly ascend to Dragon-in-Chief. Please be sure the character actually fits this criteria before adding them as an example."

In [15]:
# Tokenise the sample description with gensim's simple_preprocess.
# NOTE(review): the training corpus was cleaned with decontracted() plus a
# regex, whereas this path leaves contraction fragments such as 're' and
# 'doesn' (visible in the output below). Consider reusing the same cleaning
# here so the tokens match what the phrase model was trained on.
sent = utils.simple_preprocess(text)

In [16]:
# Apply the trained phrases model to the tokenised sample sentence; detected
# phrases come back joined with underscores (e.g. 'big_bad', 'de_facto').
# NOTE(review): the sample is a trope description, so it may already be part
# of the training corpus rather than truly unseen — confirm.
phrase_model[sent]

['just',
 'because',
 'you',
 're',
 'officially',
 'in',
 'charge',
 'that',
 'doesn',
 'always',
 'mean',
 'you',
 're',
 'really',
 'in',
 'charge',
 'the',
 'dragon_in_chief',
 'is',
 'version',
 'of',
 'the',
 'dragon',
 'who',
 'serves_as',
 'the',
 'de_facto',
 'big_bad',
 'of',
 'the',
 'story',
 'even',
 'if',
 'they',
 're',
 'supposedly',
 'not',
 'the',
 'one',
 'in',
 'charge',
 'they',
 're',
 'nominally_subordinate',
 'to',
 'another',
 'villain',
 'but',
 'typically',
 'so_much',
 'smarter_stronger',
 'or',
 'more',
 'skilful',
 'and',
 'just',
 'as',
 'evil',
 'if',
 'not',
 'more',
 'so',
 'and',
 'almost_always',
 'scarier',
 'that',
 'it',
 'clear',
 'who',
 'really',
 'the',
 'bigger',
 'menace',
 'this',
 'character',
 'tends',
 'to',
 'have',
 'almost',
 'no_respect',
 'for',
 'the',
 'big_bad',
 'due',
 'to',
 'their',
 'comparative',
 'lack',
 'of',
 'vision',
 'courage',
 'or',
 'common_sense',
 'the',
 'main',
 'villain',
 'for',
 'their',
 'part',
 'is',
 'r

# Word Embeddings

In [17]:
# https://towardsdatascience.com/word2vec-for-phrases-learning-embeddings-for-more-than-one-word-727b6cf723cf

In [18]:
# Train Word2Vec on the phrase-merged corpus, so multi-word tokens such as
# "big_bad" get their own embedding.
# workers: leave one core free, but never request fewer than one worker —
# `cores - 1` evaluates to 0 on a single-core machine.
# seed: NOTE(review) per the gensim docs a fixed seed only yields fully
# reproducible vectors together with workers=1 and a fixed PYTHONHASHSEED.
bigram = Word2Vec(
    phrase_model[sentences],
    min_count=2,
    window=5,
    vector_size=300,
    workers=max(1, cores - 1),
    epochs=10,
    seed=42,
)

In [19]:
# Train Word2Vec on the plain single-word token lists.
# workers: leave one core free, but never request fewer than one worker —
# `cores - 1` evaluates to 0 on a single-core machine.
# seed: NOTE(review) per the gensim docs a fixed seed only yields fully
# reproducible vectors together with workers=1 and a fixed PYTHONHASHSEED.
# NOTE(review): min_count is 5 here but 2 for the phrase-based model, so the
# two vocabularies are filtered differently — confirm this is intentional.
unigram = Word2Vec(
    sentences,
    min_count=5,
    window=5,
    vector_size=300,
    workers=max(1, cores - 1),
    epochs=10,
    seed=42,
)

In [37]:
# Nearest neighbours of "dragon" in the phrase-aware trope model
bigram.wv.most_similar(['dragon'])

[('lancer', 0.6604410409927368),
 ('wolf', 0.6275288462638855),
 ('squad', 0.6199926137924194),
 ('wannabe', 0.6189258694648743),
 ('big_bad', 0.6147741079330444),
 ('knight', 0.5930625200271606),
 ('rival', 0.5887823104858398),
 ('minion', 0.5824518203735352),
 ('big_guy', 0.5746404528617859),
 ('demon', 0.573767364025116)]

In [38]:
# Nearest neighbours of "dragon" in the single-word trope model
unigram.wv.most_similar(['dragon'])

[('tiger', 0.5185089707374573),
 ('starscream', 0.5106976628303528),
 ('lancer', 0.5036728382110596),
 ('ascendant', 0.46887052059173584),
 ('sidekick', 0.4687366485595703),
 ('demon', 0.46054014563560486),
 ('dragons', 0.4532422423362732),
 ('chessmaster', 0.4402010142803192),
 ('angel', 0.43569180369377136),
 ('nemesis', 0.4220254421234131)]

In [39]:
# Same query against the pre-trained Google News vectors, for comparison
gg.most_similar(['dragon'])

[('dragons', 0.8061283826828003),
 ('minotaur', 0.5826537013053894),
 ('unicorn', 0.5748640894889832),
 ('fairy', 0.5660958886146545),
 ('goblin', 0.563858687877655),
 ('monsters', 0.5638251900672913),
 ('sea_serpent', 0.5596416592597961),
 ('breathing_dragon', 0.5569871068000793),
 ('winged_dragon', 0.5439309477806091),
 ('serpent', 0.5434231162071228)]

# Save so I can convert to a tensor

In [48]:
# Persist only the word vectors of the phrase-aware model.
# NOTE(review): gensim's word2vec2tensor script reads word2vec-format files
# (KeyedVectors.load_word2vec_format); the native format written by .save()
# may need save_word2vec_format instead — confirm before converting.
# The ../models directory must already exist.
bigram.wv.save('../models/bigram_word2vec.kv')

In [41]:
# Persist the full phrase-aware Word2Vec model (not just its vectors)
bigram.save("../models/bigram_word2vec.model")

In [42]:
# Persist the full single-word Word2Vec model (not just its vectors)
unigram.save("../models/unigram_word2vec.model")

In [50]:
# Persist only the word vectors of the single-word model.
# NOTE(review): gensim's word2vec2tensor script reads word2vec-format files
# (KeyedVectors.load_word2vec_format); the native format written by .save()
# may need save_word2vec_format instead — confirm before converting.
unigram.wv.save('../models/unigram_word2vec.kv')

Source: https://radimrehurek.com/gensim/scripts/word2vec2tensor.html

# Visualization

In [23]:
# #Source: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#visualising-word-embeddings

# def reduce_dimensions(model):
#     num_dimensions = 2  # final num dimensions (2D, 3D, etc)

#     # extract the words & their vectors, as numpy arrays
#     vectors = np.asarray(model.wv.vectors)
#     labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

#     # reduce using t-SNE
#     tsne = TSNE(n_components=num_dimensions, random_state=0)
#     vectors = tsne.fit_transform(vectors)

#     x_vals = [v[0] for v in vectors]
#     y_vals = [v[1] for v in vectors]
#     return x_vals, y_vals, labels


In [24]:
# x_vals, y_vals, labels = reduce_dimensions(bigram)


In [25]:
# def plot_with_plotly(x_vals, y_vals, labels, n, plot_in_notebook=True):
#     from plotly.offline import init_notebook_mode, iplot, plot
#     import plotly.graph_objs as go
    
#     x_vals = x_vals[0:n]
#     y_vals = y_vals[0:n]
#     labels = labels[0:n]
    
#     trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
#     data = [trace]

#     if plot_in_notebook:
#         init_notebook_mode(connected=True)
#         iplot(data, filename='word-embedding-plot')
#     else:
#         plot(data, filename='word-embedding-plot.html')


# def plot_with_matplotlib(x_vals, y_vals, labels, n):
#     import matplotlib.pyplot as plt
#     import random
    
#     x_vals = x_vals[0:n]
#     y_vals = y_vals[0:n]
#     labels = labels[0:n]
    
#     random.seed(0)

#     plt.figure(figsize=(12, 12))
#     plt.scatter(x_vals, y_vals)

#     #
#     # Label randomly subsampled 25 data points
#     #
#     indices = list(range(len(labels)))
#     selected_indices = random.sample(indices, 25)
#     for i in selected_indices:
#         plt.annotate(labels[i], (x_vals[i], y_vals[i]))


In [26]:
# try:
#     get_ipython()
# except Exception:
#     plot_function = plot_with_matplotlib
# else:
#     plot_function = plot_with_plotly

# plot_function(x_vals, y_vals, labels, n=1000)

In [27]:
# #Source: https://www.kaggle.com/code/yixuanzhou94/gensim-word-vector-visualization/notebook
# def display_pca_scatterplot(model,words=None,sample=0):
#     if words == None:
#         if sample > 0:
#             words = np.random.choice(list(model.vocab.keys()),sample)
#         else:
#             words = [word for word in model.key]
    
#     word_vectors = np.array([model[w] for w in words])
#     twodim = PCA().fit_transform(word_vectors)[:,:2]
    
#     plt.figure(figsize=(6,6))
#     plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
#     for word, (x,y) in zip(words, twodim):
#         plt.text(x+0.05, y+0.05, word)