# WordEmbeddings
In this notebook we'll create a word embedding (using Gensim) representation of a corpus of texts in English. Also, we'll visualize the representation using T-SNE

### Download and read dataset

In [2]:
import wget
import os
import tarfile

filename = "dataset/aclImdb_v1.tar.gz"

dataset_link = "http://ai.stanford.edu/~amaas/data/sentiment/{}".format("aclImdb_v1.tar.gz")
try:
    os.mkdir("dataset")
except OSError:
    pass

if not os.path.isfile(filename):
    file = wget.download(dataset_link, out='dataset/aclImdb_v1.tar.gz')

    tar = tarfile.open(filename, "r:gz")
    tar.extractall("dataset")
    tar.close()

In [3]:
dataset_path = 'dataset/aclImdb'
train_positive_files = ['train/pos/'+f for f in os.listdir(dataset_path+'/train/pos') \
                        if os.path.isfile(os.path.join(dataset_path+'/train/pos', f))]

train_negative_files = ['train/neg/'+f for f in os.listdir(dataset_path+'/train/neg') \
                        if os.path.isfile(os.path.join(dataset_path+'/train/neg', f))]

test_positive_files = ['test/pos/'+f for f in os.listdir(dataset_path+'/test/pos') \
                       if os.path.isfile(os.path.join(dataset_path+'/test/pos', f))]

test_negative_files = ['test/neg/'+f for f in os.listdir(dataset_path+'/test/neg') \
                       if os.path.isfile(os.path.join(dataset_path+'/test/neg', f))]

all_files = list(set().union(train_positive_files,train_negative_files, test_positive_files, test_negative_files))

corpus = []

for file in all_files:
    with open(os.path.join(dataset_path, file), 'r') as text_file:
        corpus.append(text_file.readlines()[0])

### pre-process corpus

In [4]:
import re
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_texts(text):
    text = REPLACE_NO_SPACE.sub("", text.lower())
    text = REPLACE_WITH_SPACE.sub(" ", text)
    
    return text.split()

In [5]:
from multiprocessing import Pool
from math import floor

agents = 4
chunksize = floor(len(corpus)/4)
with Pool(processes=agents) as pool:
    processed_corpus = pool.map(preprocess_texts, corpus, chunksize)

In [6]:
print(processed_corpus[0:2])

[['the', 'beautiful', 'story', 'of', 'stardust', 'is', 'written', 'by', 'by', 'neil', 'gaiman', 'writer', 'of', 'mirrormask', 'and', 'its', 'really', 'a', 'good', 'story', 'i', 'think', 'it', 'would', 'appeal', 'to', 'any', 'labyrinth', 'princess', 'bride', 'or', '10th', 'kingdom', 'fan', 'and', 'yet', 'its', 'totally', 'unique', 'and', 'stands', 'up', 'on', 'its', 'own', 'and', 'i', 'feel', 'the', 'film', 'adaptation', 'of', 'this', 'story', 'has', 'a', 'far', 'better', 'ending', 'than', 'what', 'was', 'presented', 'in', 'the', 'original', 'novel', 'by', 'neil', 'gaiman', 'i', 'wont', 'spoil', 'it', 'for', 'you', 'the', 'main', 'character', 'tristan', 'tristran', 'in', 'the', 'novel', 'is', 'the', 'son', 'of', 'a', 'mortal', 'and', 'a', 'faerie', 'slave', 'kept', 'by', 'a', 'witch', 'in', 'the', 'realm', 'of', 'faerie', 'the', 'story', 'begins', 'in', 'a', 'town', 'near', 'a', 'wall', 'that', 'separates', 'the', 'magical', 'world', 'from', 'the', 'human', 'world', 'when', 'there', 'is

# Creating Word2Vec
See other parameters at https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

In [7]:
from gensim.models import Word2Vec, Doc2Vec
w2vmodel = Word2Vec(sentences=processed_corpus,
                    size=300,
                    min_count=5,
                    workers=3)

In [8]:
w2vmodel.wv.most_similar(positive=['actress', 'man'], negative=['woman'])

  if np.issubdtype(vec.dtype, np.int):


[('actor', 0.7551474571228027),
 ('performer', 0.573062539100647),
 ('role', 0.5399695634841919),
 ('comedian', 0.5142716765403748),
 ('performance', 0.5095571875572205),
 ('artist', 0.47670966386795044),
 ('villain', 0.45249801874160767),
 ('singer', 0.42914724349975586),
 ('oscar', 0.4257918894290924),
 ('achievement', 0.4153568744659424)]

In [9]:
w2vmodel.wv.most_similar('matrix')

[('godzilla', 0.6869165301322937),
 ('phantasm', 0.5817844867706299),
 ('twilight', 0.5758272409439087),
 ('godfather', 0.5639734864234924),
 ('blade', 0.5633026361465454),
 ('hostel', 0.561491847038269),
 ('phantom', 0.5545967817306519),
 ('conan', 0.544568657875061),
 ('wars', 0.5444141626358032),
 ('craze', 0.5411075353622437)]

In [10]:
w2vmodel.wv.get_vector('movie')

array([ 1.25333130e+00,  7.25100219e-01, -3.39769006e-01, -1.11589932e+00,
        4.39446926e-01, -7.25901067e-01,  9.25769925e-01, -4.58653897e-01,
       -1.70235968e+00,  8.19877088e-01,  1.07927121e-01, -4.51163709e-01,
        1.87763906e+00,  7.80995131e-01, -5.41266561e-01, -1.27781188e+00,
        1.20189083e+00, -2.92429864e-01,  1.18349063e+00, -5.95851243e-01,
        3.48397017e-01,  4.78480399e-01,  1.95853043e+00, -1.31650472e+00,
       -9.05684352e-01,  7.20012367e-01, -1.99470270e+00,  1.34943461e+00,
        3.70051146e-01,  1.87643504e+00,  2.03323984e+00,  4.12070066e-01,
        8.53172839e-01, -1.07866945e-03,  6.12409413e-01,  4.72545207e-01,
        1.67925611e-01,  1.57392204e-01,  2.67814130e-01, -6.30028665e-01,
        1.52726376e+00,  8.71291816e-01, -1.98622167e-01, -3.32597882e-01,
        2.53938913e-01,  9.69515920e-01, -5.53712785e-01,  1.38318050e+00,
        1.19589925e+00,  1.99329138e+00,  1.39446747e+00, -2.60554552e-01,
       -2.44315267e-01, -

# Visualizing 

In [None]:
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline  

### get infos from w2v model

In [None]:
vocab = list(w2vmodel.wv.vocab)
X = w2vmodel[vocab]

### project n-dimensional into 2-dimensional space using T-SNE

In [None]:
tsne = TSNE(n_components=2, verbose=1, n_iter=250)
X_tsne = tsne.fit_transform(X)

### create a dataframe and plot

In [None]:
df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])

In [None]:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

ax.scatter(df['x'], df['y'])

In [None]:
for word, pos in df.iterrows():
    ax.annotate(word, pos)