# WordEmbeddings
In this notebook we'll build a word-embedding representation (using Gensim's Word2Vec) of a corpus of English texts, and then visualize that representation with t-SNE.

### Download and read dataset

In [None]:
import wget
import os
import tarfile

# Where the IMDB review archive is stored locally, and where it is fetched from.
filename = "dataset/aclImdb_v1.tar.gz"

dataset_link = "http://ai.stanford.edu/~amaas/data/sentiment/{}".format("aclImdb_v1.tar.gz")

# Create the download directory. Only "already exists" is tolerated; any other
# failure (e.g. permissions) is raised here instead of surfacing later as a
# confusing download error.
try:
    os.mkdir("dataset")
except FileExistsError:
    pass

# Download and unpack only when the archive is not already on disk.
if not os.path.isfile(filename):
    wget.download(dataset_link, out=filename)

    # The context manager closes the archive even if extraction fails.
    # NOTE(review): the archive is fetched over plain HTTP and extractall()
    # trusts the member paths it contains; consider an extraction filter on
    # Python versions that provide one.
    with tarfile.open(filename, "r:gz") as tar:
        tar.extractall("dataset")

In [None]:
dataset_path = 'dataset/aclImdb'


def _list_review_files(subdir):
    """Return sorted '<subdir>/<name>' paths for the regular files in a dataset sub-directory."""
    directory = dataset_path + '/' + subdir
    return sorted(
        subdir + '/' + name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )


train_positive_files = _list_review_files('train/pos')
train_negative_files = _list_review_files('train/neg')
test_positive_files = _list_review_files('test/pos')
test_negative_files = _list_review_files('test/neg')

# Each group carries a different path prefix, so the groups cannot overlap.
# Plain concatenation of the sorted lists keeps the corpus order identical
# from run to run (going through a set made the order arbitrary).
all_files = (
    train_positive_files
    + train_negative_files
    + test_positive_files
    + test_negative_files
)

# One entry per review file: the first line of that file.
corpus = []

for relative_path in all_files:
    # Decode explicitly rather than with the platform default encoding.
    # NOTE(review): assumes the dataset files are UTF-8 -- confirm.
    with open(os.path.join(dataset_path, relative_path), 'r', encoding='utf-8') as text_file:
        # readline() yields '' for an empty file instead of raising IndexError,
        # and does not read the rest of the file just to keep one line.
        corpus.append(text_file.readline())

### Pre-process the corpus

In [None]:
import re
# Punctuation that is deleted outright (e.g. "don't" -> "dont", "end." -> "end").
# Raw strings keep the backslashes for the regex engine instead of relying on
# invalid string escape sequences.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Separators that become a space: a run of one or more HTML line breaks,
# hyphens and slashes. Matching a single <br /> too prevents the slash rule
# from tearing it into "<br" and ">" fragments.
REPLACE_WITH_SPACE = re.compile(r"(?:<br\s*/>)+|-|/")


def preprocess_texts(text):
    """Lower-case *text*, strip punctuation and split it into tokens.

    Args:
        text: The raw review text.

    Returns:
        A list of whitespace-separated tokens; an empty list for empty input.
    """
    without_punctuation = REPLACE_NO_SPACE.sub("", text.lower())
    spaced = REPLACE_WITH_SPACE.sub(" ", without_punctuation)

    return spaced.split()

In [None]:
from multiprocessing import Pool
from math import floor

# Number of worker processes used to tokenize the corpus in parallel.
agents = 4
# Roughly one chunk per worker (ceiling division), derived from `agents`
# rather than a second hard-coded 4. The lower bound of 1 matters when the
# corpus has fewer items than workers: a chunksize of 0 would hand no work
# to the pool.
chunksize = max(1, -(-len(corpus) // agents))
with Pool(processes=agents) as pool:
    # map() preserves input order, so processed_corpus[i] matches corpus[i].
    processed_corpus = pool.map(preprocess_texts, corpus, chunksize)

In [None]:
# Peek at the first two tokenized reviews to sanity-check the preprocessing.
print(processed_corpus[:2])

# Creating Word2Vec
See other parameters at https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

In [None]:
from gensim.models import Word2Vec
# Train word vectors on the tokenized reviews: 100-dimensional embeddings,
# words occurring fewer than 5 times are ignored, 3 worker threads.
# NOTE(review): `size` is the gensim 3.x keyword (gensim >= 4 names it
# `vector_size`) -- confirm against the installed version.
w2vmodel = Word2Vec(processed_corpus, size=100, min_count=5, workers=3)

In [None]:
# Closest words to "movie" in the embedding space; kept as the cell's last
# expression so the notebook displays the result.
w2vmodel.wv.most_similar(positive='movie')

# Visualizing 

In [None]:
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline  

### Get the vocabulary and word vectors from the w2v model

In [None]:
# Every word kept in the trained model's vocabulary.
vocab = list(w2vmodel.wv.vocab)
# Look the vectors up through the model's KeyedVectors (`.wv`): indexing the
# Word2Vec object itself is deprecated. The result has one row per word, in
# the same order as `vocab`.
X = w2vmodel.wv[vocab]

### Project the n-dimensional vectors into 2-dimensional space using t-SNE

In [None]:
# t-SNE settings: two output dimensions, progress logging enabled, and 250
# optimisation iterations.
tsne_settings = {'n_components': 2, 'verbose': 1, 'n_iter': 250}
tsne = TSNE(**tsne_settings)
# Fit on the word vectors and keep their 2-D coordinates.
X_tsne = tsne.fit_transform(X)

### create a dataframe and plot

In [None]:
# One row per vocabulary word (used as the index) holding its 2-D coordinates.
df = pd.DataFrame(data=X_tsne, columns=['x', 'y'], index=vocab)

In [None]:
# A figure with a single axes, drawing every word as a point at its
# projected coordinates.
fig = plt.figure()
ax = fig.add_subplot(111)

ax.scatter(x=df['x'], y=df['y'])

In [None]:
# Label every point with its word; each row of `df` is indexed by the word
# and carries its 'x' / 'y' coordinates.
for word, pos in df.iterrows():
    ax.annotate(word, (pos['x'], pos['y']))

# NOTE(review): under `%matplotlib inline` a figure is rendered when the cell
# that created it finishes, so labels added in this later cell would not be
# shown. Ending the cell with the figure object displays it again with the
# annotations.
fig