# Word2Vec 

##### Import libraries

In [1]:
import pandas as pd

import numpy as np
from numpy import array
from numpy import zeros

import matplotlib.pyplot as plt

import re
import multiprocessing

# Train, test, split
from sklearn.model_selection import train_test_split

# For handling imbalanced classes
from collections import Counter
from imblearn.over_sampling import SMOTE

# Word2Vec
import gensim 
import gensim.models
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

##### Load data

In [2]:
# Load the preprocessed posts; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the CSV
# was written with its index -- consider index_col=0 if nothing downstream uses that column.
posts = pd.read_csv('../data/posts-preprocessed.csv')

In [3]:
# Preview the first five rows of the loaded frame.
posts.head(n=5)

Unnamed: 0,author,subreddit,timeframe,text,datetime,words,word_stems
0,sub21036,bulimia,pre-covid,['how can i stop hating myself i have been on...,2017-12-02 16:36:16,"['[', ""'how"", 'stop', 'hating', 'eating', 'dis...","['[', ""'how"", 'stop', 'hating', 'eating', 'dis..."
1,sub12048,bulimia,pre-covid,['new guy here 1 month on it 16m hi guys just...,2017-12-05 19:45:25,"['[', ""'new"", 'guy', '1', 'month', '16m', 'hi'...","['[', ""'new"", 'guy', '1', 'month', '16m', 'hi'..."
2,sub6523,bulimia,pre-covid,['so i just vomited blood what can i eat while...,2017-12-06 16:58:16,"['[', ""'so"", 'vomited', 'blood', 'eat', 'throa...","['[', ""'so"", 'vomited', 'blood', 'eat', 'throa..."
3,sub37858,bulimia,pre-covid,['recovery is expensive during recovery hi i...,2017-12-07 14:07:27,"['[', ""'recovery"", 'expensive', 'recovery', 'h...","['[', ""'recovery"", 'expensive', 'recovery', 'h..."
4,sub21036,bulimia,pre-covid,['extreme tongue pain i purged a few nights a...,2017-12-08 20:16:50,"['[', ""'extreme"", 'tongue', 'pain', 'purged', ...","['[', ""'extreme"", 'tongue', 'pain', 'purged', ..."


##### Binarize targets using get_dummies

Each subreddit except `mentalhealth` will be used as a binary target.

In [4]:
# One-hot encode the `subreddit` label into `subreddit_<name>` indicator columns.
# The guard makes the cell idempotent: once `subreddit` has been consumed, running
# the cell again is a no-op instead of raising KeyError.
if 'subreddit' in posts.columns:
    posts = pd.get_dummies(posts, columns=['subreddit'])

In [5]:
# Remove the mentalhealth indicator so it is not used as a target.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps the cell
# safe to re-run: a second execution finds the column already gone and does nothing.
posts = posts.drop(columns='subreddit_mentalhealth', errors='ignore')

##  Create Word2Vec model

Using the **Continuous Bag of Words (CBOW) model**: CBOW predicts the current word from the context words within a specific window. The input layer contains the context words and the output layer contains the current word. The size of the hidden layer is the number of dimensions used to represent each word.

https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/

In [None]:
# Ask the OS how many CPU cores this machine exposes; the value is shown as the
# cell output and reused further down when configuring the Word2Vec workers.
cores = multiprocessing.cpu_count()
cores  # bare last expression -> rendered as the cell's output

Parameters: 
- size: (default 100) The number of dimensions of the embedding, i.e. the length of the dense vector used to represent each token (word). This parameter was renamed to `vector_size` in gensim 4.0.
- window: (default 5) The maximum distance between a target word and words around the target word.
- min_count: (default 5) The minimum count of words to consider when training the model; words with an occurrence less than this count will be ignored.
- workers: (default 3) The number of threads to use while training.
- sg: (default 0 or CBOW) The training algorithm, either CBOW (0) or skip gram (1).

Code help from:
- https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
- https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/

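The word2vec algorithm processes documents sentence by sentence. This means we will preserve the sentence-based structure during cleaning. Note that in the code below each post is passed to the model as a single "sentence" (one list of tokens per post).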

In [None]:
# Show only the first row of the current frame.
posts.iloc[:1]

### Training a word2vec model on reddit posts

Each post is tokenized with Gensim's `simple_preprocess`, so that the corpus yields one tokenized document (a list of UTF-8 words) after another. https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

In [None]:
class MyCorpus:
    """Restartable iterable that yields one tokenized document (list of str) at a time.

    Each element of the text source is passed through gensim's
    ``utils.simple_preprocess`` and the resulting token list is yielded. Every
    call to ``iter()`` starts a fresh pass, so the same instance can be
    consumed more than once.

    Parameters
    ----------
    texts : iterable of str, optional
        Documents to tokenize. When omitted, the notebook-level
        ``posts['text']`` column is used and is looked up at iteration time,
        so ``MyCorpus()`` always reflects the current ``posts`` frame.
    """

    def __init__(self, texts=None):
        self.texts = texts

    def __iter__(self):
        # Resolve the default lazily so that a later rebinding of `posts` is picked up.
        documents = posts['text'] if self.texts is None else self.texts
        for document in documents:
            # One post per element; the whole post becomes a single token list.
            yield utils.simple_preprocess(document)

In [None]:
# MyCorpus.__iter__ is a generator method, so each iter() call on `sentences`
# starts a fresh pass over the post texts.
sentences = MyCorpus()

In [None]:
# Train a CBOW (sg=0, the gensim default) Word2Vec model on the tokenized posts.
# Leave one core free for the rest of the system, but never drop below one worker:
# on a single-core machine `cores - 1` would be 0, i.e. no training threads at all.
n_workers = max(1, cores - 1)
model = gensim.models.Word2Vec(sentences=sentences, sg=0, workers=n_workers)

After the model is fit, we print the size of the learned vocabulary

In [None]:
# summarize vocabulary size in model
# gensim 4.0 removed `KeyedVectors.vocab` (accessing it raises AttributeError) in
# favour of `key_to_index`; support both so the cell runs on gensim 3.x and 4.x.
word_vectors = model.wv
if hasattr(word_vectors, 'key_to_index'):
    vocab = list(word_vectors.key_to_index)  # gensim >= 4.0
else:
    vocab = list(word_vectors.vocab)  # gensim 3.x
print('Vocabulary size: %d' % len(vocab))

Finally, we save the learned embedding vectors to a file by calling `save_word2vec_format()` on the model's `wv` (word vectors) attribute. The embedding is saved in ASCII format with one word and its vector per line.

In [8]:
# Output path for the exported embeddings (relative to the current working directory).
filename = 'embedding_word2vec.txt'

In [16]:
# Save the learned word vectors in ASCII (word2vec) format.
# The call is made on `model.wv`, so it is the word vectors that are written to
# `filename`; binary=False selects the plain-text variant of the format.
model.wv.save_word2vec_format(filename, binary=False)