In [None]:
import pandas as pd
import numpy as np
import gensim

In [None]:
# Load both topic CSVs; the 'Unnamed: 0' column is discarded right after reading.
depression = (
    pd.read_csv('data/depression_topics.csv', lineterminator='\n', low_memory=False)
    .drop(columns=['Unnamed: 0'])
)

anxiety = (
    pd.read_csv('data/anxiety_topics.csv', lineterminator='\n', low_memory=False)
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Convert the `date` column of both frames to pandas datetimes.
for frame in (depression, anxiety):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# documents = depression.text_title
# tokenized_docs = [gensim.utils.simple_preprocess(d) for d in documents]

In [4]:
import io
import itertools
import numpy as np
import os
import re
import string
import tensorflow as tf
import tqdm

from keras.utils import np_utils
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dot, Embedding, Flatten, GlobalAveragePooling1D, Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.preprocessing.text import Tokenizer


In [5]:
# Seed handed to generate_training_data for negative sampling.
SEED = 42
# Lets tf.data choose prefetch buffer sizes itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [6]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Build skip-gram training examples with negative sampling.

    For every sequence, positive (target, context) skip-gram pairs are
    generated; for each pair, `num_ns` negative context ids are drawn with a
    log-uniform candidate sampler.

    Args:
        sequences: Iterable of integer-encoded token sequences.
        window_size: Window size forwarded to `skipgrams`.
        num_ns: Number of negative samples drawn per positive pair.
        vocab_size: Vocabulary size; sizes the sampling table and is the
            exclusive upper bound (`range_max`) for sampled ids.
        seed: Seed forwarded to the negative sampler.

    Returns:
        Three parallel lists `(targets, contexts, labels)`. Each `targets`
        entry is a target word id, each `contexts` entry is a tensor holding
        the positive context id followed by the `num_ns` negatives, and each
        `labels` entry is the tensor `[1, 0, ..., 0]` of length `num_ns + 1`.
    """
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # The label vector is the same for every example (positive first, then
    # the negatives), so it is built once instead of once per pair.
    label = tf.constant([1] + [0] * num_ns, dtype="int64")

    # Iterate over all sequences (sentences) in dataset.
    for sequence in tqdm.tqdm(sequences):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training examples
        # with positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                # Use the caller's seed; previously the module-level SEED was
                # used and this argument was silently ignored.
                seed=seed,
                name="negative_sampling")

            # Build the context vector (for one target word).
            negative_sampling_candidates = tf.expand_dims(
                negative_sampling_candidates, 1)
            context = tf.concat([context_class, negative_sampling_candidates], 0)

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [54]:
# Tokenize every entry of anxiety.text_title with gensim's simple_preprocess.
documents = anxiety.text_title
tokenized_docs = list(map(gensim.utils.simple_preprocess, documents))

In [55]:
# Token -> integer id map; id 0 is reserved for the padding token.
vocab = {'<pad>': 0}
index = 1  # real tokens are numbered from 1, in order of first appearance
for token in (tok for doc in tokenized_docs for tok in doc):
    if token in vocab:
        continue
    vocab[token] = index
    index += 1


In [56]:
# Number of entries in the hand-built vocabulary (the '<pad>' entry included).
vocab_size1 = len(vocab)
# Echo the size as the cell output.
vocab_size1
# inverse_vocab = {index: token for token, index in vocab.items()}


70473

In [None]:
# for seq in sequences[:5]:
#     print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

In [57]:
# Re-join each document's tokens into one space-separated string.
corpus = [' '.join(doc_tokens) for doc_tokens in tokenized_docs]

In [58]:
# a = pd.Series(cleaned_text)
# Longest document, measured in whitespace-separated tokens.
a = pd.Series(corpus)
tokens_per_doc = a.str.split().str.len()
sequence_length = tokens_per_doc.max()

In [96]:
def _is_non_empty(text):
    """Truthy for strings whose length is non-zero."""
    return tf.cast(tf.strings.length(text), bool)


# Stream the corpus through tf.data, dropping empty strings.
dataset = tf.data.Dataset.from_tensor_slices(corpus).filter(_is_non_empty)
print(type(dataset))
# <class 'tensorflow.python.data.ops.dataset_ops.FilterDataset'>

<class 'tensorflow.python.data.ops.dataset_ops.FilterDataset'>


In [97]:
# Reuse the hand-built vocabulary size as the layer's token cap.
vocab_size = vocab_size1

# Integer-encode text, with `sequence_length` as the output length of every
# example. Custom standardization is disabled, so the layer default applies.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Learn the vocabulary from the text dataset, 10000 strings per batch.
text_batches = dataset.batch(10000)
vectorize_layer.adapt(text_batches)

In [98]:
# Vocabulary list taken from the adapted layer.
inverse_vocab = vectorize_layer.get_vocabulary()
# Show the first 55 entries.
print(inverse_vocab[:55])

['', '[UNK]', 'to', 'and', 'the', 'my', 'it', 'of', 'that', 'me', 'in', 'but', 'for', 'have', 'this', 'is', 'anxiety', 'with', 'was', 'so', 'just', 'like', 'on', 'about', 'or', 'can', 'not', 'feel', 'be', 'you', 'do', 'at', 've', 'get', 'if', 'out', 'don', 'been', 'what', 'had', 'all', 'when', 'as', 'up', 'know', 'because', 'am', 'time', 'really', 'how', 'now', 'they', 'from', 'she', 'he']


In [99]:
def vectorize_text(text):
    """Run `text` through `vectorize_layer` and squeeze the result.

    A trailing axis is added to `text` before the layer is applied; all
    size-1 axes are removed from the layer's output.
    """
    as_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(as_column)
    return tf.squeeze(token_ids)

# Vectorize the strings in `dataset`: batch them, apply the layer, then
# unbatch so each element is a single example again.
batched_text = dataset.batch(10000).prefetch(AUTOTUNE)
text_vector_ds = batched_text.map(vectorize_layer).unbatch()

In [100]:
# Materialise the whole vectorized dataset as an in-memory list.
sequences = list(text_vector_ds.as_numpy_iterator())
# Report how many sequences were collected.
print(len(sequences))

126724


In [101]:
# Skip-gram examples: window of 2, with 4 negative samples per positive pair.
targets, contexts, labels = generate_training_data(
    sequences, window_size=2, num_ns=4, vocab_size=vocab_size, seed=SEED
)

100%|██████████| 126724/126724 [16:03<00:00, 131.47it/s]


In [102]:
# The three lists are parallel, so their lengths should match.
print(*(len(part) for part in (targets, contexts, labels)))

11745937 11745937 11745937


In [103]:
# NOTE(review): bare integer literal with no assignment; it only echoes itself
# as the cell output. Purpose unclear — confirm or remove.
33708212

33708212

In [105]:
BATCH_SIZE = 10000
BUFFER_SIZE = 100000

# Model input is the (target, context) pair; labels are the training target.
# NOTE: this rebinds `dataset`, which earlier held the raw-text pipeline.
features = (targets, contexts)
dataset = tf.data.Dataset.from_tensor_slices((features, labels))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((10000,), (10000, 5, 1)), (10000, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [106]:
# Cache the batched examples, then prefetch with an auto-tuned buffer.
dataset = dataset.cache()
dataset = dataset.prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((10000,), (10000, 5, 1)), (10000, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [107]:
class Word2Vec(Model):
    """Skip-gram word2vec model with separate target and context embeddings.

    The model embeds a target word and a set of candidate context words and
    returns the dot product of the target embedding with each context
    embedding.

    Args:
        vocab_size: Number of rows in both embedding tables.
        embedding_dim: Width of each embedding vector.
        num_ns: Number of negative samples per positive context; the context
            input is declared with length `num_ns + 1`. Defaults to 4, the
            value used when the training data is generated. This used to be
            read from a module-level `num_ns` that is only defined in a later
            cell, so the class failed unless that cell had already run.
    """

    def __init__(self, vocab_size, embedding_dim, num_ns=4):
        super().__init__()
        # Named so the trained table can be fetched with get_layer().
        self.target_embedding = Embedding(vocab_size,
                                          embedding_dim,
                                          input_length=1,
                                          name="w2v_embedding")
        self.context_embedding = Embedding(vocab_size,
                                           embedding_dim,
                                           input_length=num_ns + 1)
        # Contract axis 3 of the first input with axis 2 of the second.
        self.dots = Dot(axes=(3, 2))
        self.flatten = Flatten()

    def call(self, pair):
        """Score each candidate context against the target.

        Args:
            pair: `(target, context)` tuple of integer id tensors.
                NOTE(review): the Dot axes presumably assume a context of
                shape (batch, num_ns + 1, 1) and a target embedding of rank 3
                — confirm against the training dataset.

        Returns:
            The flattened dot products of the context and target embeddings.
        """
        target, context = pair
        we = self.target_embedding(target)
        ce = self.context_embedding(context)
        dots = self.dots([ce, we])
        return self.flatten(dots)
    
    


In [108]:
def custom_loss(x_logit, y_true):
    """Element-wise sigmoid cross-entropy between logits and labels.

    Args:
        x_logit: Raw (pre-sigmoid) scores.
        y_true: Target labels, same shape as `x_logit`.

    Returns:
        The result of `tf.nn.sigmoid_cross_entropy_with_logits`.

    NOTE(review): the argument order here is (logits, labels), the reverse of
    the `fn(y_true, y_pred)` order Keras uses for loss callables — confirm
    before passing this function to `compile(loss=...)`.
    """
    per_element_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=y_true,
        logits=x_logit,
    )
    return per_element_loss

In [109]:
embedding_dim = 200
# Negatives per positive pair; keep equal to the num_ns given to
# generate_training_data.
num_ns = 4

word2vec = Word2Vec(vocab_size, embedding_dim)

# The model outputs raw dot products, hence from_logits=True.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
word2vec.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

In [110]:
# TensorBoard callback that logs into the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [114]:
# word2vec.fit(dataset, epochs=15, callbacks=[tensorboard_callback])

In [None]:
%tensorboard --logdir logs


In [112]:
# Pull the target-embedding matrix and the matching vocabulary list.
# NOTE(review): the word2vec.fit(...) call in this notebook is commented out,
# so these weights depend on training done earlier in the kernel session —
# confirm before relying on a fresh run.
w2v_layer = word2vec.get_layer('w2v_embedding')
weights = w2v_layer.get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [113]:
# Export the embeddings as two line-aligned files: vectors.txt holds one
# tab-separated vector per line, metadata.txt holds the matching word.
# Context managers ensure both files are closed even if the loop raises.
with io.open('vectors.txt', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.txt', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")