
This notebook contains all the code used to generate embedding models for mental-health-related Twitter data. The gensim package
is used with Word2Vec and FastText to build the embeddings. The dataset is cleaned according to each model's
specification. The notebook relies on functions in the preprocessing and utility scripts, which can be found in the scripts folder. For more details on each model, please refer to the appendix of the corresponding paper.


In [13]:
import sys
sys.path.insert(0, 'scripts/')

In [14]:
import utility
import preprocessing as pp

In [15]:
import pandas as pd
import numpy as np
import string
import re
from emoji import UNICODE_EMOJI
import multiprocessing
from multiprocessing import cpu_count

In [16]:
import gensim as g
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import FastText
import gensim.downloader as api
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import PCA


In [17]:
# Fetch the NLTK resources requested by this notebook and build the shared
# helper objects/constants used by later cells.
nltk.download('stopwords')
# English stop-word list as returned by nltk.corpus.stopwords.words.
stopwords = nltk.corpus.stopwords.words('english')
nltk.download('wordnet')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()
# CPU count reported by multiprocessing; not referenced again in the visible cells.
cores = multiprocessing.cpu_count()
# Default index/token that EmbeddingModel.getWordIndex assigns to the
# out-of-vocabulary entry.
NOOCC_INDEX = 0
NOOCC_TOKEN = 'NOOCC'
# NOTE(review): MODEL_NUM is not referenced by any visible cell.
MODEL_NUM = 4
# Values of the 'Abbreviation' column, coerced to str and stripped of
# surrounding whitespace.
abbreviations = pd.read_csv('data/other/abbreviations.csv')['Abbreviation'].tolist()
abbreviations = [str(a).strip() for a in abbreviations]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gregoryverghese/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gregoryverghese/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gregoryverghese/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
# Candidate hyper-parameter values; only one entry of each list is selected
# into emArgs at the bottom of this cell.
min_count = [2, 3, 4, 5]
window = [2, 3 , 4, 5]
size = [25, 50, 100, 200]
# Scalar settings forwarded unchanged to the Gensim model constructor.
sample=6e-5
alpha=0.05
min_alpha=0.0007 
negative=20
# Values passed as the `sg` argument of the Gensim constructor
# (sgTrain for the "SG" models, cbowTrain for the "CB" models).
sgTrain = 1
cbowTrain = 0
# Cleaning schedules: lists of step names handed to SocialPreProcessing.clean.
# m1* differ from m2* by one extra step ('Emoticons' or 'Lowercase').
m1Clean=['Tokens', 'Lemma', 'Stopwords', 'Phrases', 'Emoticons']
m2Clean=['Tokens', 'Lemma', 'Stopwords', 'Phrases']
m1LClean=['Tokens', 'Lemma', 'Stopwords', 'Phrases', 'Lowercase']
# NOTE(review): m2LClean is identical to m2Clean.
m2LClean=['Tokens', 'Lemma', 'Stopwords', 'Phrases']
# Positional argument list read by EmbeddingModel.getEmbeddings as
# [min_count, window, size, sample, alpha, min_alpha, negative];
# here min_count=2, window=2, size=200.
emArgs = [min_count[0], window[0], size[3], sample, alpha, min_alpha, negative]

## Data Load

In [19]:
# Load the 'Tweet' column of the non-annotated schizophrenia dataset via
# pp.getFile (defined in the scripts folder, not visible here).
fileSchiz = 'data/dataIn/schiz/nonAnnFinalSchiz.csv'
textSchiz = pp.getFile(fileSchiz, 'Tweet')

In [20]:
# Load the 'Tweet' column of the non-annotated stigma dataset via pp.getFile.
fileStigma = 'data/dataIn/stigma/nonAnnFinalStig.csv'
textStigma = pp.getFile(fileStigma, 'Tweet')

In [21]:
# Schizophrenia tweets followed by stigma tweets; pd.concat keeps each input's
# original index labels (no ignore_index), so labels may repeat.
textAll = pd.concat((textSchiz, textStigma))

## Word2Vec and FastText Models

In [45]:
class EmbeddingModel():
    '''
    Train, save and query an embedding model built with the Gensim library.

    library is the Gensim model class to instantiate (the notebook passes
    Word2Vec or FastText), tokens is the tokenised corpus handed to
    build_vocab/train, and name is the path the trained model is saved to.
    '''
    def __init__(self, library, tokens, name):
        self.lib = library
        self.tokens = tokens
        self.name = name
        self.model = None
        # Set once getWordIndex has inserted the OOV row into model.wv.vectors.
        self._oovInserted = False

    def getEmbeddings(self, trainMethod, emArgs, workers=20, epochs=30):
        '''
        Train a model, save it under self.name and keep it in self.model.

        trainMethod is passed as the `sg` constructor argument. emArgs is a
        sequence read positionally as
        [min_count, window, size, sample, alpha, min_alpha, negative].
        workers and epochs default to the values that were previously
        hard-coded (20 and 30).
        '''
        print(self.lib)

        model = self.lib(min_count=emArgs[0], window=emArgs[1], size=emArgs[2],
                         sample=emArgs[3], alpha=emArgs[4], min_alpha=emArgs[5],
                         negative=emArgs[6], workers=workers, sg=trainMethod)
        model.build_vocab(self.tokens, progress_per=10000)
        model.train(self.tokens, total_examples=model.corpus_count, epochs=epochs, report_delay=1)
        model.init_sims(replace=True)
        self.save(model)
        self.model = model
        # A freshly trained model has no OOV row yet.
        self._oovInserted = False

    def save(self, model):
        '''
        Save model to the path stored in self.name.
        '''
        model.save(self.name)

    def getSimilar(self, model, word):
        '''
        Return self.model.wv.most_similar(positive=[word]).

        The model argument is not used; it is kept so existing calls that
        pass it keep working.
        '''
        return self.model.wv.most_similar(positive=[word])

    def getWordIndex(self, newWord=NOOCC_TOKEN, newIndex=NOOCC_INDEX ):
        '''
        Return a {word: row index} dict for the trained vocabulary with an
        extra out-of-vocabulary entry newWord at position newIndex.

        On the first call a row holding the mean of all vectors is inserted
        into self.model.wv.vectors at newIndex. Later calls on the same
        trained model reuse that row instead of inserting another one, which
        would otherwise shift the vectors away from the returned indices.
        Callers are expected to keep using the same newIndex.
        '''
        wv = self.model.wv
        vocabIndex = {k: v.index for k, v in wv.vocab.items()}

        # getattr keeps this working for instances created without __init__.
        if not getattr(self, '_oovInserted', False):
            wv.vectors = np.insert(wv.vectors, [newIndex], wv.vectors.mean(0), axis=0)
            self._oovInserted = True

        # Every word at or after the inserted row moves down by one.
        wordIndex = {word: (index+1) if index >= newIndex else index
                     for word, index in vocabIndex.items()}
        wordIndex[newWord] = newIndex

        return wordIndex

    def getIndexData(self, xText, labels, wordIndex):
        '''
        Map every token of every sequence in xText to its index in wordIndex,
        falling back to wordIndex[NOOCC_TOKEN] for unknown tokens.

        Returns (np.array(indexed sequences), np.array(labels)).
        '''
        xTrain = [[wordIndex[tok] if tok in wordIndex else wordIndex[NOOCC_TOKEN] for tok in s]
                  for s in xText]
        return (np.array(xTrain), np.array(labels))

    def countMissing(self, text, wordIndex):
        '''
        Count the tokens in text (a sequence of token sequences) that are not
        keys of wordIndex.
        '''
        return sum(1 for s in text for tok in s if tok not in wordIndex)

## Model Preparation

In [52]:
def initializePreProcessing(text, tokenType, cleanMethods=None):
    '''
    Build a pp.SocialPreProcessing object for text and run its cleaning steps.

    text and tokenType are passed straight to pp.SocialPreProcessing (the
    notebook passes False for word models and True for character models).
    cleanMethods is the list of step names handed to clean(); when omitted
    it is ['Tokens', 'Lemma', 'Stopwords', 'Phrases', 'Emoticons'].
    Returns whatever clean() returns.
    '''
    if cleanMethods is None:
        # Build the default per call so one list object is never shared
        # between calls (the previous list default was).
        cleanMethods = ['Tokens', 'Lemma', 'Stopwords', 'Phrases', 'Emoticons']

    social = pp.SocialPreProcessing(text, tokenType)
    socialClean = social.clean(cleanMethods)
    return socialClean

def initializeEmbedding(library, tokens, trainMethod, name, emArg):
    '''
    Create an EmbeddingModel for (library, tokens, name), train it with
    trainMethod and emArg, and return the trained wrapper object.
    '''
    embedder = EmbeddingModel(library, tokens, name)
    embedder.getEmbeddings(trainMethod, emArg)
    return embedder


def getEmbeddings(libraries, embeddingText, tokenTypes, cleanSchedule, trainSchedule, names, modelNum, emArgs):
    '''
    Preprocess embeddingText once per model, train every model and build
    each model's word index.

    The i-th model uses libraries[i], tokenTypes[i], cleanSchedule[i],
    trainSchedule[i], names[i] and emArgs[i]; at most modelNum models are
    built. Returns (list of word-index dicts, list of EmbeddingModel objects).

    Lists are built eagerly: with map() under Python 3 nothing was trained
    until the iterators were consumed, and the returned emModels iterator
    was already exhausted by the index step.
    '''
    # All preprocessing runs first, then all training, then all indexing.
    embeddingTexts = [initializePreProcessing(text, tokenType, cleanMethods)
                      for text, tokenType, cleanMethods
                      in zip([embeddingText]*modelNum, tokenTypes, cleanSchedule)]
    emModels = [initializeEmbedding(library, tokens, trainMethod, name, emArg)
                for library, tokens, trainMethod, name, emArg
                in zip(libraries, embeddingTexts, trainSchedule, names, emArgs)]
    indicies = [emModel.getWordIndex() for emModel in emModels]
    return indicies, emModels


def getWordModels(libraries, embeddingText, cleanSchedule, trainSchedule, wordNames, emArgs):
    '''
    Train one word-level embedding model per entry of cleanSchedule.

    tokenType is False for every model and the same emArgs list is used for
    each of them. Returns (word indices, word models), mirroring
    getCharModels. Previously the second item was modelNum, although every
    caller unpacks it as the trained models.
    '''
    modelNum = len(cleanSchedule)
    tokenTypes = [False]*modelNum
    emArgs = modelNum*[emArgs]
    wordIndicies, wordModels = getEmbeddings(libraries, embeddingText, tokenTypes, cleanSchedule, trainSchedule, wordNames, modelNum, emArgs)
    return wordIndicies, wordModels
    
def getCharModels(libraries, embeddingText, cleanSchedule, trainSchedule, charNames, emArgs):
    '''
    Train one character-level embedding model per entry of cleanSchedule.

    tokenType is True for every model and the same emArgs list is repeated
    for each of them. Returns the (indices, models) pair from getEmbeddings.
    '''
    count = len(cleanSchedule)
    indices, models = getEmbeddings(
        libraries,
        embeddingText,
        [True]*count,
        cleanSchedule,
        trainSchedule,
        charNames,
        count,
        count*[emArgs],
    )
    return indices, models


def initializeEmoticonModels(libraries, embeddingText, cleanSchedule, trainSchedule, args, wNames, cNames):
    '''
    Run getWordModels (with wNames) and then getCharModels (with cNames) on
    the same text, schedules and args.

    Returns a tuple holding the second item of each helper's result, word
    result first.
    '''
    wordResult = getWordModels(libraries, embeddingText, cleanSchedule, trainSchedule, wNames, args)
    charResult = getCharModels(libraries, embeddingText, cleanSchedule, trainSchedule, cNames, args)

    _, fromWords = wordResult
    _, fromChars = charResult
    return (fromWords, fromChars)


def initializeLowerModels(libraries, embeddingText, cleanSchedule, trainSchedule, emArgs, wNames, cNames):
    '''
    Run getWordModels (with wNames) and then getCharModels (with cNames) for
    the capitalisation experiments.

    Returns a tuple holding the second item of each helper's result, word
    result first.
    '''
    wordResult = getWordModels(libraries, embeddingText, cleanSchedule, trainSchedule, wNames, emArgs)
    charResult = getCharModels(libraries, embeddingText, cleanSchedule, trainSchedule, cNames, emArgs)

    _, fromWords = wordResult
    _, fromChars = charResult
    return (fromWords, fromChars)


def getEmoticonText(tokens, emojis):
    '''
    Keep only the token sequences that contain at least one member of emojis.

    tokens is converted with np.array and indexed with a boolean mask, so
    the result is a numpy array of the selected entries.
    '''
    # One flag per tweet: does any of its tokens appear in emojis?
    hasEmoji = [any(tok in emojis for tok in tweet) for tweet in tokens]

    mask = np.array(hasEmoji)
    return np.array(tokens)[mask]

## Mental health Stigma Dataset - Not in project

In [20]:
# Four stigma models: the schedule expands to [cbow, sg, cbow, sg], all cleaned
# with m2Clean, the first two built with Word2Vec and the last two with FastText.
trainSchedule = [cbowTrain, sgTrain,] * 2
cleanSchedule = [m2Clean, m2Clean] *2
libraries = [Word2Vec]* 2 + [FastText] * 2
# Same values as emArgs: min_count=2, window=2, size=200.
args = [min_count[0], window[0], size[3], sample, alpha, min_alpha, negative]

In [21]:
# Output names for the four stigma models (word-level and character-level).
# NOTE(review): the names read CB, CB, SG, SG while trainSchedule in the
# previous cell is [cbow, sg, cbow, sg] - confirm which is intended.
wordNames = ['w2vWCBStig', 'w2vWCBEmStig', 'ftWSGStig', 'ftWSGEmStig']
charNames = ['w2vCCBStig', 'w2vCCBEMStig', 'ftCSGStig', 'ftCSGEmStig']
paths = ['embeddings/Word2Vec/stigma/']*2 + ['embeddings/FastText/stigma/']*2 
# NOTE(review): getFilePath is called unqualified and with two arguments here;
# the visible imports only expose it as utility.getFilePath, which other cells
# call with (path1, path2, names). This looks like a NameError - confirm.
wordNames = getFilePath(paths, wordNames)
charNames = getFilePath(paths, charNames)

In [None]:
# Train the word-level and character-level stigma models; the result is the
# tuple returned by initializeEmoticonModels.
modelsStigma = initializeEmoticonModels(libraries, textStigma, cleanSchedule, trainSchedule, args, wordNames, charNames)

## Schizophrenia Stigma Dataset

This section concentrates on embedding models with and without emoticons. The model codes below can be found in the appendix
of the paper. The pipelines below take the data and perform all the NLP preprocessing according to the model being
constructed. For example, the model 'w2vWCBSchiz' does not have the letters 'Em' in its name and therefore uses m1Clean to remove the emoticons. The train schedule determines whether Skip-Gram or CBOW is used.

### Emoticons

This first set of models looks at all the data and removes emoticons accordingly.

In [24]:
# Eight models: four Word2Vec followed by four FastText. Within each group
# the order is CBOW/m1Clean, CBOW/m2Clean, SG/m1Clean, SG/m2Clean.
trainSchedule = [cbowTrain, cbowTrain, sgTrain, sgTrain] * 2
cleanSchedule = [m1Clean, m2Clean, m1Clean, m2Clean]* 2
libraries = [Word2Vec]*4 + [FastText] * 4

In [25]:
# Output names, in the same order as libraries/trainSchedule/cleanSchedule:
# w2v|ft prefix, W|C for word/character, CB|SG for the training method and an
# optional Em suffix for the m2Clean runs.
wordNames = ['w2vWCBSchiz', 'w2vWCBEmSchiz', 'w2vWSGSchiz', 'w2vWSGEmSchiz', 'ftWCBSchiz', 'ftWCBEmSchiz', 'ftWSGSchiz', 'ftWSGEmSchiz']
charNames = ['w2vCCBSchiz', 'w2vCCBEMSchiz', 'w2vCSGSchiz', 'w2vCSGEmSchiz', 'ftCCBSchiz', 'ftCCBEMSchiz', 'ftCSGSchiz', 'ftCSGEmSchiz']
path1 = ['embeddings/Word2Vec/schiz/']
path2 = ['embeddings/FastText/schiz/']
# utility.getFilePath lives in the scripts folder and is not visible here;
# presumably it prefixes path1 to the Word2Vec names and path2 to the FastText
# names - TODO confirm.
wordNames = utility.getFilePath(path1, path2, wordNames)
charNames = utility.getFilePath(path1, path2, charNames)
# Same value as the libraries list built in the previous cell.
libraries = [Word2Vec]*4 + [FastText]*4

In [None]:
# Train the word-level and character-level schizophrenia models with the
# shared emArgs hyper-parameters.
modelsSchiz = initializeEmoticonModels(libraries, textSchiz, cleanSchedule, trainSchedule, emArgs, wordNames, charNames)

In [38]:
#emWordNames = ['w2vemWCBSchiz', 'w2vemWCBEmSchiz', 'w2vemWSGSchiz', 'w2vemWSGEmSchiz', 'ftemWCBSchiz', 'ftemWCBEmSchiz', 'ftemWSGSchiz', 'ftemWSGEmSchiz']
#emCharNames = ['w2vemCCBSchizSchiz', 'w2vemCCBEMSchiz', 'w2vemCSGSchiz', 'w2vemCSGEmSchiz', 'ftemCCBSchizSchiz', 'ftemCCBEMSchiz', 'ftemCSGSchiz', 'ftemCSGEmSchiz']
#paths = ['embeddings/Word2Vec/schiz/emoticons/']*4 + ['embeddings/FastText/schiz/emoticons/']*4
#emWordNames = getFilePath(paths, emWordNames)
#emWharNames = getFilePath(paths, emCharNames)
#emojiTexts = getModelsTwo(textSchiz, UNICODE_EMOJI)
#modelsStigma = initializeEmoticonModels(libraries, textSchiz, cleanSchedule, trainSchedule, emArgs, emWordNames, emCharNames)

### Emoticon workaround for the NLTK problem: load the pre-tokenized data saved down by SpaceyMoji

Because the NLTK tokenizer does not tokenize all emoticons, I use SpaceyMoji. Unfortunately this is a Python 3 package, so it is run in a separate notebook, 08 SpaceyMoji, which saves down the tokenized data. Here I load the SpaceyMoji-tokenized data and create all the relevant embedding models, for both word and character tokens.

In [28]:
# Pre-tokenised word-level tweets: drop the first and last character of each
# stored entry and split what remains on ', ' to get a list of tokens.
fileEmoji = 'data/dataOut/schiz/emoji/nonAnnFinalSchizEmoji.csv'
textEmoji = pp.getFile(fileEmoji, 'Tweet')
textEmoji = textEmoji.apply(lambda raw: raw[1:-1].split(', '))

In [29]:
# Pre-tokenised character-level tweets, loaded the same way as the word-level
# file: drop the first and last character of each entry, then split on ', '.
fileEmojiChar = 'data/dataOut/schiz/emoji/nonAnnFinalSchizEmojiChar.csv'
textEmojiChar = pp.getFile(fileEmojiChar, 'Tweet')
textEmojiChar = textEmojiChar.apply(lambda x: x[1:len(x)-1])
# Assign back to textEmojiChar: the split result used to go to the misspelt
# name textEmojichar, so the character models were given the unsplit strings.
textEmojiChar = textEmojiChar.apply(lambda x: x.split(', '))

In [30]:
# Output names for the models trained on the pre-tokenised emoji data.
emWordNames = ['w2vemWCBSchiz', 'w2vemWCBEmSchiz', 'w2vemWSGSchiz', 'w2vemWSGEmSchiz', 'ftemWCBSchiz', 'ftemWCBEmSchiz', 'ftemWSGSchiz', 'ftemWSGEmSchiz']
# NOTE(review): 'w2vemCCBSchizSchiz' and 'ftemCCBSchizSchiz' repeat the
# 'Schiz' suffix, unlike the other names - confirm whether that is intended.
emCharNames = ['w2vemCCBSchizSchiz', 'w2vemCCBEMSchiz', 'w2vemCSGSchiz', 'w2vemCSGEmSchiz', 'ftemCCBSchizSchiz', 'ftemCCBEMSchiz', 'ftemCSGSchiz', 'ftemCSGEmSchiz']
path1 = ['embeddings/Word2Vec/schiz/02 emoticons2/']
path2 = ['embeddings/FastText/schiz/02 emoticons2/']
# m1Clean/m2Clean are rebound here without the 'Tokens' step. The rebinding
# stays in effect for any later cell that reads these names until they are
# assigned again.
m1Clean=['Lemma', 'Stopwords', 'Phrases', 'Emoticons']
m2Clean=['Lemma', 'Stopwords', 'Phrases']
cleanSchedule = [m1Clean, m2Clean, m1Clean, m2Clean]* 2
trainSchedule = [cbowTrain, cbowTrain, sgTrain, sgTrain] * 2
libraries = [Word2Vec]*4 + [FastText]*4
emWordNames = utility.getFilePath(path1, path2, emWordNames)
emCharNames = utility.getFilePath(path1, path2, emCharNames)

In [None]:
#WORD TOKENS
# Word-level models trained on the pre-tokenised emoji tweets (textEmoji).
wordIndicies, wordModels = getWordModels(libraries, textEmoji, cleanSchedule, trainSchedule, emWordNames, emArgs)

In [None]:
#CHARACTER TOKENS
# Character-level models trained on the character-tokenised emoji tweets
# (textEmojiChar).
charIndicies, charModels = getCharModels(libraries, textEmojiChar, cleanSchedule, trainSchedule, emCharNames, emArgs)

### Capitalized Words

This follows the same logic as above, except that we add the lowercase cleaning function for the models that should be entirely lowercase. The same pipeline is followed: NLP preprocessing, and then we train the embeddings. These are saved down in the lowercase folder.

In [49]:
# Eight output names per token type, one for each entry of the
# libraries/trainSchedule/cleanSchedule lists defined further down this cell.
# A missing comma used to fuse 'w2vCSGlSchiz' and 'ftCCBSchiz' into one string,
# leaving lCharNames with only seven entries.
lCharNames = ['w2vCCBSchiz', 'w2vCCBlSchiz', 'w2vCSGSchiz', 'w2vCSGlSchiz', 'ftCCBSchiz', 'ftCCBlSchiz', 'ftCSGSchiz', 'ftCSGlSchiz']
lWordNames = ['w2vWCBSchiz','w2vWCBlSchiz', 'w2vWSGSchiz', 'w2vWSGlSchiz', 'ftWCBSchiz','ftWCBlSchiz', 'ftWSGSchiz', 'ftWSGlSchiz']
# Output folders for the lowercase experiments.
path1 = ['embeddings/Word2Vec/schiz/01 lowercase/all/']
path2 = ['embeddings/FastText/schiz/01 lowercase/all/']
lWordNames = utility.getFilePath(path1, path2, lWordNames)
lCharNames = utility.getFilePath(path1, path2, lCharNames)
# Four Word2Vec then four FastText models; within each group the order is
# CBOW/m1LClean, CBOW/m2LClean, SG/m1LClean, SG/m2LClean.
libraries = [Word2Vec]*4 + [FastText]*4
trainSchedule = [cbowTrain, cbowTrain, sgTrain, sgTrain] * 2
cleanSchedule = [m1LClean, m2LClean, m1LClean, m2LClean] * 2

In [None]:
# Train the word-level and character-level models for the capitalisation
# experiment on all schizophrenia tweets.
models = initializeLowerModels(libraries, textSchiz, cleanSchedule, trainSchedule, emArgs, lWordNames, lCharNames)

Here we train embedding models using only tweets that contain lowercase words. Because this left only a small number of tweets (ca. 3000), it was decided not to include this in the paper: the size of the dataset became the overwhelming factor in classification performance.

In [None]:
# Names for the lowercase-only experiment. They are not passed through
# utility.getFilePath here, so the models are saved under these bare names.
lCharNames = ['w2vlCCBSchiz', 'w2vlCCBlSchiz', 'w2vlCSGSchiz', 'w2vlCSGlSchiz', 'ftlCCBSchiz', 'ftlCCBlSchiz', 'ftlCSGSchiz', 'ftlCSGlSchiz']
lWordNames = ['w2vlWCBSchiz', 'w2vlWCBlSchiz', 'w2vlWSGSchiz', 'w2vlWSGlSchiz', 'ftlWCBSchiz', 'ftlWCBlSchiz', 'ftlWSGSchiz', 'ftlWSGlSchiz']
# NOTE(review): getModelsTwo is not defined in the visible notebook, and its
# result (lowerTexts) is never used - the call below trains on textSchiz.
# Confirm whether lowerTexts was meant to be passed instead.
lowerTexts = getModelsTwo(textSchiz)
models = initializeLowerModels(libraries, textSchiz, cleanSchedule, trainSchedule, emArgs, lWordNames, lCharNames)

## Combined schizophrenia and stigma - Not used in the final paper

Combined the two datasets for the embedding models to see if this increased the performance

In [None]:
# Models trained on the concatenated schizophrenia + stigma tweets (textAll).
# The names are bare (no utility.getFilePath call) and `libraries` is whatever
# an earlier cell last assigned.
emWordNames = ['w2vWCBAll', 'w2vWCBEmAll', 'w2vWSGAll', 'w2vWSGEmAll', 'ftWCBAll', 'ftWCBEmAll', 'ftWSGAll', 'ftWSGEmAll']
emCharNames = ['w2vCCBAll', 'w2vCCBEMAll', 'w2vCSGAll', 'w2vCSGEmAll', 'ftCCBAll', 'ftCCBEMAll', 'ftCSGAll', 'ftCSGEmAll']
trainSchedule = [cbowTrain, cbowTrain, sgTrain, sgTrain] * 2
# Every model uses m2Clean, including those whose name lacks the Em suffix.
cleanSchedule = [m2Clean, m2Clean, m2Clean, m2Clean] * 2
modelsStigma = initializeEmoticonModels(libraries, textAll, cleanSchedule, trainSchedule, emArgs, emWordNames, emCharNames)

In [0]:
#Get training data
trainText = map(initializePreProcessing, [text])
getIndexData = map(lambda f, x, y, z: f.getIndexData(x, y, z), emModels, trainText, [labels], wordIndicies)

In [0]:
#Check number of missing words
total = sum([1 for s in trainText[0] for tok in s])
missing = map(lambda f, x, y: f.countMissing(x, y), emModels, trainText, wordIndicies)
missing = map(lambda x, y: x/float(y), missing, [total]*len(missing))

### Size of Embedding

This was used to test how the size of the training data affects the embedding models' performance. This is in the final paper.

In [32]:
def getSizeEmbeddings(textSet, modelPath, libraries):
    '''
    Train word-level models once per entry of textSet.

    Each entry is wrapped in a pandas Series and passed to getWordModels
    together with the module-level cleanSchedule, trainSchedule and emArgs.
    The position of the entry in textSet is appended to every path in
    modelPath to form that run's output names. Nothing is returned.
    '''
    for position, subset in enumerate(textSet):
        corpus = pd.Series(subset)
        runNames = [prefix + str(position) for prefix in modelPath]
        getWordModels(libraries, corpus, cleanSchedule, trainSchedule, runNames, emArgs)

In [20]:
# Growing prefixes of the tweet list: the first 100 tweets, the first 200, and
# so on, with the last prefix covering the whole list.
textSchizLst = textSchiz.tolist()
schizSets = [textSchizLst[:stop] for stop in range(100, len(textSchizLst) + 100, 100)]

In [30]:
# Single-model configuration read by getSizeEmbeddings through the
# module-level names cleanSchedule/trainSchedule: skip-gram with m2Clean.
m2Clean=['Tokens', 'Lemma', 'Stopwords', 'Phrases']
trainSchedule = [sgTrain]
cleanSchedule = [m2Clean]
libraries = [Word2Vec]
# Path prefixes; getSizeEmbeddings appends the subset position to each.
w2vWordPath = ['embeddings/size/w2v/w2vWSGEmSchiz']
ftWordNames = ['embeddings/size/ft/ftWSGEmSchiz']
librariesFt = [FastText]

In [None]:
# FastText run only; the Word2Vec equivalent would pass w2vWordPath and
# libraries instead.
getSizeEmbeddings(schizSets, ftWordNames, librariesFt)

### ELMO - This is not in the project

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import time
import pickle
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
# NOTE(review): embeddingStigma is not defined in the visible notebook.
trainList = embeddingStigma.tolist()
# Load the ELMo v2 module from TensorFlow Hub with trainable weights.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [0]:
def getELMO(elmo, text):
    '''
    Return the 'elmo' output of the module for text, averaged over axis 1.

    elmo is called as elmo(text, signature='default', as_dict=True). A new
    tf.Session is opened for each call, the global variables and tables are
    initialised, and the reduced tensor is evaluated. The session is used as
    a context manager so it is closed even if one of the run() calls raises.
    '''
    embeddings = elmo(text, signature='default', as_dict=True)['elmo']

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        embedding = sess.run(tf.reduce_mean(embeddings,1))

    return embedding

In [0]:
# Wrap every sentence in its own single-item list and embed each one with a
# separate getELMO call (which opens one session per call).
xTrain = [[sent] for sent in trainList]
elmoEmbed = [getELMO(elmo, sent) for sent in xTrain]

In [0]:
elmoEmbed

In [0]:
# Persist the ELMo embeddings. The with-block closes the file even if
# pickle.dump raises, which the explicit close() call did not guarantee.
with open("elmo_train_03032019.pickle","wb") as pickle_out:
    pickle.dump(elmoEmbed, pickle_out)