This notebook contains main code for binary classification using SVM, RandomForest, GradientBoost and KNN
using saved down embedding models. Makes use of ml, evaluation, preprocessing, feature_engineering and utlity scripts. Again we set up a pipeline so we can train multiple models at once and use similar logic to 03 Embeddings notebook with NLP preprocessing happening first.

In [1]:
import sys
sys.path.insert(0, 'scripts/')

In [2]:
import pandas as pd
import numpy as np
from emoji import UNICODE_EMOJI

In [3]:
import utility
import evaluation as ev
import ml
import preprocessing as pp
import ml_config as mlc
import feature_engineering as fe
import functools

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gregoryverghese/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gregoryverghese/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gregoryverghese/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gregoryverghese/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
'''
Set global parameters and file paths
'''
trials = 1
cbowTrain = 0
sgTrain = 1
wordToken = False
charToken = True
m1EClean=['Tokens', 'Lemma', 'Stopwords', 'Phrases', 'Emoticons']
m2EClean=['Tokens', 'Lemma', 'Stopwords', 'Phrases']
m1LClean=['Tokens', 'Lemma', 'Stopwords', 'Phrases', 'Lowercase']
m2LClean=['Tokens', 'Lemma', 'Stopwords', 'Phrases']
gloveInFileName = 'data/glove/glove.twitter.27B.'
gloveOutFileName = 'gensim_glove_vectors_'
gloveDim = ['25d.txt', '50d.txt', '100d.txt', '200d.txt']
fileNameStig = 'data/dataOut/stigma/annFinalStig.csv'
fileNameSchiz1 = 'data/dataOut/schiz/annFinalSchiz_1.csv'
fileNameSchiz2 = 'data/dataOut/schiz/annFinalSchiz_2.csv'
abbreviations = pd.read_csv('data/other/abbreviations.csv')['Abbreviation'].tolist()
abbreviations = [str(a).strip() for a in abbreviations]

In [6]:
'''
Read in three datasets. 1. Stigma data 2. 1000 labelled schizophrenia data 3. 500 labelled schizophrenia data
'''
socialDf = pd.read_csv(fileNameStig, encoding='utf-8')
textStigma = socialDf['Tweet']
labelsStigma = socialDf['Classification']

socialDf = pd.read_csv(fileNameSchiz1, encoding='utf-8')
textSchiz1 = socialDf['Tweet']
labelsSchiz1 = socialDf['Classification']

socialDf = pd.read_csv(fileNameSchiz2, encoding='utf-8')
textSchiz2 = socialDf['Tweet']
labelsSchiz2 = socialDf['Classification']

In [7]:
'''
Parameter settings stored in ml_config file. Parameters have 
been tuned using RandomSearch from Scikit learn package
'''
rfRandomGrid = mlc.rfRandomGrid
gbRandomGrid = mlc.gbRandomGrid
knnRandomGrid = mlc.knnRandomGrid
svmRandomGrid = mlc.svmRandomGrid

wordParameters = mlc.wordParameters
charParameters = mlc.charParameters

randomGrids = [rfRandomGrid, gbRandomGrid, svmRandomGrid, knnRandomGrid]
methods = ['RandomForest', 'GradientBoost', 'SVM', 'KNN']

In [8]:
a = Word2Vec.load('embeddings/Word2Vec/stigma/w2vWCBEmStig')

In [11]:
v = a.wv.vocab.items()
len(v)

5962

In [18]:
import nltk
b = textSchiz2.apply(lambda x: nltk.word_tokenize(x))
c = b.tolist()
x = [t for s in c for t in s if t not in v]

In [19]:
len(x)

11900

In [9]:
'''
calls preprocessing class in pp script
'''
def initializePreProcessing(text, tokenType, cleanMethods=['Tokens', 'Lemma', 'Stopwords', 'Phrases', 'Emoticons']):

    social = pp.SocialPreProcessing(text, tokenType)
    socialClean = social.clean(cleanMethods)
    
    return socialClean


'''
calculates average embedding vector for a sentence/post
'''
def getAvgEmbeddings(sentence, model):
    
    totalEmbedding = sum([model[tok] for tok in sentence if tok in model])
    avgEmbedding = totalEmbedding/float(len(sentence))
    
    return np.array(avgEmbedding)


'''
gets embedding data for all posts
'''
def getEmbeddingData(model, posts):
    
    numFeatures = model.vector_size
    postFeatureVec = np.zeros((len(posts), numFeatures), dtype='float32')
    
    for i in range(len(posts)):
        postFeatureVec[i] = getAvgEmbeddings(posts[i], model)
    
    return postFeatureVec


'''
initializes baseML class from ml script and calls passed 
classifiers with corresponding parameters
'''
def initializeML(features, labels, methods, parameters):
    
    mLearn = ml.baseML(features, labels)
    mLearn.classifiers = [getattr(mLearn,'get'+f)(p) for f, p in zip(methods, parameters)]
    return mLearn.getAllPredictions(methods, parameters)


'''
trains tf-idf matrix on train/test data and
concatenates with corresponding embedding data

'''
def initializeFeatures(text, xTrainSets):
    
    text = text.to_frame(name='tweets')
    fEng = fe.FeatureEngineering(text)
    fBoost = fEng.getFeatures('tweets')
    fBoost = fBoost.drop(columns=['tweets'])
    
    tfidfVect = TfidfVectorizer(encoding='utf-8', lowercase=False, stop_words='english', analyzer='word')
    tfidf = tfidfVect.fit(text['tweets'])
    tfidf = tfidf.transform(text['tweets'])
    tfidfDf = pd.DataFrame(tfidf.toarray(), columns=tfidfVect.get_feature_names())
    features = pd.concat([fBoost.reset_index(drop=True), tfidfDf], axis=1)
    
    xTrainSets = [np.concatenate((features.to_numpy(), x), axis=1) for x in xTrainSets]
    
    return xTrainSets


'''
get classification for each of the passed embedding models
'''
def getMLresult(methods, parameters, text, labels, tokenTypes, cleanSchedule, modelFileNames=None, models=None, featureBoost=False):
     
    print(wordToken)
        
    modelNum = len(cleanSchedule)
    
    if models == None:
        models = map(Word2Vec.load, modelFileNames)
       
    trainTexts = map(initializePreProcessing, [text]*modelNum, tokenTypes, cleanSchedule)
    
    xTrainSets = map(lambda x, y: getEmbeddingData(x, y.tolist()), models, trainTexts)
    
    if featureBoost:
        f = initializeFeatures(text, xTrainSets)
        
    mlResults = map(initializeML, xTrainSets, [labels]*modelNum, [methods]*modelNum, parameters)
    
    return mlResults


'''
begins parameter tuning for each of the passed classifiers
'''
def initializeTuning(methods, randomGrids, features, labels):
    
    parameters = [{}]*len(methods)
    
    mLearn = ml.baseML(features, labels)
    mLearn.classifiers = [getattr(mLearn,'get'+f)(p) for f, p in zip(methods, parameters)]
    
    parameters = mLearn.getHyperParams(randomGrids)
    
    return parameters


'''
sets up models and data ready for parameter tuning. If featureBoost,
then concatenate tf-idf with embedding data
'''
def getMLParameters(methods, randomGrids, features, labels, tokenTypes, cleanSchedule, modelNames, models, featureBoost):

    modelNum = len(cleanSchedule)
    
    if models == None:
        models = map(Word2Vec.load, modelNames)
    
    trainTexts = map(initializePreProcessing, [features]*modelNum, tokenTypes, cleanSchedule)
    xTrainSets = map(lambda x, y: getEmbeddingData(x, y.tolist()), models, trainTexts)
    
    
    if featureBoost:
        xTrainSets = initializeFeatures(text, xTrainSets)
        
    mlParams = map(initializeTuning, [methods]*modelNum, [randomGrids]*modelNum, xTrainSets, [labels]*modelNum)
    
    return mlParams


'''
get best parameters from tuning process then pass to getMLResults
with test data for classificaton
'''
def getClassification(trainText, trainLabels, testText, testLabels, randomGrids, cleanSchedule, token, methods, modelNames=None, models=None, featureBoost=False):
    
    tokenTypes = utility.getTokenTypes(token, emWordModels)
    tunedParams = getMLParameters(methods, randomGrids, trainText, trainLabels, tokenTypes, cleanSchedule, modelNames, models, featureBoost)
    parameters = [[p.best_params_ for p in m] for m in tunedParams]
    results = getMLresults(methods, parameters, testText, testLabels, tokenTypes, cleanSchedule, modelNames, models, featureBoost)
    
    return (tunedParams, results) 

'''
get ML results for n trials. Save down average results in csv
'''
def getResults(modelPaths, modelNames, token, cleanSchedule, parametersLst, textSchiz2, labelsSchiz2, resultsPath, featureBoost=None):
    
    tokenTypes = utility.getTokenTypes(token, modelNames)
    
    x = functools.partial(getMLresult, methods, parametersLst, textSchiz2, labelsSchiz2, tokenTypes, cleanSchedule, modelPaths, featureBoost=False)
    resultsSchiz = [x() for i in range(trials)]
    
    getEvalTrials(resultsSchiz, modelNames, resultsPath)

In [10]:
'''
return filePath
'''
def getFilePath(paths, names):
    return [p + n for p, n in zip(paths, names)]

'''
get for each metric for model across the number of trials
'''
def getAverage(results, metric):

    classifiers = zip(*results)
    average = [np.array([trial[metric].mean() for trial in clf]).mean() for clf in classifiers]

    return average


'''
get average for each metric for each model and save each model 
down in a separate csv file.
'''
def getEvalTrials(results, names, path):

    results = zip(*results)
    

    for m in range(len(results)):
        accuracy = getAverage(results[m], 'test_accuracy')
        precision = getAverage(results[m], 'test_precision')
        recall = getAverage(results[m], 'test_recall')
        f1 = getAverage(results[m], 'test_f1')
        roc_auc = getAverage(results[m], 'test_roc_auc')

        evalDict = {'accuracy':accuracy, 'precision':precision, 'recall':recall, 'f1':f1, 'roc_auc':roc_auc}
        evalDf = pd.DataFrame(evalDict, index=methods)

        evalDf.to_csv(path + names[m] + '.csv')

## Stigma Mental Health - Not used in this paper

In [11]:
modelNames = ['w2vWCBStig', 'w2vWCBEmStig', 'ftWSGStig', 'ftWSGEmStig']
path1 = ['embeddings/Word2Vec/stigma/']
path2 = ['embeddings/FastText/stigma/'] 
models = utility.getFilePath(path1, path2, modelNames)
cleanSchedule = [m2EClean, m2EClean] *2
tokenTypes = [False, False, False, False]
xTrain, xTest, yTrain, yTest = train_test_split(textStigma, labelsStigma, test_size=0.3, random_state=42)
wordParametersLst = utility.getParameters(wordParameters, models)

In [72]:
#modelStig = getClassification(xTrain, yTrain, xTest, yTest, randomGrids, cleanSchedule, tokenTypes, methods,  modelNames)
#bestParamsStig = getBestParameters(modelStig)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


In [None]:
resultsStig = getMLresult(methods, wordParametersLst, xTest, yTest, tokenTypes, cleanSchedule, modelNames)

## Schizophrenia stigma data

Below we perform the classification task for each of the models. We do it for word data, and char data. Word2Vec 
and FastText do it in the same pipeline. We perform word classification for Glove separately

### Emoticon Models - All

Lets consider models with all data and emoticons stripped out.

In [12]:
cleanSchedule = [m2EClean, m2EClean]
resultsPath = 'results/neural embeddings/all/'
path1 = ['embeddings/Word2Vec/schiz/00 emoticons/all/']
path2 = ['embeddings/FastText/schiz/00 emoticons/all/']

In [20]:
'''
Schizophrenia models, focus on emoticons 1.Word 2.Character 3.Glove
'''

'''
get word models
'''
def getSchizWordData(cleanSchedule, resultsPath, path1, path2):

    #emWordModels = ['w2vWCBSchiz', 'w2vWCBEmSchiz', 'w2vWSGSchiz', 'w2vWSGEmSchiz', 'ftWCBSchiz', 'ftWCBEmSchiz', 'ftWSGSchiz', 'ftWSGEmSchiz']
    emWordModels = ['w2vWSGEmSchiz', 'ftWSGEmSchiz']
    emWordPaths = utility.getFilePath(path1, path2, emWordModels)
    wordParametersLst = utility.getParameters(wordParameters, emWordPaths)
    getResults(emWordPaths, emWordModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2, resultsPath)

'''
get character models
'''  
def getSchizCharData(cleanSchedule, resultsPath, path1, path2):

    emCharModels = ['w2vCCBSchiz', 'w2vCCBEMSchiz', 'w2vCSGSchiz', 'w2vCSGEmSchiz', 'ftCCBSchiz', 'ftCCBEMSchiz', 'ftCSGSchiz', 'ftCSGEmSchiz']
    emCharPaths = utility.getFilePath(path1, path2, emCharModels)
    charParametersLst = utility.getParameters(charParameters, emCharPaths)
    getResults(emCharPaths, emCharModels, charToken, cleanSchedule, 
                                        charParametersLst, textSchiz2, labelsSchiz2, resultsPath)

'''
get Glove models
'''
def getSchizGlove(cleanSchedule, resultsPath, path1, path2):
    
    gloveModels = ['glove200']
    cleanSchedule = [m2EClean]
    gloveModels = [KeyedVectors.load_word2vec_format('embeddings/glove/glove200', binary=False)]
    gloveParametersLst = utility.getParameters(wordParameters, GloveModels)
    getResults(emCharPaths, gloveModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2, resultsPath)

In [None]:
getSchizWordData(cleanSchedule, resultsPath, path1, path2)

In [None]:
getSchizCharData(cleanSchedule, resultsPath, path1, path2)

In [None]:
getSchizGlove(cleanSchedule, resultsPath, path1, path2)

### Emoji WorkAround due to NLTK tokenizer problem (see README.txt)

In the same way as '03 Embeddings' we use the Spacey Package as a workaround the NLTK emoji tokenize issue. First we have to load in the csv file that has been tokenized by Spacey and then convert it into a tractable form that captures this tokenization. Note in the M1EClean list we do not have the Tokenize cleaning command becasue it is already tokenized

In [34]:
fileEmoji = 'data/dataOut/schiz/emoji/annSchiz2.csv'
textEmoji = pp.getFile(fileEmoji, 'Tweet')
textEmoji = textEmoji.apply(lambda x: x[1:len(x)-1])
textEmoji = textEmoji.apply(lambda x: x.split(', '))

In [35]:
tEmoji = textEmoji.tolist()
tEmoji = [[tok[1:len(tok)-1] for tok in sent] for sent in tEmoji]
textEmoji = pd.Series(tEmoji)

In [27]:
fileEmojiChar = 'data/dataOut/schiz/emoji/annSchizChar2.csv'
textEmojiChar = pp.getFile(fileEmojiChar, 'Tweet')
textEmojiChar = textEmojiChar.apply(lambda x: x[1:len(x)-1])
textEmojichar = textEmojiChar.apply(lambda x: x.split(', '))

In [28]:
m1EClean=['Lemma', 'Stopwords', 'Phrases', 'Emoticons']
m2EClean=['Lemma', 'Stopwords', 'Phrases']
cleanSchedule = [m1EClean, m2EClean, m1EClean, m2EClean]*2
resultsPath = 'results/neural embeddings/emojiAll/'
path1 = ['embeddings/Word2Vec/schiz/02 emoticons2/']
path2 = ['embeddings/FastText/schiz/02 emoticons2/']

In [29]:
'''
get word models
'''

def getSchizWordData2(text, labels, cleanSchedule, resultsPath, path1, path2):

    emWordModels = ['w2vemWCBSchiz', 'w2vemWCBEmSchiz', 'w2vemWSGSchiz', 'w2vemWSGEmSchiz', 'ftemWCBSchiz', 'ftemWCBEmSchiz', 'ftemWSGSchiz', 'ftemWSGEmSchiz']
    emWordPaths = utility.getFilePath(path1, path2, emWordModels)
    wordParametersLst = utility.getParameters(wordParameters, emWordPaths)
    getResults(emWordPaths, emWordModels, wordToken, cleanSchedule, 
                                        wordParametersLst, text, labels, resultsPath)

 '''
 get character models
 '''   
def getSchizCharData2(text, labels, cleanSchedule, resultsPath, path1, path2):

    emCharModels = ['w2vemCCBSchiz', 'w2vemCCBEMSchiz', 'w2vemCSGSchiz', 'w2vemCSGEmSchiz', 'ftemCCBSchiz', 'ftemCCBEMSchiz', 'ftemCSGSchiz', 'ftemCSGEmSchiz']
    emCharPaths = utility.getFilePath(path1, path2, emCharModels)
    charParametersLst = utility.getParameters(charParameters, emCharPaths)
    getResults(emCharPaths, emCharModels, charToken, cleanSchedule, 
                                        charParametersLst, text, labelsSchiz2, resultsPath)

In [None]:
getSchizWordData2(textEmoji, labelsSchiz2, cleanSchedule, resultsPath, path1, path2)

In [None]:
getSchizCharData2(textEmoji, labelsSchiz2, cleanSchedule, resultsPath, path1, path2)

In [None]:
getSchizGlove2(cleanSchedule, resultsPath, path1, path2)

For this next part we only consider tweets containing the emoticons or capitalized word in the classification, this is a much smaller number. 56 for emoticons and ca. 3000 for capitalixed words.

In [75]:
import nltk

'''
return emoticon only tweets
'''
def getEmoticonText(tokens, labels, emojis):

    emojiLines = list(map(lambda x: any(i in emojis for i in x), tokens))
    
    labels = np.array(labels)
    tokens = np.array(tokens)
    
    emojiLines = np.array(emojiLines)
    
    tokens = tokens[emojiLines]
    labels = labels[emojiLines]

    return tokens, labels


'''
return tweets only containiong capitalized words. Use the self compiled abbreivations list
'''
def getLowerText(tokens, labels):

    lowerLines = list(map(lambda x: any(i.isupper() and len(i) > 1
                                      and i != 'RT' and i not in abbreviations for i in x), tokens))
    
    print(lowerLines)
    
    tokens = np.array(tokens)
    labels = np.array(labels)
    
    lowerLines = np.array(lowerLines)
    
    tokens = tokens[lowerLines]
    labels = labels[lowerLines]

    return tokens, labels


'''
get either emoticon tweets or lowercase tweets depending
on emojis argument
'''
def getModelsTwo(embeddingText, labels, emojis=None):

    tokens = embeddingText.apply(lambda x: nltk.word_tokenize(x))
    tokens = tokens.tolist()
    
    if emojis == None:
        tokens, labels = getLowerText(tokens, labels) 
    else:
        tokens, labels = getEmoticonText(tokens, emojis)
     
    text = pd.Series(tokens)
    text = text.apply(lambda x: ' '.join(x))
    labels = pd.Series(labels)

    return text, labels

In [66]:
tokens, labels = getEmoticonText(tEmoji, labelsSchiz2, UNICODE_EMOJI)
textEmoji = pd.Series(tokens)
labels = pd.Series(labels)

In [219]:
m1EClean=['Lemma', 'Stopwords', 'Phrases', 'Emoticons']
m2EClean=['Lemma', 'Stopwords', 'Phrases']
cleanSchedule = [m1EClean, m2EClean, m1EClean, m2EClean]*2
resultsPath = 'results/neural embeddings/emojiAll/'
path1 = ['embeddings/Word2Vec/schiz/02 emoticons2/']
path2 = ['embeddings/FastText/schiz/02 emoticons2/']
resultsPath = 'results/neural embeddings/emoji/'

In [None]:
getSchizWordData2(textEmoji, labels, cleanSchedule, resultsPath, path1, path2)

### Emoticon Models - emoticon only tweets

In [None]:
#Looking at just emoticon tweets
path1 = ['embeddings/Word2Vec/schiz/02 emoticons2/']
path2 = ['embeddings/FastText/schiz/02 emoticons2/']
resultsPath = 'results/neural embeddings/emoji/'
cleanSchedule = [m1EClean, m2EClean] * 2

In [None]:
'''
Schizophrenia models with only tweets containing emoticons 1.Word 2.Character 3.Glove
'''

def getSchizEmojiWordData(cleanSchedule, resultsPath, path1, path2):

    wordParametersLst = getParameters(wordParameters, emWordModels)
    emWordModels = ['w2vemWCBSchiz', 'w2vemWCBEmSchiz', 'ftemWCBSchiz', 'ftemWCBEmSchiz']
    emWordPaths = getFilePath(path1, path2, emWordModels)
    wordParametersLst = getParameters(wordParameters, emWordPaths)
    getSchizResults(emWordPaths, emWordModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2)
    
def getSchizEmojiCharData(cleanSchedule, resultsPath, path1, path2):

    charParametersLst = getParameters(charParameters, emWordModels)
    emCharModels = ['w2vemCCBSchiz', 'w2vemCCBEMSchiz', 'ftemCCBSchiz', 'ftemCCBEMSchiz']
    emCharPaths = getFilePath(path1, path2, emCharModels)
    charParametersLst = getParameters(charParameters, emCharPaths)
    getSchizResults(emCharPaths, emCharModels, charToken, cleanSchedule, 
                                        charParametersLst, textSchiz2, labelsSchiz2)

def getSchizEmojiGlove(cleanSchedule, resultsPath, path1, path2):
    
    gloveModels = ['glove200']
    cleanSchedule = [m2EClean]
    gloveModels = [KeyedVectors.load_word2vec_format('embeddings/glove/glove200', binary=False)]
    gloveParametersLst = getParameters(wordParameters, GloveModels)
    getSchizResults(emCharPaths, gloveModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2)

In [None]:
getSchizEmojiWordData(cleanSchedule, resultsPath, path1, path2)

In [None]:
getSchizEmojiCharData(cleanSchedule, resultsPath, path1, path2)

In [None]:
getSchizEmojiGlove(cleanSchedule, resultsPath, path1, path2)

### Capitalized Models - all

Here is the code for capitalized models. Start with classification on all data using the capitalized embedding models
then we look at just the subset of tweets which contains capitalized words.

In [36]:
path1 = ['embeddings/Word2Vec/schiz/01 lowercase/all/']
path2 = ['embeddings/FastText/schiz/01 lowercase/all/']
cleanSchedule = [m2LClean, m1LClean, m2LClean, m1LClean]
resultsPath = 'results/neural embeddings/lower/'

In [39]:
'''
Schizophrenia models, focus on capitalized words 1.Word 2.Character 3.Glove
'''

'''
word models
'''
def getSchizLowerWordData(cleanSchedule, resultsPath, path1, path2):

    lWordModels = ['w2vWCBlSchiz', 'w2vWCBSchiz', 'w2vWSGlSchiz', 'w2vWSGSchiz']
    lWordPaths = utility.getFilePath(path1, path2, lWordModels)
    wordParametersLst = utility.getParameters(wordParameters, lWordModels)
    getResults(lWordPaths, lWordModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2, resultsPath)

'''
char models
'''
def getSchizLowerCharData(cleanSchedule, resultsPath, path1, path2):

    lCharModels = ['w2vWCBlSchiz', 'w2vWCBSchiz', 'w2vWSGlSchiz', 'w2vWSGSchiz']
    lCharPaths = utility.getFilePath(path1, path2, lWordModels)
    charParametersLst = utility.getParameters(charParameters, lCharModels)
    getResults(lCharPaths, lCharModels, charToken, cleanSchedule, 
                                        charParametersLst, textSchiz2, labelsSchiz2, resultsPath)

'''
glove models
'''
def getSchizLowerGlove(cleanSchedule, resultsPath, path1, path2):
    
    gloveModels = ['glove200']
    cleanSchedule = [m2EClean]
    gloveModels = [KeyedVectors.load_word2vec_format('embeddings/glove/glove200', binary=False)]
    gloveParametersLst = getParameters(wordParameters, GloveModels)
    getSchizResults(emCharPaths, gloveModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2)

### Capitalized Models - lowercase tweets only

In [None]:
#Looking at just lowercase tweets
lOnlyWordModels = ['w2vlWCBlSchiz', 'w2vlWCBSchiz', 'w2vlWSGlSchiz', 'w2vlWSGSchiz', 'ftlWCBlSchiz', 'ftlWCBSchiz', 'ftlWSGlSchiz', 'ftlWSGSchiz']
lOnlyCharModes = ['']
path1 = ['embeddings/Word2Vec/schiz/01 lowercase/lowerOnly/']
path2 = ['embeddings/FastText/schiz/01 lowercase/lowerOnly/']
lOnlyWordModels = utility.getFilePath(path1, path2, lWordModels)
lOnlyCharModels  utility.getFilePath(path1, path2, lCharModels)
cleanSchedule = [m2LClean, m1LClean, m2LClean, m1LClean]*2

In [1]:
'''
Schizophrenia models with only tweets containing sequence of capitalized words 1.Word 2.Character 3.Glove
'''
def getSchizLowerData(cleanSchedule, resultsPath, path1, path2):

    lOnlyWordModels = ['w2vlWCBlSchiz', 'w2vlWCBSchiz', 'w2vlWSGlSchiz', 'w2vlWSGSchiz', 'ftlWCBlSchiz', 'ftlWCBSchiz', 'ftlWSGlSchiz', 'ftlWSGSchiz']
    
    wordParametersLst = getParameters(wordParameters, lOnlyWordModels)
    lWordModels = getFilePath(path1, path2, lWordModels)
    wordParametersLst = getParameters(wordParameters, emWordPaths)
    getSchizResults(emWordPaths, emWordModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2)
    
def getSchizLowerData(cleanSchedule, resultsPath, path1, path2):

    charParametersLst = getParameters(charParameters, emWordModels)
    emCharPaths = getFilePath(path1, path2, emCharModels)
    charParametersLst = getParameters(charParameters, emCharPaths)
    getSchizResults(emCharPaths, emCharModels, charToken, cleanSchedule, 
                                        charParametersLst, textSchiz2, labelsSchiz2)

def getSchizLowerGlove(cleanSchedule, resultsPath, path1, path2):
    
    gloveModels = ['glove200']
    cleanSchedule = [m2EClean]
    gloveModels = [KeyedVectors.load_word2vec_format('embeddings/glove/glove200', binary=False)]
    gloveParametersLst = getParameters(wordParameters, GloveModels)
    getSchizResults(emCharPaths, gloveModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2)

In [None]:
path1 = ['embeddings/Word2Vec/schiz/01 lowercase/all/']
path2 = ['embeddings/FastText/schiz/01 lowercase/all/']
cleanSchedule = [m2LClean, m1LClean, m2LClean, m1LClean]
resultsPath = 'results/neural embeddings/lower/'
textSchiz = [[s] for s in textSchiz2.tolist()]
lowerText = getLowerText(textSchiz2, labelsSchiz2)

In [65]:
'''
word model
'''
def getSchizLowerData(cleanSchedule, resultsPath, path1, path2):

    lOnlyWordModels = ['w2vlWCBlSchiz', 'w2vlWCBSchiz', 'w2vlWSGlSchiz', 'w2vlWSGSchiz', 'ftlWCBlSchiz', 'ftlWCBSchiz', 'ftlWSGlSchiz', 'ftlWSGSchiz']
    
    wordParametersLst = getParameters(wordParameters, lOnlyWordModels)
    lWordModels = getFilePath(path1, path2, lWordModels)
    wordParametersLst = getParameters(wordParameters, emWordPaths)
    getSchizResults(emWordPaths, emWordModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2)
    
'''
lower model
'''  
def getSchizLowerGlove(cleanSchedule, resultsPath, path1, path2):
    
    lOnlyWordModels = ['w2vlWCBlSchiz', 'w2vlWCBSchiz', 'w2vlWSGlSchiz', 'w2vlWSGSchiz', 'ftlWCBlSchiz', 'ftlWCBSchiz', 'ftlWSGlSchiz', 'ftlWSGSchiz']

    gloveModels = ['glove200']
    cleanSchedule = [m2EClean]
    gloveModels = [KeyedVectors.load_word2vec_format('embeddings/glove/glove200', binary=False)]
    gloveParametersLst = getParameters(wordParameters, GloveModels)
    getSchizResults(lOnlyWordModels, gloveModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2)

### Combined Datasets not used in the paper

In [9]:
emCombWordModels = ['w2vWCBAll', 'w2vWCBEmAll', 'w2vWSGAll', 'w2vWSGEmAll', 'ftWCBAll', 'ftWCBEmAll', 'ftWSGAll', 'ftWSGEmAll']
emCombCharModels = ['w2vCCBAll', 'w2vCCBEMAll', 'w2vCSGAll', 'w2vCSGEmAll', 'ftCCBAll', 'ftCCBEMAll', 'ftCSGAll', 'ftCSGEmAll']
path1 = ['embeddings/Word2Vec/combined/']
path2 = ['embeddings/FastText/combined/']
emCombWordModels = getFilePath(path1, path2, emCombWordModels)
emCombCharModels = getFilePath(path1, path2, emCombCharModels)
cleanSchedule = [m1EClean, m2EClean, m1EClean, m2EClean] * 2
charParametersLst = getParameters(charParameters, emCombCharModels)
wordParametersLst = getParameters(wordParameters, emCombWordModels)

In [None]:
tokenTypes = [False, False, False, False] * 2
modelCombWords = getMLresult(methods, wordParametersLst, textSchiz2, labelsSchiz2, tokenTypes, cleanSchedule, emCombWordModels)

In [None]:
tokenTypes = [True, True, True, True] * 2
charCombWords = getMLresults(methods, charParametersLst, textSchiz2, labelsSchiz2, tokenTypes, cleanSchedule, emCombCharModels)

### Combining embeddings with other our baseline set of features

Here we combine the embedding models with a tf-idf matrix and the descroptive featuers such as sentiment and run them through the classifiers.

In [22]:
path1 = ['embeddings/Word2Vec/schiz/00 emoticons/all/']
path2 = ['embeddings/FastText/schiz/00 emoticons/all/']
cleanSchedule = [m1EClean, m2EClean, m1EClean, m2EClean] * 2
resultsPath = 'results/neural embeddings/combined/'

In [27]:
'''
word model
'''

def getSchizCombWordData(cleanSchedule, resultsPath, path1, path2):

    emCombWordModels = ['w2vWCBSchiz', 'w2vWCBEmSchiz', 'w2vWSGSchiz', 'w2vWSGEmSchiz', 'ftWCBSchiz', 'ftWCBEmSchiz', 'ftWSGSchiz', 'ftWSGEmSchiz']
    emWordCombPaths = utility.getFilePath(path1, path2, emCombWordModels)
    wordParametersLst = utility.getParameters(wordParameters, emCombWordModels)
    getResults(emWordCombPaths, emCombWordModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2, resultsPath, featureBoost=True)
 '''
Character data
 '''   
def getSchizCombCharData(cleanSchedule, resultsPath, path1, path2):

    emCombCharModels = ['w2vCCBSchiz', 'w2vCCBEMSchiz', 'w2vCSGSchiz', 'w2vCSGEmSchiz', 'ftCCBSchiz', 'ftCCBEMSchiz', 'ftCSGSchiz', 'ftCSGEmSchiz']
    emCombCharPaths = utility.getFilePath(path1, path2, emCombCharModels)
    charParametersLst = utility.getParameters(charParameters, emCombCharModels)
    getResults(emCombCharPaths, emCombCharModels, charToken, cleanSchedule, 
                                        charParametersLst, textSchiz2, labelsSchiz2, resultsPath, featureBoost=True)
'''
Glove data
'''
def getSchizCombGlove(cleanSchedule, resultsPath, path1, path2):
    
    gloveModels = ['glove200']
    cleanSchedule = [m2EClean]
    gloveModels = [KeyedVectors.load_word2vec_format('embeddings/glove/glove200', binary=False)]
    gloveParametersLst = getParameters(wordParameters, GloveModels)
    getSchizResults(emCharPaths, gloveModels, wordToken, cleanSchedule, 
                                        wordParametersLst, textSchiz2, labelsSchiz2)

In [None]:
getSchizCombWordData(cleanSchedule, resultsPath, path1, path2)

In [None]:
getSchizCombCharData(cleanSchedule, resultsPath, path1, path2)

In [None]:
getSchizCombGlove(cleanSchedule, resultsPath, path1, path2)

# Different size Embeddings

In [82]:
m2EClean = ['Lemma', 'Stopwords', 'Phrases']
cleanSchedule = [m2EClean]
resultsPath = 'results/neural embeddings/size/ft/'
x = pd.read_csv('data/dataIn/schiz/nonAnnFinalSchiz.csv')
fileNum = len(x)
modelW2v = 'w2vWSGEmSchiz'
modelPathW2v = 'embeddings/size/w2v/w2vWSGEmSchiz'
modelFt = 'ftWSGEmSchiz'
modelPathFt = 'embeddings/size/ft/ftWSGEmSchiz'

In [83]:
'''
get the word classification results
'''
def getWordData(modelPaths, models, text, labels, cleanSchedule, resultsPath):

    wordParametersLst = utility.getParameters(wordParameters, models)
    getResults(modelPaths, models, wordToken, cleanSchedule, 
                                        wordParametersLst, text, labels, resultsPath)
    
def getSizeEmbeddings(textSet, modelPath, libraries):

    for i in range(len(textSet)):
        text = textSet[i]
        text = pd.Series(text)
        models = [w+str(i) for w in modelPath]
        x, y = getWordModels(libraries, text, cleanSchedule, trainSchedule, models, emArgs)

In [None]:
'''
loop through different sized w2v models
'''

for i in range(fileNum):
    modelPaths = [modelPathW2v+str(i)]
    models = [modelW2v+str(i)]
    getWordData(modelPaths, models, textSchiz2, labelsSchiz2, cleanSchedule, resultsPath)

In [None]:
'''
loop through for ft different sized models
'''

for i in range(fileNum):
    modelPaths = [modelPathFt+str(i)]
    models = [modelFt+str(i)]
    getWordData(modelPaths, models, textEmoji, labelsSchiz2, cleanSchedule, resultsPath)

## Stacked Embeddings - This is not used in the final paper

In [151]:
def stackEmbeddings():
    embeddingMatrix = np.concatenate((embeddingsSet), axis=1) 
    return embeddingMatrix

def getPCA(embedding):
    pca = PCA(n_components=200)
    pComponents = pca.fit_transform(embedding)
    return pComponents

In [95]:
testw2v = Word2Vec.load('embeddings/Word2Vec/schiz/00 emoticons/all/w2vCCBSchiz')

In [96]:
testFt = Word2Vec.load('embeddings/FastText/schiz/00 emoticons/all/ftCCBSchiz')

In [97]:
testGlove = KeyedVectors.load_word2vec_format('embeddings/glove/glove200', binary=False)

In [98]:
tokensSchiz1 = initializePreProcessing(textSchiz1, False, cleanMethods=['Tokens', 'Lemma', 'Stopwords', 'Phrases', 'Emoticons'])

Tokens
Lemma
Stopwords
Phrases
Emoticons


In [153]:
models = [testFt, testGlove, testw2v]
embeddings = [getEmbeddingData(m, tokensSchiz1) for m in models]

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
