In [1]:
import pandas as pd
import numpy as np
import datetime 
import re
from urlextract import URLExtract
import spacy
import time
import pickle
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim


### Preprocessing and document tokenization

In [3]:
# remove all urls from posts

extractor = URLExtract()
def replace_urls(x, url_extractor=None):
    """Return ``x`` with every URL reported by the extractor removed.

    The first reported URL is stripped (all of its occurrences), then the
    remaining text is scanned again, until the extractor finds nothing.

    Parameters
    ----------
    x : str
        Text to clean.
    url_extractor : object, optional
        Any object exposing ``find_urls(text) -> list``. Defaults to the
        module-level ``extractor``.

    Returns
    -------
    str
        ``x`` without the detected URLs; ``x`` unchanged when none are found.
    """
    if url_extractor is None:
        url_extractor = extractor

    # Iterative on purpose: one stack frame per URL would raise RecursionError
    # on posts that contain a very large number of links.
    while True:
        urls = url_extractor.find_urls(x)
        if not urls:
            return x
        stripped = x.replace(urls[0], '')
        if stripped == x:
            # The reported URL is not a literal substring of the text, so
            # another pass could never make progress; stop instead of looping.
            return x
        x = stripped

In [4]:
# r_the_donald.title.isna().sum()

In [5]:
# Load the r/the_donald submissions and build one cleaned text field per post.
r_the_donald = pd.read_csv('data/the_donald_full.csv', low_memory=False)

# created_utc is parsed as epoch seconds.
r_the_donald['date'] = pd.to_datetime(r_the_donald.created_utc, unit='s')

# Title and body are joined with a space; missing values become '' first, so
# every row ends up with a string and nothing needs to be dropped afterwards.
# (The former `text_title.dropna(inplace=True)` had no effect for that reason.)
r_the_donald['text_title'] = (
    r_the_donald.title.fillna('') + ' ' + r_the_donald.selftext.fillna('')
).apply(replace_urls)

In [6]:
r_the_donald.shape

(1178929, 35)

In [7]:
# Tokenize the r/the_donald posts with gensim's simple_preprocess.
# Posts shorter than MIN_POST_CHARS characters (title + body, URLs removed)
# are skipped.
MIN_POST_CHARS = 50
donald_tokenized_posts = [
    gensim.utils.simple_preprocess(post)
    for post in r_the_donald.text_title
    if len(post) >= MIN_POST_CHARS
]

In [8]:
print('r/the_donald:',len(donald_tokenized_posts))

r/the_donald: 798533


### Defining our word2vec models

In [9]:
import multiprocessing
from tqdm import tqdm
from sklearn import utils
from gensim.models.callbacks import CallbackAny2Vec

In [10]:
# defining a class to log our loss for each epoch
# note that gensim has some issues/bugs with the compute loss parameter
# so this is being used to loosely identify the elbow in our loss

class LossLogger(CallbackAny2Vec):
    '''Callback that prints the training loss contributed by each epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total, so the loss for one epoch is the difference between the
    current reading and the reading taken at the end of the previous epoch.

    Attributes:
        epoch: zero-based index of the epoch that will be reported next.
        loss_to_be_subed: running total read at the end of the previous epoch.
        losses: per-epoch losses of the current training run, in order.
    '''

    def __init__(self):
        self._reset()

    def _reset(self):
        '''Put the logger back into its initial, nothing-trained-yet state.'''
        self.epoch = 0
        self.loss_to_be_subed = 0
        self.losses = []

    def on_train_begin(self, model):
        '''Start every training run from a clean state.

        Without this, reusing one logger instance for a second model would
        subtract the previous run's final total from the new run's first
        reading and keep counting epochs where the previous run stopped.
        '''
        self._reset()

    def on_epoch_end(self, model):
        '''Print and record the loss accumulated during the epoch just finished.'''
        loss = model.get_latest_training_loss()
        loss_now = loss - self.loss_to_be_subed
        self.loss_to_be_subed = loss
        self.losses.append(loss_now)
        print('Loss after epoch {}: {}'.format(self.epoch, loss_now))
        self.epoch += 1
        
loss_logger = LossLogger()

# also to speed things up we will want to use all available cores
cores = multiprocessing.cpu_count()

In [11]:
# Train a skip-gram (sg=1) word2vec model with 300-dimensional vectors on the
# tokenized r/the_donald posts, logging the per-epoch loss via loss_logger.
#
# Original author notes (written for r/depression): words appearing fewer than
# 9 times were predominantly uncommon misspellings, the 10-15 range showed
# fewer misspellings and more valid uncommon words, and anything over 10
# epochs gave good results.
# NOTE(review): those notes mention r/depression and a min count of 9, but this
# call trains on r/the_donald with min_count=5 and iter=35 - confirm which
# values are intended.
# NOTE(review): the logged per-epoch loss falls abruptly after epoch 9;
# presumably an artifact of gensim's loss accumulation rather than real
# convergence - TODO confirm before using it to choose an epoch count.
# `size` and `iter` are the gensim 3.x keyword names (renamed to `vector_size`
# and `epochs` in gensim 4).

r_the_donald_model = gensim.models.Word2Vec(
                                            donald_tokenized_posts,
                                            size=300,
                                            window=5,
                                            min_count=5,
                                            sg=1,
                                            callbacks=[loss_logger],
                                            compute_loss=True,
                                            iter=35,
                                            workers=cores
                                            )

Loss after epoch 0: 11159144.0
Loss after epoch 1: 8893936.0
Loss after epoch 2: 8081706.0
Loss after epoch 3: 7140054.0
Loss after epoch 4: 5229868.0
Loss after epoch 5: 5168140.0
Loss after epoch 6: 5111432.0
Loss after epoch 7: 5061340.0
Loss after epoch 8: 4982492.0
Loss after epoch 9: 5007932.0
Loss after epoch 10: 1471588.0
Loss after epoch 11: 272280.0
Loss after epoch 12: 273304.0
Loss after epoch 13: 269120.0
Loss after epoch 14: 264264.0
Loss after epoch 15: 268144.0
Loss after epoch 16: 262648.0
Loss after epoch 17: 256080.0
Loss after epoch 18: 255136.0
Loss after epoch 19: 249632.0
Loss after epoch 20: 248176.0
Loss after epoch 21: 247440.0
Loss after epoch 22: 243176.0
Loss after epoch 23: 237888.0
Loss after epoch 24: 230432.0
Loss after epoch 25: 230248.0
Loss after epoch 26: 223160.0
Loss after epoch 27: 218296.0
Loss after epoch 28: 218168.0
Loss after epoch 29: 216848.0
Loss after epoch 30: 215160.0
Loss after epoch 31: 211688.0
Loss after epoch 32: 210328.0
Loss aft

In [23]:
r_the_donald_model.save("models/r_the_donald.model")

In [None]:
# r_anxiety_model.save("models/r_anxiety.model")

### Examining the learned word embeddings for our models

In [22]:
r_the_donald_model.wv.most_similar(['social'], topn=10)

[('media', 0.47440338134765625),
 ('platforms', 0.4521333873271942),
 ('smaca', 0.4504384994506836),
 ('censorship', 0.41543710231781006),
 ('garfunkel', 0.4153199791908264),
 ('mytaxcut', 0.41112202405929565),
 ('isolationism', 0.4109939634799957),
 ('justice', 0.4057742953300476),
 ('trumptown', 0.4027055501937866),
 ('giants', 0.3964461088180542)]

In [None]:
r_depression_model.wv.most_similar(['tired'], topn=10)

In [None]:
r_depression_model.wv.most_similar(['love'], topn=10)

In [None]:
# anxiety[anxiety.selftext.str.contains(' reddit ')].values[2]

In [None]:
r_depression_model.wv.most_similar(positive=['social', 'media', 'account'], topn=10)

In [None]:
r_depression_model.wv.most_similar(['river'], topn=30)

In [None]:
r_depression_model.wv.most_similar(['hate'], topn=30)

In [None]:
r_anxiety_model.wv.most_similar(positive=['psychiatrist'], negative=['prescribed'], topn=20)

In [None]:
r_depression_model.wv.most_similar(positive=['psychiatrist'], negative=['prescribed'], topn=20)

In [None]:
r_depression_model.wv.most_similar(positive=['comment'], topn=10)

In [None]:
r_anxiety_model.wv.most_similar(positive=['comment'], topn=10)

In [None]:
r_depression_model.wv.most_similar(positive=['boy', 'woman'], negative=['man'],topn=10)

side effects (insomnia, libido), prescriptions (adderall, antidepressant/SSRI)

gender = king - queen , gender = man - woman

king - x = man - woman
man - woman - king = -x
woman - man + king = x

man + royal = king
woman + royal = queen



In [None]:
r_depression_model.wv.most_similar(positive=['effects'],topn=20)

In [None]:
r_depression_model.wv.most_similar(positive=['appetite'],topn=40)

In [None]:
r_depression_model.wv.most_similar(positive=['appetite', 'antidepressants'], negative=['stimulants'],topn=20)

In [None]:
r_depression_model.wv.most_similar(positive=['appetite', 'ssris'], negative=['stimulants'],topn=20)

In [None]:
# diff = model1.wv['libido'] + model1.wv['antidepressant']
# model2.wv.most_similar(['girl'],topn=100)

In [None]:
# diff = model.wv['stimulants'] - model.wv['anxiety'] + model.wv['antidepressants']
# model.wv.similar_by_vector(diff,topn=40)
# model2.wv.most_similar(positive=['appetite', 'zoloft'], negative=['adderall'], topn=40)

In [None]:
# sanity check
print(r_depression_model.wv.most_similar(positive=['guy', 'woman'], negative=['girl'], topn=10))
print()
print(r_depression_model.wv.most_similar(positive=['bad', 'happy'], negative=['good'], topn=10))
# print(model2.wv.most_similar(positive=['yeezy', 'obama'], negative=['japan'], topn=20))


In [None]:
print(r_depression_model.wv.most_similar(positive=['manga', 'america'], negative=['japan'], topn=20))
print()
print(r_depression_model.wv.most_similar(positive=['anime', 'america'], negative=['japan'], topn=20))

In [None]:
print(r_anxiety_model.wv.most_similar(positive=['manga', 'america'], negative=['japan'], topn=20))
print()
print(r_anxiety_model.wv.most_similar(positive=['anime', 'america'], negative=['japan'], topn=20))

In [None]:
wt = r_depression_model.wv.most_similar(positive=['man', 'talks'], negative=['woman'], topn=30)
mt = r_depression_model.wv.most_similar(positive=['woman', 'talks'], negative=['man'], topn=20)
mt

In [None]:
# Mirrored analogies on the r/depression model:
#   wt: guy  - girl + talks
#   mt: girl - guy  + talks
# (Both queries used to be identical, so mt only duplicated wt.)
wt = r_depression_model.wv.most_similar(positive=['guy', 'talks'], negative=['girl'], topn=20)
mt = r_depression_model.wv.most_similar(positive=['girl', 'talks'], negative=['guy'], topn=20)
wt

In [None]:
r_depression_model.wv.most_similar(positive=['dog', 'barks'], negative=['cat'],topn=30)

In [None]:
r_depression_model.wv.most_similar(positive=['girl', 'engineer'], negative=['guy'],topn=30)

In [None]:
# print(model2.wv.most_similar(positive=['man', 'gives'], negative=['woman'], topn=10))
wt = r_anxiety_model.wv.most_similar(positive=['man', 'talks'], negative=['woman'], topn=20)
mt = r_anxiety_model.wv.most_similar(positive=['woman', 'talks'], negative=['man'], topn=20)
mt

In [None]:
# Mirrored "gives" analogies on the r/anxiety model: guy - girl + gives, then
# girl - guy + gives. (The same query used to be printed twice.)
print(r_anxiety_model.wv.most_similar(positive=['guy', 'gives'], negative=['girl'], topn=20))
print()
print(r_anxiety_model.wv.most_similar(positive=['girl', 'gives'], negative=['guy'], topn=20))

In [None]:
print(r_depression_model.wv.most_similar(positive=['guy', 'gives'], negative=['girl'], topn=20))
print()
print(r_depression_model.wv.most_similar(positive=['girl', 'gives'], negative=['guy'], topn=20))

In [None]:
print(r_depression_model.wv.most_similar(positive=['guy', 'talks'], negative=['girl'], topn=15))


In [None]:
print(r_depression_model.wv.most_similar(positive=['guy', 'asks'], negative=['girl'], topn=15))


In [None]:
def plot_similarities(pairs, title, color='salmon'):
    """Draw a horizontal bar chart of (word, cosine similarity) pairs.

    Parameters
    ----------
    pairs : list of (str, float)
        Result of a ``most_similar`` query.
    title : str
        Figure title.
    color : str, optional
        Bar colour.

    Returns
    -------
    (Figure, Axes)
        The created matplotlib figure and axes.
    """
    labels = [word for word, _ in pairs]
    scores = [round(score, 4) for _, score in pairs]

    # plt.subplots already creates the figure; a separate plt.figure() call
    # beforehand would only leave an empty figure in the output.
    fig, ax = plt.subplots(figsize=[10, 8])
    ax.barh(labels, scores, color=color)
    ax.set_title(title, color='black', fontsize=13)
    ax.set_xlabel('Cosine Similarity', fontsize=12, color='black', fontname='Osaka')
    ax.grid(False)
    ax.tick_params(axis="y", colors='black', labelsize=12)
    ax.tick_params(axis="x", colors="black", labelsize=12)
    return fig, ax


# Same analogy (anime - japan + america) queried on both subreddit models.
wt = r_depression_model.wv.most_similar(positive=['anime', 'america'], negative=['japan'], topn=12)
mt = r_anxiety_model.wv.most_similar(positive=['anime', 'america'], negative=['japan'], topn=12)

plot_similarities(mt, 'Anime - Japan + America (r/anxiety)\n')
plot_similarities(wt, 'Anime - Japan + America (r/depression)\n');

In [None]:
r_depression_model.wv.most_similar(['energy'], topn=20)

In [None]:
r_depression_model.wv.most_similar(['insomnia'], topn=30)

In [None]:
r_depression_model.wv.most_similar(['circadian'], topn=30)

In [None]:
r_anxiety_model.wv.most_similar(['circadian'], topn=30)

In [None]:
r_depression_model.wv.most_similar(positive=['sleep'], negative=['insomnia'], topn=20)

In [None]:
# model.wv.similar_by_vector(negative=['psychiatrist', 'prescribed'],topn=20)
# model2.wv.most_similar(positive=['psychiatrist'], negative=['prescribed'], topn=20)
# model2.wv.most_similar(positive=['addict'], negative=['drugs'], topn=20)
r_depression_model.wv.most_similar(positive=['alcoholic'], negative=['alcohol'], topn=20)



In [None]:
r_depression_model.wv.most_similar(positive=['alcoholic'], topn=20)


In [None]:
r_anxiety_model.wv.most_similar(positive=['alcoholic'], negative=['alcohol'], topn=20)


In [None]:
# r_depression_model.wv.most_similar(positive=['sad', 'happy'], negative=['tears'], topn=20)

In [None]:
print(r_anxiety_model.wv.most_similar(positive=['alcoholic'], topn=20))
print()
print(r_anxiety_model.wv.most_similar(positive=['addict'], negative=['drugs'], topn=20))

In [None]:
print(r_depression_model.wv.most_similar(positive=['alcoholic'], topn=20))
print()
print(r_depression_model.wv.most_similar(positive=['addict'], negative=['drugs'], topn=20))

In [None]:
# doc_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
# doc_model

In [None]:
# import seaborn as sns

# sns.set_theme(style="whitegrid")
# a = r_depression_model.wv['man']
# b = r_depression_model.wv['woman']

# c = r_depression_model.wv['asked']
# d = r_depression_model.wv['told']



# # a = np.array([-1,1])
# # b = np.array([1,1])
# x = a + c - b

# x2 = r_depression_model.wv['therapist']
# x3 = r_depression_model.wv.similar_by_vector(x)

# # fig = plt.figure(figsize=[15,15])
# # ax = ax.gca(projection='3d')
# fig, ax = plt.subplots(figsize=[10, 8])
# V = np.array([a, b])
# origin = np.array([[0, 0],[0, 0]]) # origin point

# ax.quiver(*origin, V[:,0], V[:,1], color=['r','b','g'], scale=1.2)
# # plt.xlim([-.002, .002])
# # plt.show()
# # model.wv.similar_by_vector(x)

# ax.axes.tick_params(axis="y", colors='white', labelsize=10)
# ax.axes.tick_params(axis="x", colors="white", labelsize=10)
# plt.grid(False)
# # plt.savefig('vecs11.png', transparent=True, dpi=300)


In [None]:
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# from nltk.tokenize import word_tokenize

# text = data_copy.text_title
# tokenized_self_text = [gensim.utils.simple_preprocess(d) for d in text]

# author = data_copy.author
# tokenized_title = [gensim.utils.simple_preprocess(d) for d in author]


# tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
# tagged_data[1:10]

In [None]:
# docs = []
# for index, doc in enumerate(tokenized_self_text):
#     tagged = TaggedDocument(words=doc, tags=[index])
#     docs.append(tagged)

In [None]:
# # define parameters for doc2vec
# doc_model = gensim.models.Doc2Vec(
#                                   vector_size=300,
#                                   window=5,
#                                   min_count=5,
#                                   workers=cores,
#                                   epochs=10,
#                                   alpha=.025,
#                                   min_alpha = 0.0001)
# doc_model.build_vocab(docs)

In [None]:
# train model
# doc_model.train(docs, total_examples=doc_model.corpus_count, epochs=doc_model.epochs)

In [None]:
# a = '''
# Slowly isolating myself
# Sorry just wanted to say something to anyone really
# but i used to talk to a couple people about my issues
# and how i was doing but i feel like everytime i do i
# annoy them and that im being boring or just a pain to be around.
# I always overthink situations and im probably doing the same here but
# im sat here just wishing i could talk to those i trust but i
# also dont want to be a burden all the time. I cant talk to many people
# and i struggle to talk to new people. Also there some of my best friends
# i dont want to lose them but i also know i need to express how I feel.
# Thank you for reading my rambling if you did..
# '''

In [None]:
# testing = gensim.utils.simple_preprocess(a)

In [None]:
# inferred_vector = doc_model.infer_vector(testing)
# sims = doc_model.docvecs.most_similar([inferred_vector], topn=len(doc_model.docvecs))
# sims[:10]

In [None]:
# testing = '''
# i really need advice on my medication i just starting taking zoloft'''

# testing = gensim.utils.simple_preprocess(testing)
# testing

In [None]:
# vec_df = pd.DataFrame(doc_model.docvecs.vectors_docs)

In [None]:
# doc_model.docvecs.vectors_docs.shape

### Saving our word embeddings to a tensorflow friendly format so we can load embeddings into embedding projector

In [24]:

# Export the saved r/the_donald embeddings as two TSV files for the
# embedding projector:
#   <path2>meta.tsv - one row per word with its corpus count (header included)
#   <path2>vecs.tsv - one row per word, vector components separated by `\t`
# Rows appear in the same order in both files (the order of wv.index2word).
import io

# NOTE(review): absolute, machine-specific output location - consider a path
# relative to the project, like the 'models/' and 'data/' paths used elsewhere.
path2 = '/Users/collinswestnedge/programming/project_05/tensor_data/r_the_donald_'

w2v = gensim.models.Word2Vec.load("models/r_the_donald.model")

words = list(w2v.wv.index2word)
counts = [w2v.wv.vocab[word].count for word in words]

pd.DataFrame({'word': words, 'count': counts}).to_csv(path2 + 'meta.tsv', index=False, sep='\t')

# The context manager closes the file even if writing a row fails.
with io.open(path2 + 'vecs.tsv', 'w', encoding='utf-8') as out_v:
    # Row i of wv.vectors belongs to index2word[i].
    for vec in w2v.wv.vectors[:len(words)]:
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")

In [None]:
from collections import defaultdict

# Word -> corpus count for every entry of the loaded model's vocabulary.
# Kept as a defaultdict(int) so unknown words read as a count of 0.
unique_words = defaultdict(int)
unique_words.update(
    (token, w2v.wv.vocab[token].count) for token in w2v.wv.index2word
)

In [None]:
len(unique_words)

### Examining co-occurrences for schizophrenic and psychotic

In [None]:
# Comment bodies that are present and contain the substring 'schizophrenic'.
comment_has_term = (
    dep_post_comments.body.notna()
    & dep_post_comments.body.str.contains('schizophrenic')
)
comments_schizophrenic = dep_post_comments.loc[comment_has_term, 'body']

# Same filter applied to the combined title + body text of the posts.
post_has_term = (
    depression.text_title.notna()
    & depression.text_title.str.contains('schizophrenic')
)
posts_schizophrenic = depression.loc[post_has_term, 'text_title']

In [None]:
# this is a very unaggressive tokenization so we may have to do more later.

# posts_schizophrenic_tokenized = [gensim.utils.simple_preprocess(p) for p in posts_schizophrenic]
# comments_schizophrenia_tokenized = [gensim.utils.simple_preprocess(p) for p in comments_schizophrenia]
# schizophrenic_tokenized = posts_schizophrenic_tokenized + comments_schizophrenia_tokenized

schizophrenic_full = pd.concat([posts_schizophrenic,comments_schizophrenic])

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en")

def process_text(nlp, text, pos_list, lemma=False):
    """Strip stop words and punctuation from each document.

    Parameters
    ----------
    nlp : spaCy language pipeline
        Must expose ``pipe(texts, disable=[...])``.
    text : iterable of str
        Documents to process.
    pos_list : list of str
        Coarse part-of-speech labels to keep. An empty list keeps every
        token that is neither a stop word nor punctuation.
    lemma : bool, optional
        Emit token lemmas instead of the surface text.

    Returns
    -------
    list of str
        One space-joined string of kept tokens per input document.
    """
    # The parser and NER are never needed here. The tagger is only skipped
    # when no POS filter is requested: with it disabled token.pos_ stays
    # empty, so a POS filter could never match anything.
    disabled = ['parser', 'ner']
    if not pos_list:
        disabled.append('tagger')

    text_full = []
    for doc in nlp.pipe(text, disable=disabled):
        kept = []
        for token in doc:
            if token.is_stop or token.is_punct:
                continue
            if pos_list and token.pos_ not in pos_list:
                continue
            # Keep the word itself (not its POS label) so the result is text.
            kept.append(token.lemma_ if lemma else token.text)
        text_full.append(" ".join(kept))

    return text_full

corp = schizophrenic_full
# corp = useable_text.text_title
processed_text = process_text(nlp, corp, pos_list=[], lemma=False)
len(processed_text)

In [None]:
from nltk import bigrams
import itertools
import collections

# Split each cleaned document on whitespace and list its adjacent word pairs.
tokenized_text = [cleaned.split() for cleaned in processed_text]
terms_bigram = [list(bigrams(tokens)) for tokens in tokenized_text]

# Keep, in document order, every pair in which either word is exactly
# 'schizophrenic'.
schizophrenic_bigrams = [
    pair
    for doc_pairs in terms_bigram
    for pair in doc_pairs
    if 'schizophrenic' in pair
]

In [None]:
bigram_counts = collections.Counter(schizophrenic_bigrams)
bigram_counts.most_common(50)

In [None]:
schizophrenic_full[schizophrenic_full.str.contains('father')].values[1]

In [None]:
schizophrenic_full[schizophrenic_full.str.contains('father')].values[2]

In [None]:
schizophrenic_full[schizophrenic_full.str.contains('father')].values[5]

In [None]:
schizophrenic_full[schizophrenic_full.str.contains('father')].values[6]

In [None]:
schizophrenic_full[schizophrenic_full.str.contains('homicidal')].values[0]

In [None]:
schizophrenic_full[schizophrenic_full.str.contains('homicidal')].values[1]

In [None]:
schizophrenic_full[schizophrenic_full.str.contains('homicidal')].values[2]

In [None]:
schizophrenic_full[schizophrenic_full.str.contains('psychopath')].values[1]

In [None]:
schizophrenic_full[schizophrenic_full.str.contains('psychopath')].values[2]

In [None]:
# Rebuild the 'schizophrenic' subsets, this time keeping the timestamp column
# next to the text so the matches can be grouped by date.
# .copy() makes each result an independent frame: a column is added to
# comments_schizophrenic afterwards, and assigning into a filtered slice of
# the original frame triggers pandas' SettingWithCopyWarning.
comments_schizophrenic = dep_post_comments.loc[
    dep_post_comments.body.notna()
    & dep_post_comments.body.str.contains('schizophrenic'),
    ['created_utc_comment', 'body'],
].copy()

posts_schizophrenic = depression.loc[
    depression.text_title.notna()
    & depression.text_title.str.contains('schizophrenic'),
    ['date', 'text_title'],
].copy()

In [None]:
comments_schizophrenic['date'] = pd.to_datetime(comments_schizophrenic.created_utc_comment, unit='s')

In [None]:
posts_schizophrenic.groupby(posts_schizophrenic.date.dt.year).size()

In [None]:
# depression[depression.selftext.str.contains('the_donald')]