In [1]:
import requests
import json
from tqdm import tqdm
from convokit import Corpus, User, Utterance, download

In [2]:
mh_corpus = Corpus(filename=download("subreddit-depressed"))
mh_corpus.load_info('utterance',['parsed'])

Dataset already exists at /Users/Emilie/.convokit/downloads/subreddit-depressed


In [3]:
#a = Corpus(filename=download("subreddit-Anxiety"))

In [4]:
e = Corpus(filename=download('subreddit-depressed'))

Dataset already exists at /Users/Emilie/.convokit/downloads/subreddit-depressed


In [5]:
all_data = mh_corpus.merge(e)

In [6]:
#https://gist.github.com/xandaschofield/3c4070b2f232b185ce6a09e47b4e7473 

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer as CV
import string

exclude = set(string.punctuation)


def basic_sanitize(in_string):
    '''Returns a very roughly sanitized version of the input string.'''
    return_string = ''.join([ch for ch in in_string if ord(ch) < 128 and ch not in exclude]).lower()
    return_string = ' '.join(return_string.split())
    return return_string

def bayes_compare_language(l1, l2, ngram=1, prior=.01, cv=None, sig_val=2.573):
    '''
    Arguments:
    - l1, l2; a list of strings from each language sample
    - ngram; an int describing up to what n gram you want to consider (1 is unigrams,
    2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
    - prior; either a float describing a uniform prior, or a vector describing a prior
    over vocabulary items. If you're using a predefined vocabulary, make sure to specify that
    when you make your CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.
    Returns:
    - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple.'''
    if cv is None and type(prior) is not float:
        print("If using a non-uniform prior:")
        print("Please also pass a count vectorizer with the vocabulary parameter set.")
        quit()
    l1 = [basic_sanitize(l) for l in l1]
    l2 = [basic_sanitize(l) for l in l2]
    if cv is None:
        cv = CV(decode_error = 'ignore', min_df = 10, max_df = .5, ngram_range=(1,ngram),
                binary = False,
                max_features = 15000)
    counts_mat = cv.fit_transform(l1+l2).toarray()
    # Now sum over languages...
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    if type(prior) is float:
        priors = np.array([prior for i in range(vocab_size)])
    else:
        priors = prior
    z_scores = np.empty(priors.shape[0])
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis = 0)
    count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis = 0)
    a0 = np.sum(priors)
    n1 = 1.*np.sum(count_matrix[0,:])
    n2 = 1.*np.sum(count_matrix[1,:])
    print("Comparing language...")
    for i in range(vocab_size):
        #compute delta
        term1 = np.log((count_matrix[0,i] + priors[i])/(n1 + a0 - count_matrix[0,i] - priors[i]))
        term2 = np.log((count_matrix[1,i] + priors[i])/(n2 + a0 - count_matrix[1,i] - priors[i]))        
        delta = term1 - term2
        #compute variance on delta
        var = 1./(count_matrix[0,i] + priors[i]) + 1./(count_matrix[1,i] + priors[i])
        #store final score
        z_scores[i] = delta/np.sqrt(var)
    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    sorted_indices = np.argsort(z_scores)
    return_list = [(index_to_term[i], z_scores[i]) for i in sorted_indices]
    
    x_vals = count_matrix.sum(axis=0)
    y_vals = z_scores
    sizes = abs(z_scores) * 2
    neg_color, pos_color, insig_color = ('orange', 'purple', 'grey')
    colors = []
    annots = []
    for i, y in enumerate(y_vals):
        if y > sig_val:
            colors.append(pos_color)
            annots.append(index_to_term[i])
        elif y < -sig_val:
            colors.append(neg_color)
            annots.append(index_to_term[i])
        else:
            colors.append(insig_color)
            annots.append(None)

    fig, ax = plt.subplots()
    ax.scatter(x_vals, y_vals, c=colors, s=sizes, linewidth=0)
    for i, annot in enumerate(annots):
        if annot is not None:
            ax.annotate(annot, (x_vals[i], y_vals[i]), color=colors[i], size=sizes[i])
    ax.set_xscale('log')
    
    plt.savefig('test.pdf')

In [7]:
from nltk.corpus import stopwords
stopwords  = stopwords.words('english')

def removeStopwords(wordlist, stopwords):
    return [w for w in wordlist if w not in stopwords]

def wordListToFreqDict(wordlist):
    wordfreq = [wordlist.count(p) for p in wordlist]
    return dict(list(zip(wordlist,wordfreq)))

In [8]:
mh_convos = [] 
for i in (list(mh_corpus.iter_conversations())):
    mh_convos.append((i.meta['title']))

mh_wordlist = [i.split() for i in mh_convos]
mh_clean_wordlist = removeStopwords(mh_wordlist, stopwords)

In [9]:
e_convos = [] 
for i in (list(e.iter_conversations())):
    e_convos.append((i.meta['title']))

e_wordlist = [i.split() for i in e_convos]
e_clean_wordlist = removeStopwords(e_wordlist, stopwords)

In [10]:
e_convos

["Wasn't sure where else to post this... feeling terrible",
 'In my head',
 'Boyfriend just broke up with me today;I thought he was the one.',
 'I have 6 months to live',
 'Really Depressed',
 'Progression of depression...',
 'A story of Depression',
 'Nobody to talk to.',
 "I'm doing it tomorrow...",
 'I need help',
 'Tired of life',
 'I hate myself',
 "I've done everything and it wasn't enough",
 '"You should be happy"',
 "i Don't even think I can withstand life itself anymore",
 "Apparently I can't even bring up the most basic social skills.",
 'Not quite sure how to put it all',
 'What goes on in my head',
 'What happened?',
 'Follow up: "I\'m doing it tomorrow..."',
 "Realizing I'm have depression.",
 'What do I do?',
 'Its not the same anymore.',
 'Writing this helped me cry then I felt better so here we go',
 "I don't want to be like this anymore",
 'Best friend left me.',
 'Human',
 'Just tears at my heart...',
 'So depressed',
 'Is this it? Is this what I have to look foward t

In [11]:
bayes_compare_language(mh_convos, e_convos)

Vocab size is 53
Comparing language...


Topic Modeling 

In [12]:
#https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/topic_modeling_Gensim.ipynb 
# https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens


[nltk_data] Downloading package wordnet to /Users/Emilie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Emilie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
text_data = []
for line in e_convos: 
    tokens = prepare_text_for_lda(line) 
    text_data.append(tokens)

In [14]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')


## Try 5 topics

In [15]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [16]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.026*"please" + 0.026*"depression" + 0.018*"enough" + 0.018*"month"')
(1, '0.029*"everyone" + 0.029*"really" + 0.022*"happy" + 0.015*"everything"')
(2, '0.051*"feeling" + 0.022*"nothing" + 0.022*"break" + 0.022*"stick"')
(3, '0.090*"depression" + 0.063*"anymore" + 0.029*"depress" + 0.022*"school"')
(4, '0.064*"depress" + 0.042*"friend" + 0.035*"anyone" + 0.035*"someone"')


In [17]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

[]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
(0, '0.041*"depress" + 0.036*"anymore" + 0.022*"everything" + 0.021*"friend"')
(1, '0.024*"anyone" + 0.019*"depress" + 0.015*"thought" + 0.015*"alone"')
(2, '0.090*"depression" + 0.019*"feeling" + 0.018*"story" + 0.018*"month"')


In [18]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.042*"fail" + 0.042*"tomorrow" + 0.022*"convince" + 0.022*"getting"')
(1, '0.240*"depression" + 0.032*"depress" + 0.017*"night" + 0.017*"found"')
(2, '0.059*"feeling" + 0.059*"really" + 0.040*"afraid" + 0.040*"thinking"')
(3, '0.094*"depress" + 0.022*"nothing" + 0.022*"right" + 0.022*"today"')
(4, '0.032*"write" + 0.032*"alone" + 0.032*"stick" + 0.017*"anyone"')
(5, '0.053*"suck" + 0.053*"think" + 0.053*"school" + 0.036*"lonely"')
(6, '0.045*"someone" + 0.045*"thought" + 0.031*"depress" + 0.031*"reddit"')
(7, '0.054*"everything" + 0.041*"break" + 0.041*"everyone" + 0.027*"month"')
(8, '0.050*"anyone" + 0.050*"happy" + 0.034*"enough" + 0.018*"friend"')
(9, '0.132*"anymore" + 0.031*"another" + 0.031*"quite" + 0.030*"somebody"')


In [19]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [20]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [21]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [22]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
