In [2]:
import requests
import json
from tqdm import tqdm
from convokit import Corpus, User, Utterance

In [3]:
mh_corpus = Corpus(filename=download("subreddit-mentalhealth"))
mh_corpus.load_info('utterance',['parsed'])

Dataset already exists at /Users/Emilie/.convokit/downloads/subreddit-mentalhealth


In [5]:
#a = Corpus(filename=download("subreddit-Anxiety"))

In [6]:
e = Corpus(filename=download('subreddit-EatingDisorders'))

Downloading subreddit-EatingDisorders to /Users/Emilie/.convokit/downloads/subreddit-EatingDisorders
Downloading subreddit-EatingDisorders from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/corpus-zipped/EarthPornErrors~-~Ebay_deals/EatingDisorders.corpus.zip (6.7MB)... Done


In [20]:
all_data = mh_corpus.merge(e)

{'subreddit': 'EatingDisorders',
 'num_posts': 3457,
 'num_comments': 18755,
 'num_user': 4812}

In [27]:
#https://gist.github.com/xandaschofield/3c4070b2f232b185ce6a09e47b4e7473 

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer as CV
import string

exclude = set(string.punctuation)


def basic_sanitize(in_string):
    '''Returns a very roughly sanitized version of the input string.'''
    return_string = ''.join([ch for ch in in_string if ord(ch) < 128 and ch not in exclude]).lower()
    return_string = ' '.join(return_string.split())
    return return_string

def bayes_compare_language(l1, l2, ngram=1, prior=.01, cv=None, sig_val=2.573):
    '''
    Arguments:
    - l1, l2; a list of strings from each language sample
    - ngram; an int describing up to what n gram you want to consider (1 is unigrams,
    2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
    - prior; either a float describing a uniform prior, or a vector describing a prior
    over vocabulary items. If you're using a predefined vocabulary, make sure to specify that
    when you make your CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.
    Returns:
    - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple.'''
    if cv is None and type(prior) is not float:
        print("If using a non-uniform prior:")
        print("Please also pass a count vectorizer with the vocabulary parameter set.")
        quit()
    l1 = [basic_sanitize(l) for l in l1]
    l2 = [basic_sanitize(l) for l in l2]
    if cv is None:
        cv = CV(decode_error = 'ignore', min_df = 10, max_df = .5, ngram_range=(1,ngram),
                binary = False,
                max_features = 15000)
    counts_mat = cv.fit_transform(l1+l2).toarray()
    # Now sum over languages...
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    if type(prior) is float:
        priors = np.array([prior for i in range(vocab_size)])
    else:
        priors = prior
    z_scores = np.empty(priors.shape[0])
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis = 0)
    count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis = 0)
    a0 = np.sum(priors)
    n1 = 1.*np.sum(count_matrix[0,:])
    n2 = 1.*np.sum(count_matrix[1,:])
    print("Comparing language...")
    for i in range(vocab_size):
        #compute delta
        term1 = np.log((count_matrix[0,i] + priors[i])/(n1 + a0 - count_matrix[0,i] - priors[i]))
        term2 = np.log((count_matrix[1,i] + priors[i])/(n2 + a0 - count_matrix[1,i] - priors[i]))        
        delta = term1 - term2
        #compute variance on delta
        var = 1./(count_matrix[0,i] + priors[i]) + 1./(count_matrix[1,i] + priors[i])
        #store final score
        z_scores[i] = delta/np.sqrt(var)
    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    sorted_indices = np.argsort(z_scores)
    return_list = [(index_to_term[i], z_scores[i]) for i in sorted_indices]
    
    x_vals = count_matrix.sum(axis=0)
    y_vals = z_scores
    sizes = abs(z_scores) * 2
    neg_color, pos_color, insig_color = ('orange', 'purple', 'grey')
    colors = []
    annots = []
    for i, y in enumerate(y_vals):
        if y > sig_val:
            colors.append(pos_color)
            annots.append(index_to_term[i])
        elif y < -sig_val:
            colors.append(neg_color)
            annots.append(index_to_term[i])
        else:
            colors.append(insig_color)
            annots.append(None)

    fig, ax = plt.subplots()
    ax.scatter(x_vals, y_vals, c=colors, s=sizes, linewidth=0)
    for i, annot in enumerate(annots):
        if annot is not None:
            ax.annotate(annot, (x_vals[i], y_vals[i]), color=colors[i], size=sizes[i])
    ax.set_xscale('log')
    
    plt.savefig('test.pdf')

In [32]:
from nltk.corpus import stopwords
stopwords  = stopwords.words('english')

def removeStopwords(wordlist, stopwords):
    return [w for w in wordlist if w not in stopwords]

def wordListToFreqDict(wordlist):
    wordfreq = [wordlist.count(p) for p in wordlist]
    return dict(list(zip(wordlist,wordfreq)))

In [33]:
mh_convos = [] 
for i in (list(mh_corpus.iter_conversations())):
    mh_convos.append((i.meta['title']))

mh_wordlist = [i.split() for i in mh_convos]
mh_clean_wordlist = removeStopwords(mh_wordlist, stopwords)

In [34]:
e_convos = [] 
for i in (list(e.iter_conversations())):
    e_convos.append((i.meta['title']))

e_wordlist = [i.split() for i in e_convos]
e_clean_wordlist = removeStopwords(e_wordlist, stopwords)

In [40]:
e_convos

['GF has been having stomach problems recently, just yesterday admitted to me she was bulimic for 4yrs before we were together(x-post from r/health)',
 'BED is destroying me.',
 'People Say Stupid Things',
 'Alcoholism + Eating Disorders = Drunkorexia?',
 'Eating Disorders and Holiday Guilt',
 'Help Me Help Make ED Treatment More Accessible to All Who Need It!',
 "I don't know what it's considered but I don't want help with my ED.",
 'New to /r/EatingDisorders. If anyone needs someone to listen, you can PM me.',
 'First World Eating Disorder Problems',
 'What does recovery look like for you?',
 "I don't know what to do...",
 "I don't know what to do",
 'Eating Disorders and the Media: Anti-Airbrush Movement',
 "I'm not sure if I have an eating disorder, some advice?",
 'The long-term effects...',
 "I don't know what to do",
 'ED Sufferer seeks healthy ways to lose weight. Needs to stop the yo-yoing, any advice?',
 "I've been having trouble eating for a little over a year.",
 'Your opin

In [37]:
bayes_compare_language(mh_convos, e_convos)

Vocab size is 413
Comparing language...
