## Topic Modeling - Post Body and Post Title

This was an attempt to bucket post body and post title topics.  Unfortunately, I was unable to come up with any meaningful topics or identify significant differences between the r/Depression and r/SuicideWatch subreddits

In [18]:
import pandas as pd
import numpy as np
import seaborn as sns

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk

In [46]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)][0]

def preprocess(doc):
    result = []
    for word in doc:
        for token in gensim.utils.simple_preprocess(word):
            if token not in gensim.parsing.preprocessing.STOPWORDS:
                result.append(lemmatize_text(token))
    return result


In [84]:
df_d_c = pd.read_csv('depression_comments.csv').drop('Unnamed: 0', axis = 1)
df_s_c = pd.read_csv('suicidewatch_comments.csv').drop('Unnamed: 0', axis = 1)

df_d_c = df_d_c.drop_duplicates(['p_id'], keep='last')
df_s_c = df_s_c.drop_duplicates(['p_id'], keep='last')

print(df_d_c.shape)
print(df_s_c.shape)

(934, 41)
(980, 41)


In [91]:
sw = nltk.corpus.stopwords.words('english')

newStopWords = ['feel','want', 'dont', 'think', 'need', 'feeling']
sw.extend(newStopWords)

In [92]:
df_d_c['title_sw_p'] = df_d_c['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)])).astype(str).str.replace('[^\w\s]', '').str.lower()
df_d_c['title_pre'] = df_d_c['title_sw_p'].str.split(' ')
processed_docs = df_d_c['title_pre'].map(preprocess)
processed_docs[:10]

469                             [regular, checkin, post]
517    [mostbroken, leastunderstood, rule, helper, in...
554              [depressed, people, way, nicer, people]
617                                        [browse, sub]
629                                       [fix, problem]
637    [head, depression, isnt, real, excuse, one, go...
639    [purposely, sad, miserable, loop, selffailure,...
660                     [actually, going, school, today]
678                  [love, wearing, oversized, hoodies]
696                                  [life, short, long]
Name: title_pre, dtype: object

In [93]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


0 checkin
1 post
2 regular
3 contact
4 explain
5 helper
6 invite
7 leastunderstood
8 mostbroken
9 new
10 private


In [94]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[]


In [100]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=2, 
                                       id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.183*"im" + 0.088*"help" + 0.078*"life" + 0.067*"depressed" + 0.058*"day" + 0.052*"time" + 0.051*"like" + 0.047*"sad" + 0.045*"today" + 0.044*"tired"
Topic: 1 
Words: 0.127*"depression" + 0.111*"like" + 0.110*"im" + 0.076*"people" + 0.068*"life" + 0.066*"know" + 0.059*"dont" + 0.054*"hate" + 0.046*"die" + 0.042*"going"


In [99]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=2, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.142*"depression" + 0.100*"help" + 0.079*"people" + 0.071*"im" + 0.064*"know" + 0.055*"sad" + 0.053*"anymore" + 0.047*"time" + 0.046*"thing" + 0.046*"today"
Topic: 1 Word: 0.160*"im" + 0.117*"life" + 0.110*"like" + 0.064*"depressed" + 0.048*"hate" + 0.045*"day" + 0.045*"dont" + 0.044*"friend" + 0.042*"going" + 0.040*"feeling"


In [101]:
df_s_c['title_sw_p'] = df_s_c['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)])).astype(str).str.replace('[^\w\s]', '').str.lower()
df_s_c['title_pre'] = df_s_c['title_sw_p'].str.split(' ')
processed_docs = df_s_c['title_pre'].map(preprocess)
processed_docs[:10]

131    [new, wiki, avoid, accidentally, encouraging, ...
171    [reminder, absolutely, activism, kind, allowed...
192     [store, ice, cream, kill, idk, ill, decide, car]
219                                             [failed]
247        [fear, whats, keep, killing, make, depressed]
258                                                [day]
303                                  [absurd, like, wtf]
321                    [idea, reincarnation, terrifying]
328    [update, told, mom, time, tried, commit, suicide]
336                                 [anonymous, goodbye]
Name: title_pre, dtype: object

In [102]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

0 accidentally
1 avoid
2 covert
3 encouraging
4 incitement
5 new
6 spot
7 suicide
8 wiki
9 absolutely
10 activism


In [103]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 1.0)]


In [108]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=2, 
                                       id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.130*"im" + 0.109*"life" + 0.076*"suicide" + 0.071*"die" + 0.070*"like" + 0.047*"help" + 0.045*"end" + 0.044*"going" + 0.040*"time" + 0.037*"friend"
Topic: 1 
Words: 0.168*"im" + 0.084*"kill" + 0.055*"dont" + 0.051*"people" + 0.048*"anymore" + 0.046*"suicidal" + 0.044*"thought" + 0.044*"today" + 0.039*"suicide" + 0.033*"live"


In [109]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=2, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.099*"suicide" + 0.098*"life" + 0.085*"die" + 0.062*"like" + 0.056*"help" + 0.055*"im" + 0.049*"end" + 0.037*"fucking" + 0.034*"day" + 0.033*"time"
Topic: 1 Word: 0.205*"im" + 0.082*"kill" + 0.052*"going" + 0.051*"anymore" + 0.050*"dont" + 0.047*"know" + 0.045*"people" + 0.044*"today" + 0.042*"suicidal" + 0.038*"help"


In [110]:
df_d_c['title'].unique

<bound method Series.unique of 469                                 Regular Check-In Post
517     Our most-broken and least-understood rules is ...
554     is it just me or are depressed people WAY nice...
617              Anyone else just browse this sub and cry
629              I could fix all my problems, yet I don't
                              ...                        
4030                       Therapists have feelings, too!
4031                    Dexamphetamine for lack of energy
4032                                  Back at square one.
4033                                        what do i do?
4034                                           Depression
Name: title, Length: 934, dtype: object>