In [1]:
import pandas as pd
import numpy as np
import datetime 
import re
from urlextract import URLExtract
import spacy

In [2]:
# Module-level URLExtract instance; replace_urls() calls its find_urls() on each text.
extractor = URLExtract()
def replace_urls(x, url_extractor=None):
    """Return ``x`` with every URL reported by the extractor removed.

    The text is re-scanned after each removal, and all occurrences of the
    first reported URL are deleted per pass, until no URL is reported.

    Parameters
    ----------
    x : str
        Text to clean.
    url_extractor : object, optional
        Any object exposing ``find_urls(text) -> list[str]``. Defaults to the
        module-level ``extractor``.

    Returns
    -------
    str
        ``x`` without the reported URLs.
    """
    finder = extractor if url_extractor is None else url_extractor
    # Iterative on purpose: one recursive call per URL overflows the call
    # stack on texts that contain very many links.
    while True:
        urls = finder.find_urls(x)
        if not urls:
            return x
        stripped = x.replace(urls[0], '')
        if stripped == x:
            # The reported URL is not a literal substring, so nothing more can
            # be removed; stop instead of looping forever.
            return x
        x = stripped

In [3]:
def _load_posts(csv_path):
    """Read one scraped-posts CSV and apply the shared clean-up steps.

    Drops the 'Unnamed: 0' column, parses 'date' with pd.to_datetime and
    keeps the first row for every post 'id'.
    """
    return (
        pd.read_csv(csv_path, low_memory=False)
        .drop(columns=['Unnamed: 0'])
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .drop_duplicates(subset=['id'])
    )


mh_df = _load_posts('data/mh_rand_large.csv')
depression = _load_posts('data/depression_rand_large.csv')


In [4]:
# Stack the two loaded frames into a single frame. ignore_index is not passed, so
# each source frame keeps its own row labels and labels may repeat across the halves.
df_full = pd.concat([mh_df, depression])

In [5]:
def _clean_selftext(raw):
    """Flatten newlines, strip URLs, then collapse runs of spaces to one space.

    URLs are removed before the whitespace pass so that the gap left by a
    deleted link is collapsed too. The pattern ' {2,}' handles runs of any
    length; replacing two spaces with one only once would leave doubles behind.

    NOTE(review): str() turns a missing value into the literal text 'nan';
    this is unchanged here — confirm whether such rows should be dropped.
    """
    text = str(raw).replace('\n', ' ')
    text = replace_urls(text)
    return re.sub(r' {2,}', ' ', text)


# Normalise the post bodies in place on the combined frame.
df_full['selftext'] = df_full.selftext.map(_clean_selftext)

cols_of_interest = ['date', 'id', 'author', 'subreddit', 'title', 'selftext']

# Drop posts whose body is only a '[removed]' / '[deleted]' placeholder.
is_placeholder = df_full.selftext.isin(['[removed]', '[deleted]'])
df_full_clean = df_full[~is_placeholder].copy()
df_full_clean[cols_of_interest]

Unnamed: 0,date,id,author,subreddit,title,selftext
0,2019-10-11 16:41:57,dglq7k,liznormal23,mentalhealth,Mental Health Gray Area. Please help.,*TRIGGER WARNING: EATING DISORDER AND SUBSTANC...
1,2019-10-11 15:36:08,dgloc1,tacobean87,mentalhealth,what’s wrong with me ?,every single time i read something my mind blu...
2,2019-10-11 15:09:15,dglieu,AquilaVI,mentalhealth,I'm trying to deal with a deadly car crash. (A...,"Hello, people! A little backstory. I'll try to..."
3,2019-10-11 14:31:43,dglgl6,Mrcoolbaby,mentalhealth,Overwhelmed,I feel like my life is on loop. All the same k...
4,2019-10-11 11:39:06,dglfcf,TheA55M4N,mentalhealth,Anxiety and obsessions,I wrote my story here
...,...,...,...,...,...,...
16695,2020-10-23 13:04:12,jgsc4e,reddeaddoritos,depression,What was that??,More than year ago i had some sort of breakdow...
16696,2020-10-23 13:02:54,jgsb6b,satanslittl3sist3r,depression,I don’t want to be here anymore.,I’m bipolar1 and bpd. I have been doing some t...
16697,2020-10-23 12:59:24,jgs8ht,lIIIlllIIlllllIlllll,depression,I have become addicted,I cut myself on the upper arm I really want to...
16698,2020-10-23 12:58:42,jgs80h,ncorona12,depression,expierences with lexopro? (or any antidepressa...,I was prescribed 10 mg Lexapro yesterday and i...


In [6]:
class NLPPipe:
    """spaCy pre-processing followed by a scikit-learn style vectorizer.

    Each document is tokenised with spaCy, stop words and punctuation are
    dropped, and the surviving tokens are joined with single spaces before
    being handed to ``vectorizer``.

    Parameters
    ----------
    vectorizer : object
        Object exposing ``fit``, ``transform`` and ``fit_transform``.
    tokenizer : object
        Stored as ``self.tokenize``; not used by any method of this class.
    disable : list of str
        Passed as ``disable=`` to ``nlp.pipe``.
    pos : list of str
        If non-empty, keep only tokens whose ``pos_`` is in this list and emit
        their surface text. Takes precedence over ``lemma``.
    lemma : bool, default False
        If true (and ``pos`` is empty), emit ``lemma_`` instead of ``text``.
    """

    # Stop-word flags overridden on the loaded vocabulary (word -> is_stop).
    _STOP_WORD_OVERRIDES = {
        "myself": False,
        " ": True,
        "like": True,
        "think": True,
        "know": True,
    }

    def __init__(self, vectorizer, tokenizer, disable, pos, lemma=False):
        self.vectorizer = vectorizer
        self.tokenize = tokenizer
        self.disable = disable
        self.pos_list = pos
        self.lemma = lemma
        # Loaded lazily by _get_nlp() so that building the object stays cheap.
        self._nlp = None

    def _get_nlp(self):
        """Return the spaCy pipeline, loading and customising it on first use."""
        if getattr(self, "_nlp", None) is None:
            nlp = spacy.load("en_core_web_sm")
            for word, is_stop in self._STOP_WORD_OVERRIDES.items():
                nlp.vocab[word].is_stop = is_stop
            self._nlp = nlp
        return self._nlp

    def process_text(self, text):
        """Return one cleaned, space-joined string per document in ``text``."""
        nlp = self._get_nlp()

        text_full = []
        for doc in nlp.pipe(text, disable=self.disable):
            kept = [tok for tok in doc if not tok.is_stop and not tok.is_punct]
            if self.pos_list:
                # A part-of-speech filter wins over lemmatisation.
                words = [tok.text for tok in kept if tok.pos_ in self.pos_list]
            elif self.lemma:
                words = [tok.lemma_ for tok in kept]
            else:
                words = [tok.text for tok in kept]
            text_full.append(" ".join(words))

        return text_full

    def fit(self, text):
        """Clean ``text`` and fit the vectorizer; returns what ``vectorizer.fit`` returns."""
        return self.vectorizer.fit(self.process_text(text))

    def transform(self, text):
        """Clean ``text`` and return ``vectorizer.transform`` of it."""
        return self.vectorizer.transform(self.process_text(text))

    def fit_transform(self, text):
        """Clean ``text`` and return ``vectorizer.fit_transform`` of it."""
        return self.vectorizer.fit_transform(self.process_text(text))

In [34]:
# Holds the customised default spaCy pipeline so it is loaded only once per kernel.
_DEFAULT_NLP_CACHE = {}


def _load_default_nlp():
    """Load ``en_core_web_sm`` once, apply the stop-word overrides, and reuse it."""
    if "nlp" not in _DEFAULT_NLP_CACHE:
        nlp = spacy.load("en_core_web_sm")
        nlp.vocab["myself"].is_stop = False
        for extra_stop in (" ", "like", "think", "know"):
            nlp.vocab[extra_stop].is_stop = True
        _DEFAULT_NLP_CACHE["nlp"] = nlp
    return _DEFAULT_NLP_CACHE["nlp"]


def process_text(text, pos_list, lemma=False, disable=('parser', 'ner'), nlp=None):
    """Return one cleaned, space-joined string per document in ``text``.

    Stop words and punctuation are always dropped.

    Parameters
    ----------
    text : iterable of str
        Documents to process.
    pos_list : list of str
        If non-empty, keep only tokens whose ``pos_`` is in this list and emit
        their surface text. Takes precedence over ``lemma``.
    lemma : bool, default False
        If true (and ``pos_list`` is empty), emit ``lemma_`` instead of ``text``.
    disable : iterable of str, default ('parser', 'ner')
        Pipeline components passed as ``disable=`` to ``nlp.pipe``.
    nlp : object, optional
        Pre-loaded pipeline exposing ``pipe(texts, disable=...)``. It is used
        as given (no stop-word overrides are applied to it). When omitted, the
        cached ``en_core_web_sm`` pipeline with the overrides is used.

    Returns
    -------
    list of str
    """
    if nlp is None:
        nlp = _load_default_nlp()

    text_full = []
    for doc in nlp.pipe(text, disable=list(disable)):
        kept = [tok for tok in doc if not tok.is_stop and not tok.is_punct]
        if pos_list:
            # A part-of-speech filter wins over lemmatisation.
            words = [tok.text for tok in kept if tok.pos_ in pos_list]
        elif lemma:
            words = [tok.lemma_ for tok in kept]
        else:
            words = [tok.text for tok in kept]
        text_full.append(" ".join(words))

    return text_full

In [43]:
# Post bodies that feed the NLP steps.
corpus = df_full_clean['selftext']

# Quick look at the noun/adjective-only cleaning on the first 100 posts (by position).
a = process_text(corpus.iloc[:100], pos_list=['NOUN', 'ADJ'])
a

 'time mind chunk category hallucination brain internet diagnoses category psych eval mind chunk text word schizophrenia letters',
 'people things jobs Truck vehicle recovery job interview problem corpses blood horror movies gory picture Today car accident help recovery car site accident ramp highway work car row trees mph estimates bits flesh bones car tree driver thing blur things sight body bag number parts effect way thing childs booster seat wreckage man family grief accident life people suicide crash scooters helmets trunk car pieces car people eyes moment father emotions smell',
 'life loop incidents friends relationship lot failure People friends time pysco guy wierd person relationship friendship people lot fuck person lot relation fuck fucks time life people life lepper game',
 'story',
 'toughie clients church faith technique husband state diagnosis church husband church diagnosis suggestions condition progress church husband',
 '',
 'disappointment suicide times subject blo

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: unigrams only, alphabetic tokens of three or more letters,
# with the document-frequency bounds max_df=0.5 and min_df=10.
tfidf_options = {
    'max_df': 0.5,
    'min_df': 10,
    'ngram_range': (1, 1),
    'token_pattern': r'\b[a-zA-Z]{3,}\b',
}
tfdif_vect = TfidfVectorizer(**tfidf_options)

# spaCy cleaning restricted to nouns and adjectives, then TF-IDF.
pipeline = NLPPipe(
    vectorizer=tfdif_vect,
    tokenizer=None,
    disable=["parser", "ner"],
    pos=['NOUN', 'ADJ'],
    lemma=False,
)

# Dense document-term matrix used by the cells that follow.
word_vect = pipeline.fit_transform(corpus).toarray()

In [10]:
# Wrap the dense TF-IDF matrix in a DataFrame whose columns are the vocabulary terms.
# Recent scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); use whichever one the installed version provides.
_vectorizer = pipeline.vectorizer
_feature_names_fn = (
    getattr(_vectorizer, "get_feature_names_out", None)
    or _vectorizer.get_feature_names
)
df = pd.DataFrame(word_vect)
df.columns = _feature_names_fn()
df

Unnamed: 0,able,abnormal,absent,absolute,abstract,absurd,abused,abusive,abysmal,academic,...,worthwhile,worthy,wrong,yearly,yell,yellow,young,younger,youngest,youtube
0,0.124983,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.620704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29244,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29245,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29246,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29247,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px

In [12]:
# Latent semantic analysis: 10-component truncated SVD of the document-term matrix.
lsa = TruncatedSVD(n_components=10, random_state=0)
topic_mat = lsa.fit_transform(word_vect)

# Share of variance captured by each component.
lsa.explained_variance_ratio_

array([0.01140351, 0.01415294, 0.01285447, 0.01261209, 0.01184448,
       0.01148517, 0.01065276, 0.00993871, 0.00931763, 0.00914578])

In [13]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every row in ``model.components_``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one row of weights per topic).
    feature_names : sequence of str
        Term for each column of ``components_``.
    no_top_words : int
        Number of terms to print per topic, highest weight first.
    topic_names : sequence of str, optional
        Label per topic; a missing or empty label falls back to the topic index.
    """
    for ix, topic in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)
        # Column indices ordered from largest weight to smallest.
        ranked = topic.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[i] for i in ranked))

In [14]:
# display_topics(lsa, pipeline.vectorizer.get_feature_names(), 20)

In [15]:
def plot_topics(topic_model, topic, num_words=20, feature_names=None):
    """Show a horizontal bar chart of the top-weighted terms for one topic.

    Parameters
    ----------
    topic_model : object
        Fitted model exposing ``components_`` (one row of weights per topic).
    topic : int
        Row of ``components_`` to plot.
    num_words : int, default 20
        Number of highest-weighted terms to include.
    feature_names : sequence of str, optional
        Term for each column of ``components_``. Defaults to the vocabulary of
        the module-level ``pipeline.vectorizer``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if feature_names is None:
        vectorizer = pipeline.vectorizer
        # Recent scikit-learn releases dropped get_feature_names() in favour of
        # get_feature_names_out(); use whichever is available.
        names_fn = (
            getattr(vectorizer, "get_feature_names_out", None)
            or vectorizer.get_feature_names
        )
        feature_names = names_fn()

    # Only the requested topic's weights are needed, indexed by term.
    weights = pd.Series(topic_model.components_[int(topic)],
                        index=list(feature_names))
    top_words = weights.sort_values(ascending=False).iloc[:num_words]

    fig = px.bar(y=top_words.index,
                 x=top_words.values,
                 orientation='h',
                 width=500,
                 height=1000)
    fig.show()

In [18]:
# Bar chart of the 40 highest-weighted terms for LSA component 1.
plot_topics(lsa, topic=1, num_words=40)

In [17]:
# Non-negative matrix factorisation with 10 topics on the same document-term matrix.
# NOTE(review): the `alpha` argument may not be accepted by newer scikit-learn
# releases — confirm against the installed version.
nmf = NMF(n_components=10, alpha=0.2, l1_ratio=0.5, random_state=0)
doc_topic = nmf.fit_transform(word_vect)

In [24]:

# Bar chart of the 50 highest-weighted terms for NMF topic 4.
# Change `topic` (0-9 for the 10-component model) to inspect the other topics.
plot_topics(nmf, topic=4, num_words=50)

In [25]:
# Fit a 20-topic LDA model on the same document-term matrix used for LSA and NMF.
# NOTE(review): word_vect is produced through a TfidfVectorizer; LDA is normally fit
# on raw term counts — confirm that TF-IDF weights are intended here.
lda = LatentDirichletAllocation(n_components=20, random_state=0)
lda.fit(word_vect)

LatentDirichletAllocation(n_components=20, random_state=0)

In [None]:
# pyLDAvis is imported in this cell rather than with the notebook's other imports.
# NOTE(review): newer pyLDAvis releases may expose this module under a different
# name than `pyLDAvis.sklearn` — confirm against the installed version.
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# Build the visualisation from the fitted LDA model, the dense document-term
# matrix wrapped in np.matrix, and the pipeline's vectorizer.
pyLDAvis.sklearn.prepare(lda, np.matrix(word_vect), pipeline.vectorizer, mds='mmds')