## Import packages and set up the environment

In [None]:
# Imports and one-time global setup shared by every cell below.
import os
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import Phrases
from gensim.test.utils import datapath
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import; no porter-specific name is referenced in the
# cells visible here, so it may be removable -- confirm before deleting.
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator (presumably so sampling and model
# fitting are repeatable -- confirm the downstream libraries honour it).
np.random.seed(2018)
import nltk
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
# Fetch the WordNet corpus through NLTK's downloader; WordNetLemmatizer is
# used by lemmatize_stemming() further down.
nltk.download('wordnet')
# Module-level English Snowball stemmer, read by lemmatize_stemming().
stemmer = SnowballStemmer(language='english')

## Extract out documents from Pre-Pandemic Data and Pre-Process

Procedure adapted from: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [None]:
# Draw a fixed-size sample of posts from each pre- or mid-pandemic mental
# health subreddit file and stack them into one `documents` frame. Later
# cells stem/lemmatize these posts, build a dictionary and bag-of-words
# corpus from them, and fit the LDA model.

data_folder = './data/input/reddit_mental_health_dataset'
output_path = "./data/output/"

# True -> sample the "*_pre_*" files; False -> sample the "*_post_*" files.
pre_pandemic = True

# Subreddits sampled in the loop below (r/healthanxiety is always read first).
sampled_subreddits = ['suicidewatch', 'depression', 'ptsd', 'anxiety',
                      'socialanxiety', 'bipolarreddit', 'bpd', 'schizophrenia',
                      'EDAnonymous', 'alcoholism', 'addiction', 'adhd',
                      'autism', 'lonely']
pre_files = ['{}_pre_features_tfidf_256.csv'.format(name) for name in sampled_subreddits]
mid_files = ['{}_post_features_tfidf_256.csv'.format(name) for name in sampled_subreddits]

# Number of posts drawn from every file, per period.
pre_sample = 2700
mid_sample = 1300

if pre_pandemic:
    sample, files = pre_sample, pre_files
    health_anxiety_file = 'healthanxiety_pre_features_tfidf_256.csv'
else:
    sample, files = mid_sample, mid_files
    health_anxiety_file = 'healthanxiety_post_features_tfidf_256.csv'

# Optional window for building the mid-pandemic model on the acute phase of
# the pandemic only; to use it, uncomment the three date lines in the loop.
beg_date = datetime.datetime(2020, 3, 16)
end_date = datetime.datetime(2020, 4, 20)

# Sample each file in turn (health anxiety first), keeping only the post text.
sampled_posts = []
for file in [health_anxiety_file] + files:
    data = pd.read_csv(os.path.join(data_folder, file))
    data = data.sample(sample)
    # data['date'] = pd.to_datetime(data['date'], format="%Y/%m/%d")
    # data = data.loc[(data['date'] >= beg_date)]
    # data =  data.loc[(data['date'] < end_date)]
    sampled_posts.append(data.loc[:, ['post']])

# One frame with a fresh 0..N-1 index, mirrored into an explicit 'index' column.
data_text = pd.concat(sampled_posts, axis=0, ignore_index=True)
data_text['index'] = data_text.index
documents = data_text

In [None]:
# Sanity check: report the (rows, columns) shape of the sampled documents frame.
print(documents.shape)
# To eyeball a single raw post, uncomment:
# print(documents.iloc[20][0])

In [None]:
# Token- and post-level normalisation helpers (lemmatize, then stem)
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    Uses the module-level ``stemmer`` for the stemming step and returns
    whatever ``stemmer.stem`` produces for the lemma.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)
def preprocess(text):
    """Tokenize ``text`` and return its normalised tokens as a list.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is dropped
    when it is a gensim stopword or has three characters or fewer; every
    surviving token is passed through ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if not (token in gensim.parsing.preprocessing.STOPWORDS or len(token) <= 3)
    ]

In [None]:
# Run every sampled post through preprocess() to get one token list per post
raw_posts = documents['post']
processed_docs = raw_posts.map(preprocess)

In [None]:
# Detect bigrams (pairs seen at least 20 times) and append them to each
# document's token list. Only one Phrases pass is run, so the '_'-joined
# tokens added here are bigrams.
# NOTE: the token lists are extended in place, so re-running this cell adds
# the bigrams a second time.
bigram = Phrases(processed_docs, min_count=20)
for doc_tokens in processed_docs:
    # Build the list of phrase tokens first, then extend the document with it.
    doc_tokens.extend([token for token in bigram[doc_tokens] if '_' in token])

In [None]:
# Build the token dictionary (unigrams plus the appended bigrams) from the
# processed documents, preview a few entries, then prune it.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Preview the first eleven (id, token) pairs.
for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
    print(k, v)
    if count > 10:
        break

# Drop tokens that occur in more than half of the documents and keep at most
# the 100k most frequent of the rest.
# NOTE(review): gensim documents no_below as an absolute document count, so
# 0.001 appears to disable the lower bound -- confirm whether a fraction of
# the corpus was intended.
dictionary.filter_extremes(no_below=0.001, no_above=0.5, keep_n=100000)


In [None]:
# Convert each processed document to the dictionary's bag-of-words form.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))


## Create and Save LDA Model

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus, then list its topics
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=25,
    workers=3,
)

for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

In [None]:
# Inspect one sample document: print its processed tokens, then the topics
# the model assigns to that same document, highest score first.
# The original printed document 5000 but scored document 3, so the tokens and
# the topic scores described different posts; one index is now used for both.
sample_doc_idx = 5000
print(processed_docs[sample_doc_idx])

sample_doc_topics = sorted(lda_model[bow_corpus[sample_doc_idx]],
                           key=lambda tup: tup[1], reverse=True)
for index, score in sample_doc_topics:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

In [None]:
# Location of the saved LDA model and its dictionary.
lda_output_path = os.path.join(output_path, 'lda_models')
model_name = "model_pre_10"
# Join directory and model name as separate path components. The original
# concatenated them without a separator and wrapped the result in gensim's
# test-data helper, which pointed at a different location than the dictionary
# loaded below.
model_path_name = os.path.join(lda_output_path, model_name)

# Save model to disk.
# lda_model.save(model_path_name)

# Load a pretrained model from disk. This replaces the lda_model and
# dictionary created in the cells above.
lda_model = gensim.models.LdaMulticore.load(model_path_name)
dictionary = gensim.corpora.Dictionary.load(os.path.join(lda_output_path, "{}.id2word".format(model_name)))

In [None]:
# List every topic of the model currently held in lda_model
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

## Apply LDA Model to all data

Use LDA model created above - apply to all data and create a heatmap.

In [None]:
# Human-readable labels for the LDA topics, in topic-id order. Which list
# applies depends on the model that has been loaded.
pre_topics = [
    'Autism/ADHD + \nSchool/Work',   # topic 0
    'Alcohol/Addiction',             # topic 1
    'Sleep Issues',                  # topic 2
    'Alcohol/Eating Disorders',      # topic 3
    'Social Interaction',            # topic 4
    'Schizophrenia',                 # topic 5
    'Medical/Medication',            # topic 6
    'Health Anxiety',                # topic 7
    'Mental Health Help',            # topic 8
    'Life',                          # topic 9
]

mid_topics = [
    'Health Anxiety',                # topic 0
    'Autism/Social',                 # topic 1
    'ADHD/Diagnosis',                # topic 2
    'Work/School/Home',              # topic 3
    'Eating Disorder',               # topic 4
    'Alcohol/Addiction',             # topic 5
    'Family',                        # topic 6
    'Sleep Issues',                  # topic 7
    'Social/Life',                   # topic 8
    'Mental Health/PTSD',            # topic 9
]

# Directory that receives the topic-distribution CSVs and heatmap images.
distribution_output_path = os.path.join(output_path, 'lda_distribution')

# Number of topics in the model; sizes the score vectors and frame columns.
num_topics = 10

Create heatmap and run LDA model on all pre-pandemic mental health posts

In [None]:
# Score every pre-pandemic mental-health subreddit file with the loaded LDA
# model, save the mean topic distribution per file, and draw it as a heatmap.
subreddits = ['suicidewatch', 'depression', 'ptsd', 'anxiety', 'socialanxiety',
              'healthanxiety', 'bipolarreddit', 'bpd', 'schizophrenia',
              'EDAnonymous', 'alcoholism', 'addiction', 'adhd', 'autism',
              'lonely', 'mentalhealth']
files = ['{}_pre_features_tfidf_256.csv'.format(name) for name in subreddits]

pre_mental_df = pd.DataFrame(columns=range(num_topics))

for file in files:
    df = pd.read_csv(os.path.join(data_folder, file))
    bow_posts = [dictionary.doc2bow(preprocess(text)) for text in df.post]
    # Sum the per-post topic scores, then average over the number of posts.
    topic_totals = np.zeros(shape=num_topics)
    for bow in bow_posts:
        for topic_id, weight in lda_model[bow]:
            topic_totals[topic_id] += weight
    topic_totals = topic_totals / len(bow_posts)
    # One row per file, labelled with the file name minus its extension.
    pre_mental_df.loc[os.path.splitext(file)[0]] = topic_totals.tolist()

pre_mental_df.to_csv(os.path.join(distribution_output_path, 'pre_mentalhealth_distribution.csv'))

plt.figure(figsize=(12, 9))
# Row labels: the part of each file name before the first underscore.
ylabels = [fname.split('_')[0] for fname in files]
sns.set(style='white', font_scale=1, palette=sns.color_palette("husl",15))
chart = sns.heatmap(
    pre_mental_df,
    vmin=0,
    vmax=0.75,
    annot=True,
    xticklabels=pre_topics,
    yticklabels=ylabels,
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45,
                      horizontalalignment='right', fontsize=15)
chart.set_yticklabels(chart.get_yticklabels(), fontsize=15)
plt.title('Pre-Pandemic Mental Health LDA')
plt.xlabel('Topics', fontsize=15)
plt.ylabel('Subreddit', fontsize=15)
plt.savefig(os.path.join(distribution_output_path, 'pre_mentalhealth.png'),
            format='png', dpi=400, bbox_inches='tight')

Create heatmap and run LDA model on all mid-pandemic mental health posts

In [None]:
# Score every mid-pandemic mental-health subreddit file (posts dated on or
# after beg_date) with the loaded LDA model, save the mean topic distribution
# per file, and draw it as a heatmap.
subreddits = ['suicidewatch', 'depression', 'ptsd', 'anxiety', 'socialanxiety',
              'healthanxiety', 'bipolarreddit', 'bpd', 'schizophrenia',
              'EDAnonymous', 'alcoholism', 'addiction', 'adhd', 'autism',
              'lonely', 'mentalhealth']
files = ['{}_post_features_tfidf_256.csv'.format(name) for name in subreddits]

mid_mental_df = pd.DataFrame(columns=range(num_topics))
beg_date = datetime.datetime(2020, 3, 16)

for file in files:
    df = pd.read_csv(os.path.join(data_folder, file))
    df['date'] = pd.to_datetime(df['date'], format="%Y/%m/%d")
    # Keep only posts from beg_date onwards.
    df_mid = df.loc[df['date'] >= beg_date]
    bow_posts = [dictionary.doc2bow(preprocess(text)) for text in df_mid.post]
    # Sum the per-post topic scores, then average over the number of posts.
    topic_totals = np.zeros(shape=num_topics)
    for bow in bow_posts:
        for topic_id, weight in lda_model[bow]:
            topic_totals[topic_id] += weight
    topic_totals = topic_totals / len(bow_posts)
    # One row per file, labelled with the file name minus its extension.
    mid_mental_df.loc[os.path.splitext(file)[0]] = topic_totals.tolist()

mid_mental_df.to_csv(os.path.join(distribution_output_path, 'mid_mentalhealth_distribution.csv'))

plt.figure(figsize=(12, 9))
# Row labels: the part of each file name before the first underscore.
ylabels = [fname.split('_')[0] for fname in files]
sns.set(style='white', font_scale=1, palette=sns.color_palette("husl",15))
chart = sns.heatmap(
    mid_mental_df,
    vmin=0,
    vmax=0.75,
    annot=True,
    xticklabels=pre_topics,
    yticklabels=ylabels,
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45,
                      horizontalalignment='right', fontsize=15)
chart.set_yticklabels(chart.get_yticklabels(), fontsize=15)
plt.title('Mid-Pandemic Mental Health LDA')
plt.xlabel('Topics', fontsize=15)
plt.ylabel('Subreddit', fontsize=15)
plt.savefig(os.path.join(distribution_output_path, 'mid_mentalhealth.png'),
            format='png', dpi=400, bbox_inches='tight')

Create heatmap and run LDA model on all pre-pandemic non-mental health posts

In [None]:
# Score every pre-pandemic non-mental-health subreddit file with the loaded
# LDA model, save the mean topic distribution per file, and draw a heatmap.
subreddits = ['meditation', 'personalfinance', 'teaching', 'relationships',
              'legaladvice', 'fitness', 'parenting', 'divorce', 'conspiracy',
              'guns', 'jokes']
files = ['{}_pre_features_tfidf_256.csv'.format(name) for name in subreddits]

pre_nonmentalhealth_df = pd.DataFrame(columns=range(num_topics))

for file in files:
    df = pd.read_csv(os.path.join(data_folder, file))
    bow_posts = [dictionary.doc2bow(preprocess(text)) for text in df.post]
    # Sum the per-post topic scores, then average over the number of posts.
    topic_totals = np.zeros(shape=num_topics)
    for bow in bow_posts:
        for topic_id, weight in lda_model[bow]:
            topic_totals[topic_id] += weight
    topic_totals = topic_totals / len(bow_posts)
    # One row per file, labelled with the file name minus its extension.
    pre_nonmentalhealth_df.loc[os.path.splitext(file)[0]] = topic_totals.tolist()

pre_nonmentalhealth_df.to_csv(os.path.join(distribution_output_path, 'pre_nonmentalhealth_distribution.csv'))

plt.figure(figsize=(12, 9))
# Row labels: the part of each file name before the first underscore.
ylabels = [fname.split('_')[0] for fname in files]
sns.set(style='white', font_scale=1, palette=sns.color_palette("husl",15))
chart = sns.heatmap(
    pre_nonmentalhealth_df,
    vmin=0,
    vmax=0.75,
    annot=True,
    xticklabels=pre_topics,
    yticklabels=ylabels,
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45,
                      horizontalalignment='right', fontsize=15)
chart.set_yticklabels(chart.get_yticklabels(), fontsize=15)
plt.title('Pre-Pandemic Non-Mental Health LDA')
plt.xlabel('Topics', fontsize=15)
plt.ylabel('Subreddit', fontsize=15)
plt.savefig(os.path.join(distribution_output_path, 'pre_nonmentalhealth.png'),
            format='png', dpi=400, bbox_inches='tight')

Create heatmap and run LDA model on all mid-pandemic non-mental health (control) posts

In [None]:
# Score every mid-pandemic non-mental-health subreddit file with the loaded
# LDA model, save the mean topic distribution per file, and draw a heatmap.
files = ['meditation_post_features_tfidf_256.csv',
         'personalfinance_post_features_tfidf_256.csv',
         'teaching_post_features_tfidf_256.csv',
         'relationships_post_features_tfidf_256.csv',
         'legaladvice_post_features_tfidf_256.csv',
         'fitness_post_features_tfidf_256.csv',
         'parenting_post_features_tfidf_256.csv',
         'divorce_post_features_tfidf_256.csv',
         'conspiracy_post_features_tfidf_256.csv',
         'guns_post_features_tfidf_256.csv',
         'jokes_post_features_tfidf_256.csv']

mid_nonmentalhealth_df = pd.DataFrame(columns=range(num_topics))
# Despite its name, end_date is the lower bound here: only posts dated on or
# after it are scored.
end_date = datetime.datetime(2020, 3, 16)

for file in files:
  df = pd.read_csv(os.path.join(data_folder, file))
  df['date'] = pd.to_datetime(df['date'], format="%Y/%m/%d")
  df_mid = df.loc[df['date'] >= end_date]
  posts = df_mid.post
  posts = [dictionary.doc2bow(preprocess(post)) for post in posts]
  # Sum the per-post topic scores, then average over the number of posts.
  scores = np.zeros(shape=num_topics)
  for post in posts:
    topics = lda_model[post]
    for index, score in topics:
      scores[index] += score
  scores /= len(posts)
  # One row per file, labelled with the file name minus its extension.
  mid_nonmentalhealth_df.loc[os.path.splitext(file)[0]] = scores.tolist()

mid_nonmentalhealth_df.to_csv(os.path.join(distribution_output_path, 'mid_nonmentalhealth_distribution.csv'))


plt.figure(figsize=(12, 9))
# Row labels: the part of each file name before the first underscore.
ylabels = [file.split('_')[0] for file in files]
sns.set(style='white', font_scale=1, palette=sns.color_palette("husl",15))
chart = sns.heatmap(mid_nonmentalhealth_df, vmin=0, vmax=0.75, annot=True,
                    xticklabels=pre_topics, yticklabels=ylabels)
chart.set_xticklabels(
    chart.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    fontsize=15
)
chart.set_yticklabels(
    chart.get_yticklabels(),
    fontsize=15
)
plt.title('Mid-Pandemic Non-Mental Health LDA')
plt.xlabel('Topics', fontsize=15)
plt.ylabel('Subreddit', fontsize=15)
# Write to the mid-pandemic file name; the original reused
# 'pre_nonmentalhealth.png' and overwrote the pre-pandemic figure.
plt.savefig(os.path.join(distribution_output_path, 'mid_nonmentalhealth.png'),
            format='png', dpi=400, bbox_inches='tight')

Create heatmap from LDA of pre-pandemic posts on all COVID19_Support posts to determine distribution of topics.

In [None]:
# Score the COVID19_support posts with the loaded (pre-pandemic) LDA model,
# save the mean topic distribution, and draw it as a one-column heatmap.
files = ['COVID19_support_post_features_tfidf_256.csv']

covid_topic_df = pd.DataFrame(columns=range(num_topics))
# Despite its name, end_date is the lower bound here: only posts dated on or
# after it are scored.
end_date = datetime.datetime(2020, 3, 16)

for file in files:
  df = pd.read_csv(os.path.join(data_folder, file))
  df['date'] = pd.to_datetime(df['date'], format="%Y/%m/%d")
  df_mid = df.loc[df['date'] >= end_date]
  posts = df_mid.post
  posts = [dictionary.doc2bow(preprocess(post)) for post in posts]
  # Sum the per-post topic scores, then average over the number of posts.
  scores = np.zeros(shape=num_topics)
  for post in posts:
    topics = lda_model[post]
    for index, score in topics:
      scores[index] += score
  scores /= len(posts)
  # One row per file, labelled with the file name minus its extension.
  covid_topic_df.loc[os.path.splitext(file)[0]] = scores.tolist()

covid_topic_df.to_csv(os.path.join(distribution_output_path, 'covid_distribution_pre_model.csv'))

# Transpose so topics run down the rows and the subreddit is the only column.
covid_topic_df_transpose = covid_topic_df.transpose()
fig, ax = plt.subplots()

xlabels = ['COVID19_support']
sns.set(style='white', font_scale=1, palette=sns.color_palette("husl",15))
# After the transpose the rows are topics and the column is the subreddit, so
# the column label is xlabels and the row labels are the pre-model topic
# names. The original passed an undefined `pre_labels` and a stale `ylabels`
# list left over from an earlier cell.
svm = sns.heatmap(covid_topic_df_transpose, vmin=0, vmax=0.6, annot=True,
            cbar=False, xticklabels=xlabels, yticklabels=pre_topics, square=True, ax=ax)
plt.ylabel('Topics')

plt.savefig(os.path.join(distribution_output_path, 'covid19_pre_model.png'),
            format='png', dpi=400, bbox_inches='tight')

Create heatmap from LDA of mid-pandemic posts on all COVID19_Support posts to determine distribution of topics. Requires loading the mid-pandemic model into `lda_model` and `dictionary` first.

In [None]:
# Score the COVID19_support posts with the loaded LDA model (the mid-pandemic
# model must have been loaded into lda_model and dictionary beforehand), save
# the mean topic distribution, and draw it as a one-column heatmap.
files = ['COVID19_support_post_features_tfidf_256.csv']

covid_topic_df = pd.DataFrame(columns=range(num_topics))
# Despite its name, end_date is the lower bound here: only posts dated on or
# after it are scored.
end_date = datetime.datetime(2020, 3, 16)

for file in files:
  df = pd.read_csv(os.path.join(data_folder, file))
  df['date'] = pd.to_datetime(df['date'], format="%Y/%m/%d")
  df_mid = df.loc[df['date'] >= end_date]
  posts = df_mid.post
  posts = [dictionary.doc2bow(preprocess(post)) for post in posts]
  # Sum the per-post topic scores, then average over the number of posts.
  scores = np.zeros(shape=num_topics)
  for post in posts:
    topics = lda_model[post]
    for index, score in topics:
      scores[index] += score
  scores /= len(posts)
  # One row per file, labelled with the file name minus its extension.
  covid_topic_df.loc[os.path.splitext(file)[0]] = scores.tolist()

covid_topic_df.to_csv(os.path.join(distribution_output_path, 'covid_distribution_mid_model.csv'))

# Transpose so topics run down the rows and the subreddit is the only column.
covid_topic_df_transpose = covid_topic_df.transpose()
fig, ax = plt.subplots()

xlabels = ['COVID19_support']
sns.set(style='white', font_scale=1, palette=sns.color_palette("husl",15))
# After the transpose the rows are topics and the column is the subreddit, so
# the column label is xlabels and the row labels are the mid-model topic
# names. The original passed an undefined `pre_labels` and a stale `ylabels`
# list left over from an earlier cell.
svm = sns.heatmap(covid_topic_df_transpose, vmin=0, vmax=0.6, annot=True,
            cbar=False, xticklabels=xlabels, yticklabels=mid_topics, square=True, ax=ax)
plt.ylabel('Topics')

plt.savefig(os.path.join(distribution_output_path, 'covid19_mid_model.png'),
            format='png', dpi=400, bbox_inches='tight')

Run Significance Tests

In [None]:
# Stack the pre-pandemic mental-health and non-mental-health topic
# distributions into one frame and save it.
pre_combined_df = pd.concat([pre_mental_df, pre_nonmentalhealth_df])
# The original called .to_csv on an undefined name `pre_combined`.
pre_combined_df.to_csv(os.path.join(distribution_output_path, 'pre_distribution.csv'))

In [None]:
# Stack the mid-pandemic mental-health and non-mental-health topic
# distributions into one frame and save it.
mid_frames = [mid_mental_df, mid_nonmentalhealth_df]
mid_combined_df = pd.concat(mid_frames)
mid_distribution_csv = os.path.join(distribution_output_path, 'mid_distribution.csv')
mid_combined_df.to_csv(mid_distribution_csv)

In [None]:
import scipy.stats

# For each of the ten topic columns, run a Wilcoxon test comparing the
# pre-pandemic and mid-pandemic values and print the result.
for topic_col in range(10):
    test_result = scipy.stats.wilcoxon(pre_combined_df[topic_col], mid_combined_df[topic_col])
    print(test_result)