# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

In [83]:
!pip install happiestfuntokenizing




In [84]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer
import spacy
import gc
from itertools import chain
from collections import Counter
import string
import itertools

from google.colab import drive
drive.mount('/content/drive')

FILEPATH = 'drive/MyDrive/comp_ling'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preprocessing

In [85]:
def load(filename):
  """Load pickles. FILEPATH defined above. filename is the reddit depression data pkl"""
  data = pd.read_pickle(f'{FILEPATH}/{filename}.pkl')
  return data

In [86]:
def dataset_generation(data, subreddit_category_mapping):
  """
  Input:
  raw data and a mapping from subreddit to symptom category
  defined by Liu et al.

  Function:
  Creates control dataset based on Gkotsis et al. (2017) framework
  Adds 'Label' and 'Category' features to Dataframe

  Returns:
  Dataframe with added columns
  """

  # Generate control dataset
  # Create a dictionary where keys are authors and values are the first time (created_utc) they made a post in any of the depression subreddits
  author_date = data[data['subreddit'].isin(depression_subreddits)].groupby('author')['created_utc'].min().to_dict()

  # convert 180 days to utc
  seconds_in_180_days = 180 * 24 * 60 * 60

  # Create "Label" feature that assigns value of 1 to posts in a depression category and 0 to posts in the control category and Nan if neither
  # define control categoy as all posts from non-depression subreddits that have a created_utc at least 180 days before the date listed in author-date pairs
  data['Label'] = np.where(
      (~data['subreddit'].isin(depression_subreddits)) &
      (data['author'].isin(author_date)) &
      (data['created_utc'] <= data['author'].map(author_date) - seconds_in_180_days),
      0,  # Label as 0 if the condition is met
      np.where(
          data['subreddit'].isin(depression_subreddits),
          1,  # Label as 1 if subreddit is in depression_subreddits
          np.nan  # Label as NaN if neither condition is met
      )
  )

  # Create "Category" feature that labels "Control" for all control posts and the
  # depression symptom (eg. "Anger") for the deppression-labelled posts based on their subreddits
  data['Category'] = np.where(data['Label'] == 0, 'Control', data['subreddit'].map(subreddit_category_mapping))

  return data

In [87]:
# List of depression subreddits in the paper
depression_subreddits = ["Anger",
    "anhedonia", "DeadBedrooms",
    "Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack",
    "DecisionMaking", "shouldi",
    "bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous",
    "chronicfatigue", "Fatigue",
    "ForeverAlone", "lonely",
    "cry", "grief", "sad", "Sadness",
    "AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou",
    "insomnia", "sleep",
    "cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus",
    "AdultSelfHarm", "selfharm", "SuicideWatch",
    "Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"
]

# depression symptoms; commented out symptoms not included in analysis
depression_symptoms = ["Anger",
                         "Anhedonia",
                         "Anxiety",
                         #"Concentration deficit",
                         "Disordered eating",
                         #"Fatigue",
                         "Loneliness",
                         "Sad mood",
                         "Self-loathing",
                         "Sleep problem",
                         "Somatic complaint",
                         #"Suicidal thoughts and attempts",
                         "Worthlessness"]

# Dictionary of subreddit to category
subreddit_category_mapping = {
    "Anger": "Anger",
    "anhedonia": "Anhedonia",
    "DeadBedrooms": "Anhedonia",
    "AnxietyDepression": "Anxiety",
    "Anxiety": "Anxiety",
    "HealthAnxiety": "Anxiety",
    "PanicAttack": "Anxiety",
    "DecisionMaking": "Concentration deficit",
    "shouldi": "Concentration deficit",
    "bingeeating": "Disordered eating",
    "BingeEatingDisorder": "Disordered eating",
    "EatingDisorders": "Disordered eating",
    "eating_disorders": "Disordered eating",
    "EDAnonymous": "Disordered eating",
    "chronicfatigue": "Fatigue",
    "Fatigue": "Fatigue",
    "ForeverAlone": "Loneliness",
    "lonely": "Loneliness",
    "cry": "Sad mood",
    "grief": "Sad mood",
    "sad": "Sad mood",
    "Sadness": "Sad mood",
    "AvPD": "Self-loathing",
    "SelfHate": "Self-loathing",
    "selfhelp": "Self-loathing",
    "socialanxiety": "Self-loathing",
    "whatsbotheringyou": "Self-loathing",
    "insomnia": "Sleep problem",
    "sleep": "Sleep problem",
    "cfs": "Somatic complaint",
    "ChronicPain": "Somatic complaint",
    "Constipation": "Somatic complaint",
    "EssentialTremor": "Somatic complaint",
    "headaches": "Somatic complaint",
    "ibs": "Somatic complaint",
    "tinnitus": "Somatic complaint",
    "AdultSelfHarm": "Suicidal thoughts and attempts",
    "selfharm": "Suicidal thoughts and attempts",
    "SuicideWatch": "Suicidal thoughts and attempts",
    "Guilt": "Worthlessness",
    "Pessimism": "Worthlessness",
    "selfhelp": "Worthlessness",
    "whatsbotheringyou": "Worthlessness"
}


In [88]:
# Tokenization and Preprocessing function
def tokenize(symptom_data, control_data):
  """
  Input:
  Posts from a specified symptom (or all symptoms) and control posts

  Function:
  Tokenize all symptom and control posts using happiestfuntokenizer

  Returns:
  Tokenized symptom and control posts
  """

  # preserve_keywords=True because reddit posts are likely to have lots of keywords
  #   like hashtags and @[person], and I think it's best to keep those together
  tokenizer = Tokenizer(preserve_keywords=True)

  # Tokenizing posts from both symptom and control datasets
  symptom_tokens = [tokenizer.tokenize(post) for post in symptom_data]
  control_tokens = [tokenizer.tokenize(post) for post in control_data]

  return symptom_tokens, control_tokens

In [89]:
def stop_words(symptom_tokens, control_tokens):
  """
  Input:
  Tokenized symptom and control posts

  Function:
  Find and remove the top 100 most frequent words

  Returns:
  Filtered tokenizes symptom and control posts
  """
  # Find top 100 most common words in control tokens
  word_counter_list = list(chain.from_iterable(control_tokens))
  common_words = {word for word, _ in Counter(word_counter_list).most_common(100)}

  # Filter out common words from the depression and control datasets
  symptom_tokens_filtered = [
      [token for token in post if token not in common_words] for post in symptom_tokens
  ]
  control_tokens_filtered = [
      [token for token in post if token not in common_words] for post in control_tokens
  ]

  return symptom_tokens_filtered, control_tokens_filtered

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

In [90]:
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary

def train_lda_on_full_data(symptom_data, control_data):
  """
  Input:
  Raw symptom and control posts

  Function:
  Train LDA model on the full dataset (control + depression)

  Returns:
  LDA model, total dictionary, total corpus
  """

  # Tokenize the symptom and control datasets
  symptom_tokens, control_tokens = tokenize(symptom_data, control_data)

  # Remove stopwords from both datasets
  symptom_tokens_filtered, control_tokens_filtered = stop_words(symptom_tokens, control_tokens)

  # Combine the tokenized data (symptom + control)
  combined_tokens = symptom_tokens_filtered + control_tokens_filtered

  # Create a Gensim dictionary and corpus
  dictionary = Dictionary(combined_tokens)
  corpus = [dictionary.doc2bow(text) for text in combined_tokens]

  # Train the LDA model
  lda_model = LdaMulticore(corpus, num_topics=200, id2word=dictionary)

  # save LDA model, dictionary, corpus
  pickle.dump(lda_model, open(f'{FILEPATH}/lda_model.pkl', 'wb'))
  pickle.dump(dictionary, open(f'{FILEPATH}/lda_dictionary.pkl', 'wb'))
  pickle.dump(corpus, open(f'{FILEPATH}/lda_corpus.pkl', 'wb'))

  return lda_model, dictionary, corpus

## Zoom in on Dictionary and Corpus

In [91]:
# Initialize inputs
data = load("reddit_depression_data")
data1 = dataset_generation(data, subreddit_category_mapping)
symptom_posts = data1[data1['Category'] == "Anger"]['text'] # Using Anger as an example
control_posts = data1[data1['Category'] == "Control"]['text']
symptom_tokens, control_tokens = tokenize(symptom_posts, control_posts)
symptom_tokens_filtered, control_tokens_filtered = stop_words(symptom_tokens, control_tokens)
combined_tokens = symptom_tokens_filtered + control_tokens_filtered

# Create Gensim dictionary
dictionary = Dictionary(combined_tokens)
print(dictionary)


Dictionary<24106 unique tokens: ['advice', 'after', 'again', 'always', 'anger']...>


In [92]:
len(dictionary)

24106

In [93]:
type(dictionary)

In [94]:
dictionary[0]

'advice'

In [95]:
dictionary[2404]

'pocket'

In [96]:
for token_id, token in dictionary.items():
  if token_id == 10:
    break
  print(f"ID: {token_id}, Token: {token}")


ID: 0, Token: advice
ID: 1, Token: after
ID: 2, Token: again
ID: 3, Token: always
ID: 4, Token: anger
ID: 5, Token: angry
ID: 6, Token: anyone
ID: 7, Token: avoid
ID: 8, Token: barking
ID: 9, Token: becoming


In [97]:
example_combined_tokens = [
    ['anger', 'feel', 'always'],
    ['after', 'anger', 'advice', 'advice']
]

example_dictionary = Dictionary(example_combined_tokens)
print(example_dictionary)

Dictionary<5 unique tokens: ['always', 'anger', 'feel', 'advice', 'after']>


In [98]:
# Create corpus from dictionary
corpus = [dictionary.doc2bow(text) for text in combined_tokens]


In [99]:
type(corpus)

list

In [100]:
print(corpus[:10])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 3), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 3), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 2), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 2), (41, 2), (42, 3), (43, 1), (44, 1), (45, 2), (46, 3)], [(4, 1), (5, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)], [(58, 1), (59, 1), (60, 1), (61, 1), (62, 1)], [(0, 1), (33, 1), (60, 1), (63, 1), (64, 1), (65, 2), (66, 1), (67, 3), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 2), (77, 1), (78, 1), (79, 1), (80, 2), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 2), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 2), (98, 1), (99, 1), (100, 1), (101, 1)], [(61, 1), (102, 1), (103, 1), (104, 1)]

In [101]:
len(corpus)

4924

In [102]:
len(symptom_posts), len(control_posts)

(555, 4369)

In [103]:
len(symptom_posts) + len(control_posts)

4924

In [104]:
corpus[0]

[(0, 2),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 3),
 (5, 2),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 2),
 (15, 1),
 (16, 3),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 2),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 2),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 2),
 (40, 2),
 (41, 2),
 (42, 3),
 (43, 1),
 (44, 1),
 (45, 2),
 (46, 3)]

In [105]:
corpus[1]

[(4, 1),
 (5, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1)]

In [106]:
example_combined_tokens = [
    ['anger', 'feel', 'always'],
    ['after', 'anger', 'advice', 'advice']
]

example_dictionary = Dictionary(example_combined_tokens)

example_corpus = [example_dictionary.doc2bow(text) for text in example_combined_tokens]
print(example_dictionary)
print(example_corpus)

Dictionary<5 unique tokens: ['always', 'anger', 'feel', 'advice', 'after']>
[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 2), (4, 1)]]


In [107]:
for token_id, token in example_dictionary.items():
    print(f"ID: {token_id}, Token: {token}")


ID: 0, Token: always
ID: 1, Token: anger
ID: 2, Token: feel
ID: 3, Token: advice
ID: 4, Token: after


In [108]:
print(example_corpus[0])

[(0, 1), (1, 1), (2, 1)]


In [109]:
print(example_corpus[1])

[(1, 1), (3, 2), (4, 1)]


## End code discussion

In [110]:
def get_topic_distributions_for_posts(lda_model, corpus):
  """
  Input:
  Trained LDA model, corpus of specific symptom + all control posts

  Function:
  Gets the topic distributions for each symptom and control post in a given corpus

  Returns:
  Matrix of topic distributions for each post

  """
  # Infers the topic distribution for each post in the corpus based on the
  #   word-topic probabilities the model has learned.
  #   Minimum_probability=0.0 to make sure each post has 200 topic items even if the
  #   probability of that topic is 0. This is to ensure no missing datapoints when making matrix
  topic_distributions = [lda_model.get_document_topics(post, minimum_probability=0.0) for post in corpus]

  # Convert topic_distributions to a matrix
  topic_distributions = np.array([[topic_prob for _, topic_prob in doc] for doc in topic_distributions])
  return topic_distributions


In [111]:
def prepare_lda_data_for_symptom(symptom_posts, control_posts, lda_model, dictionary):
  """
  Input:
  Raw symptom and control posts, trained LDA model on full dataset, dictionary of full dataset

  Function:
  Gets the topic distributions for each post in single symptom vs control data

  Returns:
  X_lda as the distribution of topics in each post and labels as whether the post was from
  the depression symptom posts (1) or the control posts (0)
  """

  # Tokenize the symptom and control datasets
  symptom_tokens, control_tokens = tokenize(symptom_posts, control_posts)

  # Remove stopwords from both datasets
  symptom_tokens_filtered, control_tokens_filtered = stop_words(symptom_tokens, control_tokens)

  # Combine the tokenized data (symptom + control)
  combined_tokens = symptom_tokens_filtered + control_tokens_filtered

  # Create the corpus for the LDA model
  corpus = [dictionary.doc2bow(text) for text in combined_tokens]

  # Generate label 1 for symptom, 0 for control
  labels = np.concatenate([np.ones(len(symptom_posts)), np.zeros(len(control_posts))])

  # Generate LDA topic distributions for these posts
  X_lda = get_topic_distributions_for_posts(lda_model, corpus)

  return X_lda, labels


## RoBERTa Embeddings

In [112]:
# TODO: Your RoBERTa code!
from transformers import AutoTokenizer, AutoModel
import torch

def extract_roberta_features(symptom_posts, control_posts, model_name='distilroberta-base', batch_size=16, layer=5):
  """
  Input:
  Raw (specific) symptom and control posts, model name (deafult='distilroberta-base'),
  batch_size (default=16), layer (default=5)

  Function:
  Loads DistilRoBERTa tokenizer and model, batches and tokenizes raw posts before
  feeding them to the model. Extracts and averages hidden state embeddings from layer 5
  and stacks them.

  Returns:
  features - np.array of DistilRoBERTa embeddings per post,
  labels - np.array where 1=symptom and 0=control
  """


  # Load tokenizer and model
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

  # set model to eval mode
  model.eval()

  # put model on cuda
  model.to("cuda")

  # Combine raw symptom and control posts
  combined_text = symptom_posts.tolist() + control_posts.tolist()

  features = []
  # loop through combined text 16 posts at a time
  for i in range(0, len(combined_text), batch_size):
      # create batched slice from combined text
      batch_texts = combined_text[i:i+batch_size]

      # pass batch of posts to DistilRoBERTa tokenizer. Include padding and trunctation
      # Padding is used to extend shorter batches with a pad token, so every batch is the same length.
      #   This prevents errors from passing differnt shaped batches to the model
      # Truncation is used because it ensures that the inputted tokens don't exceed the 512 limit of
      #   RoBERTa. If the batch does exceed, then it will be clipped to the first 512 tokens
      tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")

      # Set tokens to "cuda" to allow model to process them using a GPU
      tokens = {key: val.to("cuda") for key, val in tokens.items()}

      # Set model to no_grad to prevent backpropgation
      with torch.no_grad():
          outputs = model(**tokens)

      # Extract embeddings from the specified layer
      hidden_states = outputs.hidden_states[layer]

      # averages hidden state token embeddings from layer 5 to give shape(embedding,) for each post
      batch_features = hidden_states.mean(dim=1).cpu().numpy()
      features.append(batch_features)

  # Combine all batches
  features = np.vstack(features)

  # Generate label 1 for symptom, 0 for control
  labels = np.concatenate([np.ones(len(symptom_posts)), np.zeros(len(control_posts))])

  return features, labels

## Cross Validation

In [113]:
def cross_validation(X, y):
  """
  Input:
  X - features, y - labels

  Function:
  Perform 5-fold cross validation with random forest to evaluate LDA topic distributions or
  RoBERTa embedding performance on predicting symptom vs control.

  Returns:
  cross-validation results across 5 folds
  """

  rf_classifier = RandomForestClassifier()
  cv = KFold(n_splits=5, shuffle=True)
  results = cross_validate(rf_classifier, X=X, y=y, cv=cv, scoring='roc_auc', return_train_score=True)
  return results

## Main

In [114]:
# load dataset
data = load("reddit_depression_data")

In [115]:
def main(data):
  """
  Input:
  data - loaded from reddit depression dataset pickle

  Function:
  Computes AUC scores of each symptom for LDA probability distributions and DistilRoBERTa
  embeddings using a random forest classier and 5-fold cross-validation.

  Returns:
  Table of summary AUC results

  """

  # Add category and label features
  data = dataset_generation(data, subreddit_category_mapping)

  # Preloaded lda model and dictionary from this call:
  #   lda_model, dictionary, corpus = train_lda_on_full_data(data[data['Label'] == 1]['text'], data[data["Label"] == 0]['text'])
  lda_model_preloaded = pd.read_pickle(f'{FILEPATH}/lda_model.pkl')
  lda_dictionary_preloaded = pd.read_pickle(f'{FILEPATH}/lda_dictionary.pkl')

  # Initialize dictionaries to hold AUC info
  LDA_AUC = {}
  ROBERTA_AUC = {}

  # Loop through each symptom
  for symptom in depression_symptoms:
    # Filter the dataset for the current symptom (and control)
    print(f"{symptom} vs control")
    symptom_posts = data[data['Category'] == symptom]['text']
    control_posts = data[data['Category'] == "Control"]['text']

    print("Running LDA...")
    # Prepare the LDA data (topic distributions and labels)
    X_lda, y = prepare_lda_data_for_symptom(symptom_posts, control_posts, lda_model_preloaded, lda_dictionary_preloaded)

    # run cv on LDA topic distributions
    lda_auc = cross_validation(X_lda, y)
    LDA_AUC[symptom] = lda_auc

    print("Running DistilRoBERTa")
    # Prepare the LDA data (topic distributions and labels)
    X_embeddings, y = extract_roberta_features(symptom_posts, control_posts)

    # run cv on LDA topic distributions
    roberta_auc = cross_validation(X_embeddings, y)
    ROBERTA_AUC[symptom] = roberta_auc

  print("Finished!")
  print("Displaying results")
  # Display LDA and RoBERTa AUCs per symptom
  LDA_scores = []
  ROBERTA_scores = []

  # Create dataframe to visualize AUC for LDA and DistilRoBERTa per symptom
  for symptom in LDA_AUC:
    LDA_scores.append(np.mean(LDA_AUC[symptom]['test_score']))
    ROBERTA_scores.append(np.mean(ROBERTA_AUC[symptom]['test_score']))

  scores = {"Symptoms": depression_symptoms,
            "LDA": LDA_scores,
            "DistilRoBERTa": ROBERTA_scores}

  scores_df = pd.DataFrame(scores)
  print(scores_df)
  scores_df.to_csv(f"{FILEPATH}/LDA_vs_DistilRoBERTa_scores.csv")

In [116]:
main(data)

Anger vs control
Running LDA...
Running DistilRoBERTa


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Anhedonia vs control
Running LDA...
Running DistilRoBERTa
Anxiety vs control
Running LDA...
Running DistilRoBERTa
Disordered eating vs control
Running LDA...
Running DistilRoBERTa
Loneliness vs control
Running LDA...
Running DistilRoBERTa
Sad mood vs control
Running LDA...
Running DistilRoBERTa
Self-loathing vs control
Running LDA...
Running DistilRoBERTa
Sleep problem vs control
Running LDA...
Running DistilRoBERTa
Somatic complaint vs control
Running LDA...
Running DistilRoBERTa
Worthlessness vs control
Running LDA...
Running DistilRoBERTa
Finished!
Displaying results
            Symptoms       LDA  DistilRoBERTa
0              Anger  0.930740       0.946597
1          Anhedonia  0.958075       0.958202
2            Anxiety  0.943000       0.956876
3  Disordered eating  0.952881       0.954538
4         Loneliness  0.870590       0.917033
5           Sad mood  0.842127       0.935559
6      Self-loathing  0.919082       0.942580
7      Sleep problem  0.974569       0.959686
8  Somati