<a href="https://colab.research.google.com/github/dantheman6383/reddit-depression-detection/blob/main/Reddit_Depression_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
import string
!pip install happiestfuntokenizing
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer

from google.colab import drive
drive.mount('/content/drive')

FILEPATH = '/content/drive/MyDrive/student.pkl'

Collecting happiestfuntokenizing
  Downloading happiestfuntokenizing-0.0.7.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: happiestfuntokenizing
  Building wheel for happiestfuntokenizing (setup.py) ... [?25l[?25hdone
  Created wheel for happiestfuntokenizing: filename=happiestfuntokenizing-0.0.7-py3-none-any.whl size=6711 sha256=7423955a6308d2a08c0fa5ce1feb13bdb3b46f6a49f367a27a2bd3a15f8b00e3
  Stored in directory: /root/.cache/pip/wheels/bf/c9/4d/310f0c60855eb7b428558f29d93cf464dbb64c1b8628753395
Successfully built happiestfuntokenizing
Installing collected packages: happiestfuntokenizing
Successfully installed happiestfuntokenizing-0.0.7
Mounted at /content/drive


## Preprocessing

In [None]:
def load(path=None):
  """Load the pickled posts object.

  Args:
    path: Location of the pickle file. When omitted, the notebook-level
      `FILEPATH` constant is used.

  Returns:
    The object `pd.read_pickle` deserializes from the file.
  """
  # NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source
  if path is None:
    path = FILEPATH
  with open(path, 'rb') as f:
    return pd.read_pickle(f)

data = load()

In [None]:
# Subreddits treated as depression-related, one group per symptom.
# "selfhelp" and "whatsbotheringyou" appear in two groups, so they occur twice in the flat list.
_SUBREDDIT_GROUPS = (
    ("Anger",),
    ("anhedonia", "DeadBedrooms"),
    ("Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack"),
    ("DecisionMaking", "shouldi"),
    ("bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous"),
    ("chronicfatigue", "Fatigue"),
    ("ForeverAlone", "lonely"),
    ("cry", "grief", "sad", "Sadness"),
    ("AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou"),
    ("insomnia", "sleep"),
    ("cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus"),
    ("AdultSelfHarm", "selfharm", "SuicideWatch"),
    ("Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"),
)

# flatten the groups, keeping order and duplicates
depression_subreddits = [name for group in _SUBREDDIT_GROUPS for name in group]

In [None]:
def dataset_generation(posts=None, depression_subs=None):
  """Build control, depression and per-symptom datasets.

  Args:
    posts: DataFrame of posts; its 'subreddit', 'author' and 'created_utc'
      columns are read. Defaults to the notebook-level `data` frame.
    depression_subs: Subreddit names whose posts may never be used as control
      posts. Defaults to the notebook-level `depression_subreddits` list.

  Returns:
    A `(datasets, symptom_to_sub)` tuple. `datasets[0]` is the control frame,
    `datasets[1]` the depression frame, and `datasets[2:]` hold one frame per
    key of `symptom_to_sub`, in the same order. `symptom_to_sub` maps each
    symptom name to the subreddits that comprise it.
  """
  if posts is None:
    posts = data
  if depression_subs is None:
    depression_subs = depression_subreddits

  # 180 days expressed in seconds (created_utc is compared against this gap)
  seconds_in_180_days = 180 * 24 * 60 * 60

  # symptoms that get their own dataset, in the order their frames are returned
  symptom_to_sub = {
    "Anger": ["Anger"],
    "Anhedonia": ["anhedonia", "DeadBedrooms"],
    "Anxiety": ["Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack"],
    "Disordered eating": ["bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous"],
    "Loneliness": ["ForeverAlone", "lonely"],
    "Sad mood": ["cry", "grief", "sad", "Sadness"],
    "Self-loathing": ["AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou"],
    "Sleep problem": ["insomnia", "sleep"],
    "Somatic complaint": ["cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus"],
    "Worthlessness": ["Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"],
  }

  # excluded symptoms should still be in depression dataset, but should not get their own symptom dataset
  excluded_symptom_subs = [
    "DecisionMaking", "shouldi",                    # concentration deficit
    "chronicfatigue", "Fatigue",                    # fatigue
    "AdultSelfHarm", "selfharm", "SuicideWatch",    # suicide
  ]

  subreddit = posts['subreddit']

  # one frame per symptom, each selected by that symptom's own subreddits
  symptom_frames = [posts.loc[subreddit.isin(subs)] for subs in symptom_to_sub.values()]

  # depression dataset: a single mask over every symptom subreddit, so posts from subreddits
  # shared by two symptoms (selfhelp, whatsbotheringyou) are included once rather than twice
  all_symptom_subs = [sub for subs in symptom_to_sub.values() for sub in subs] + excluded_symptom_subs
  depression = posts.loc[subreddit.isin(all_symptom_subs)]

  # earliest depression posting time per author (their index post)
  first_depression_post = depression.groupby('author')['created_utc'].min()

  # control: non-depression posts made at least 180 days before the author's earliest depression
  # posting; authors without a depression post map to NaN and therefore never pass the comparison
  candidates = posts.loc[~subreddit.isin(depression_subs)]
  gap = candidates['author'].map(first_depression_post) - candidates['created_utc']
  control = candidates.loc[gap >= seconds_in_180_days]

  datasets = [control, depression] + symptom_frames
  return datasets, symptom_to_sub

# datasets[0] is the control frame and datasets[1] the pooled depression frame; later entries are per-symptom frames
datasets, symptom_to_sub = dataset_generation()

315         BuddermanTheAmazing
651         WildernessExploring
730        NeighborhoodPizzaGuy
1354                    xDEDANx
1598                baby_kicked
                   ...         
1968023          QueenGeraldina
1969250              NewMe43893
1969342               briarjohn
1969582       sweetmotherofodin
1969647             Mustang2006
Name: author, Length: 4369, dtype: object


In [None]:
def tokenize():
  """Tokenize the 'text' column of every frame in `datasets`.

  Returns:
    One list per dataset, in the same order as `datasets`; each inner list
    holds the token sequence produced for one post.
  """
  tokenizer = Tokenizer()
  # translation table that deletes every character in string.punctuation (ASCII punctuation only)
  strip_punctuation = str.maketrans('', '', string.punctuation)

  def to_tokens(text):
    # strip punctuation first, then tokenize (the original note says the tokenizer also lowercases)
    return tokenizer.tokenize(text.translate(strip_punctuation))

  return [list(dataset['text'].apply(to_tokens)) for dataset in datasets]

dataset_tokens = tokenize()

In [None]:
from collections import Counter
def stop_words(token_rows=None, top_n=100):
  """Return the most frequent tokens, for use as stop words.

  Args:
    token_rows: Iterable of token lists (one per post). Defaults to the
      control dataset's tokens, `dataset_tokens[0]`.
    top_n: How many of the most frequent tokens to return.

  Returns:
    A list of at most `top_n` token strings, most frequent first. Bare tokens
    are returned (not `(token, count)` pairs) so that `token in result`
    membership tests actually match.
  """
  if token_rows is None:
    token_rows = dataset_tokens[0]

  # use Counter to get token frequencies
  counts = Counter(token for row in token_rows for token in row)

  # most_common yields (token, count) pairs; keep only the token
  return [token for token, _ in counts.most_common(top_n)]

stop_words = stop_words()
# drop stop words from the control (index 0) and depression (index 1) token lists
# NOTE(review): the membership test only matches when stop_words holds bare token strings
for dataset_idx in (0, 1):
  dataset_tokens[dataset_idx] = [
    [token for token in row if token not in stop_words]
    for row in dataset_tokens[dataset_idx]
  ]

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does); use some other LDA implementation.

In [None]:
from gensim.models import LdaMulticore
from gensim import corpora

# tokenized control + depression posts
corpus = dataset_tokens[0] + dataset_tokens[1]

# corpora.Dictionary assigns an integer id to every distinct token in the corpus
dictionary = corpora.Dictionary(corpus)

# vocabulary and id -> word lookup, both derived from the same dictionary so they
# agree with the ids used in the bag-of-words corpus below
vocab = set(dictionary.token2id)
idx2word = {token_id: token for token, token_id in dictionary.token2id.items()}

# turn each post into a bag-of-words (token id, count) list for LdaMulticore
corpus = [dictionary.doc2bow(post) for post in corpus]

# train LdaMulticore on control + depression posts; id2word must be the mapping that produced
# the bag-of-words ids, otherwise the words reported for each topic are wrong
lda = LdaMulticore(corpus, num_topics=200, id2word=dictionary, minimum_probability=0.05)

In [None]:
# topic distribution of every post in the control + depression corpus, in corpus order
topics = list(map(lda.get_document_topics, corpus))
# display the 10 highest-weighted words of topic 0 as an example
lda.show_topic(0, topn=10)

[('suggested', 0.2816144),
 ('recovery', 0.24950284),
 ('aside', 0.058846075),
 ('scapegoat', 0.04413658),
 ('the', 0.03955097),
 ('things', 0.026652614),
 ('’', 0.024239523),
 ('nature', 0.022204313),
 ('out', 0.013725635),
 ('her', 0.013718098)]

In [None]:
# feature matrix: one row per post, one column per LDA topic (200 topics);
# topics absent from a post's distribution keep the initial 0
X = np.zeros((len(corpus), 200))

for i, doc_topics in enumerate(topics):
  # doc_topics is a sequence of (topic id, probability) pairs for post i
  for topic, prob in doc_topics:
    X[i, topic] = prob

In [None]:
# build Y (labels matrix)
def build_labels(posts=None, symptom_subs=None):
  """Build one binary label vector per symptom.

  Args:
    posts: DataFrame whose 'subreddit' column is labelled, one row per post.
      Defaults to the control frame followed by the depression frame
      (`datasets[0]` then `datasets[1]`).
    symptom_subs: Mapping of symptom name to the list of subreddits that
      comprise it. Defaults to the notebook-level `symptom_to_sub`.

  Returns:
    A list of 1-d float arrays, one per symptom in the mapping's key order.
    Entry i of an array is 1 when post i's subreddit belongs to that symptom
    and 0 otherwise.
  """
  if posts is None:
    posts = pd.concat([datasets[0], datasets[1]])
  if symptom_subs is None:
    symptom_subs = symptom_to_sub

  subreddit = posts['subreddit']
  # vectorised membership test replaces a per-row Python loop repeated for every symptom
  return [subreddit.isin(subs).to_numpy(dtype=float) for subs in symptom_subs.values()]
Y = build_labels()

## RoBERTa Embeddings

In [None]:
from transformers import RobertaModel, RobertaTokenizer
import torch

# use the GPU when one is available, otherwise fall back to CPU so the cell still runs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
model = RobertaModel.from_pretrained('distilroberta-base')

model.to(device)
model.eval()

# get the text from each post in control + depression dataset for tokenizing
d = pd.concat([datasets[0], datasets[1]])
text = d['text'].tolist()

embeddings = []
for post in text:
  # tokenize post text (one post per forward pass, truncated to 512 tokens)
  inputs = tokenizer(post, return_tensors="pt", truncation=True, padding=True, max_length=512)
  inputs = inputs.to(device)

  with torch.no_grad():
    # use ** to automatically assign input_ids and attention_mask from BatchEncoding from tokenizer; also get hidden states for next line
    outputs = model(**inputs, output_hidden_states=True)

  # get hidden representation from 5th layer according to handout
  hidden_states = outputs.hidden_states[5]

  # drop the batch dimension, then average over the token dimension to get one vector per post
  embedding = hidden_states.squeeze(0)
  avg_embedding = embedding.mean(dim=0)

  # store embedding on cpu and convert to numpy array
  embeddings.append(avg_embedding.cpu().numpy())

# stack the per-post vectors into a single 2-d array (posts x hidden size)
embeddings = np.array(embeddings)

## Main

In [None]:
def main(X, y):
  """
  Evaluate the LDA features and the RoBERTa embeddings with random forests.

  For every label vector in `y` (one per symptom), runs 5-fold cross validation
  with ROC AUC scoring, first on `X` (LDA) and then on the notebook-level
  `embeddings` (RoBERTa), and prints the per-fold train and test scores.
  """
  # evaluate each symptom vs control
  for symptom in y:
    # (printed name, feature matrix, number of trees), evaluated in this order
    feature_sets = (
      ("LDA", X, 85),
      ("Roberta", embeddings, 50),
    )
    for name, features, n_trees in feature_sets:
      # a fresh classifier and shuffled 5-fold split for every evaluation
      rf_classifier = RandomForestClassifier(n_estimators=n_trees, max_depth=15, n_jobs=-1)
      cv = KFold(n_splits=5, shuffle=True)
      results = cross_validate(
        rf_classifier,
        X=features,
        y=symptom,
        cv=cv,
        scoring='roc_auc',
        return_train_score=True,
      )

      print(name)
      print("Train", results['train_score'])
      print("Test", results['test_score'])

main(X, Y)

LDA
Train [0.99148455 0.98859688 0.99160357 0.99546661 0.99341526]
Test [0.82961361 0.83996481 0.83610435 0.81059423 0.81556339]
Roberta
Train [0.99999346 0.99999916 0.99999978 0.99999894 0.99999952]
Test [0.84275583 0.82967878 0.83604131 0.82275119 0.86708919]
LDA
Train [0.96978483 0.96948293 0.96851002 0.96977235 0.97041406]
Test [0.93248721 0.93293243 0.92638122 0.93221944 0.92695327]
Roberta
Train [0.99996361 0.99996705 0.99997112 0.99996414 0.99997894]
Test [0.9565865  0.95489873 0.95402909 0.95589525 0.953619  ]
LDA
Train [0.85097463 0.84930155 0.84816622 0.84965519 0.84956431]
Test [0.80946351 0.80546596 0.81206463 0.80964726 0.80051919]
Roberta
Train [0.998358   0.99822777 0.99831063 0.99833068 0.99831906]
Test [0.85812061 0.86245369 0.8597909  0.85712015 0.86529438]
LDA
Train [0.94397735 0.94457728 0.94632828 0.94780974 0.95186543]
Test [0.81522398 0.83316798 0.80939285 0.81502377 0.80447113]
Roberta
Train [0.99990092 0.9999263  0.9998928  0.9999668  0.99981033]
Test [0.904625