<a href="https://colab.research.google.com/github/connorkrchen/cchen127-mstu-mock-deployment/blob/main/Copy_of_F24_Project_1_Reddit_Depression_Stencil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier

!pip install happiestfuntokenizing
!pip install transformers torch

from google.colab import drive
drive.mount('/content/drive')

# Put filepath here (will store more files here to avoid having to rerun long executions)
FILEPATH = 'drive/MyDrive/CSCI 1460/final project/data/'

Collecting happiestfuntokenizing
  Downloading happiestfuntokenizing-0.0.7.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: happiestfuntokenizing
  Building wheel for happiestfuntokenizing (setup.py) ... [?25l[?25hdone
  Created wheel for happiestfuntokenizing: filename=happiestfuntokenizing-0.0.7-py3-none-any.whl size=6711 sha256=448582c2e8fe8ffe28f395c097c6d43f63e37fe2eab8056521810835f28899f7
  Stored in directory: /root/.cache/pip/wheels/bf/c9/4d/310f0c60855eb7b428558f29d93cf464dbb64c1b8628753395
Successfully built happiestfuntokenizing
Installing collected packages: happiestfuntokenizing
Successfully installed happiestfuntokenizing-0.0.7
Mounted at /content/drive


## Preprocessing

In [2]:
def load(filepath: str):
  """
  Load pickles into a pandas DataFrame

  Parameters
  ----------
  filepath : str
      Location of the pickle file

  Returns
  -------
  df : pd.DataFrame
      Pandas DataFrame containing the data
  """
  df = pd.read_pickle(filepath)
  return df
  pass

In [3]:
# Load the dataset
WHOLE_DATASET = load(FILEPATH + 'student.pkl')

In [4]:
# List of depression subreddits in the paper
depression_subreddits = ["Anger",
    "anhedonia", "DeadBedrooms",
    "Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack",
    "DecisionMaking", "shouldi",
    "bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous",
    "chronicfatigue", "Fatigue",
    "ForeverAlone", "lonely",
    "cry", "grief", "sad", "Sadness",
    "AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou",
    "insomnia", "sleep",
    "cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus",
    "AdultSelfHarm", "selfharm", "SuicideWatch",
    "Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"
]

In [5]:
from collections import Counter
def stop_words(control_data: pd.DataFrame) -> list[str]:
  """
  Find top 100 words from Reddit dataset to use as stop words

  Parameters
  ----------
  control_data : pd.DataFrame
      Control_data used to find stop words

  Returns
  -------
  list[str]
      List of top 100 frequent words in control_data to be used as stop words
  """

  word_counts = Counter()
  concatenated_text = ' '.join(control_data['text'])
  word_counts.update(word.lower() for word in concatenated_text.split())

  top_100_words = dict(word_counts.most_common(100))

  return list(top_100_words.keys())

  pass


In [6]:
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer
tokenizer = Tokenizer()

def tokenize(symptom_data: pd.DataFrame, ind_data: list[pd.DataFrame], control_data: pd.DataFrame, stopwords: list[str]):
  """
  Tokenize the datasets

  Parameters
  ----------
  symptom_data : pd.DataFrame
      Data of all depression symptom posts

  ind_data : list[pd.DataFrame]
      List of individual depression symptom DataFrames

  control_data : pd.DataFrame
      Data of all control posts

  stopwords : list[str]
      List of stop words to remove from the text

  Returns
  -------
  symptom_data : pd.DataFrame
      Tokenized symptom_data

  ind_data : list[pd.DataFrame]
      List of tokenized individual depression symptom DataFrames

  control_data : pd.DataFrame
      Tokenized control_data
  """

  processed_symptom_chunks = []
  chunk_size = 5000 # I ran out of RAM when I tried to do the entirety of symptom_data at once so we do it in chunks here
  for i in range(0, len(symptom_data), chunk_size):
    chunk = symptom_data.iloc[i:i + chunk_size].copy()
    chunk['tokens'] = chunk['text'].apply(lambda x: [word for word in tokenizer.tokenize(x) if word.lower() not in stopwords and word.isalnum()])
    processed_symptom_chunks.append(chunk)

  symptom_data = pd.concat(processed_symptom_chunks)

  # ind_data and control_data are small enough where we can just do it all at once
  for df in ind_data:
    df['tokens'] = df['text'].apply(lambda x: [word for word in tokenizer.tokenize(x) if word.lower() not in stopwords and word.isalnum()])

  control_data['tokens'] = control_data['text'].apply(lambda x: [word for word in tokenizer.tokenize(x) if word.lower() not in stopwords and word.isalnum()])

  return symptom_data, ind_data, control_data

  pass

In [7]:
from google.colab import data_table
data_table.enable_dataframe_formatter()

# Dictionary that maps each subreddit to a symptom
subreddit_dict = {
    'Anger': 'ANGER',
    'anhedonia': 'ANHEDONIA',
    'DeadBedrooms': 'ANHEDONIA',
    'Anxiety': 'ANXIETY',
    'AnxietyDepression': 'ANXIETY',
    'HealthAnxiety': 'ANXIETY',
    'PanicAttack': 'ANXIETY',
    'DecisionMaking': 'CONCENTRATION DEFICIT',
    'shouldi': 'CONCENTRATION DEFICIT',
    'bingeeating': 'DISORDERED EATING',
    'BingeEatingDisorder': 'DISORDERED EATING',
    'EatingDisorders': 'DISORDERED EATING',
    'eating_disorders': 'DISORDERED EATING',
    'EDAnonymous': 'DISORDERED EATING',
    'chronicfatigue': 'FATIGUE',
    'Fatigue': 'FATIGUE',
    'ForeverAlone': 'LONELINESS',
    'lonely': 'LONELINESS',
    'cry': 'SAD MOOD',
    'grief': 'SAD MOOD',
    'sad': 'SAD MOOD',
    'Sadness': 'SAD MOOD',
    'AvPD': 'SELF LOATHING',
    'SelfHate': 'SELF LOATHING',
    'selfhelp': 'SELF LOATHING',
    'socialanxiety': 'SELF LOATHING',
    'whatsbotheringyou': 'SELF LOATHING',
    'insomnia': 'SLEEP PROBLEM',
    'sleep': 'SLEEP PROBLEM',
    'cfs': 'SOMATIC COMPLAINT',
    'ChronicPain': 'SOMATIC COMPLAINT',
    'Constipation': 'SOMATIC COMPLAINT',
    'EssentialTremor': 'SOMATIC COMPLAINT',
    'headaches': 'SOMATIC COMPLAINT',
    'ibs': 'SOMATIC COMPLAINT',
    'tinnitus': 'SOMATIC COMPLAINT',
    'AdultSelfHarm': 'SUICIDE',
    'selfharm': 'SUICIDE',
    'SuicideWatch': 'SUICIDE',
    'Guilt': 'WORTHLESSNESS',
    'Pessimism': 'WORTHLESSNESS',
    'selfhelp': 'WORTHLESSNESS',
    'whatsbotheringyou': ['WORTHLESSNESS', 'SELF LOATHING'] # this subreddit caused a lot of issues for me unfortunately, but I decided to assign two symptoms to each instance of this subreddit then .explode the column
}

def dataset_generation(raw_data: pd.DataFrame) -> tuple[pd.DataFrame, list[pd.DataFrame], pd.DataFrame]:
  """
  Build control and symptom datasets

  Parameters
  ----------
  raw_data : pd.DataFrame
      Entire dataset loaded earlier from .load

  Returns
  -------
  symptom_data : pd.DataFrame
      Data of all depression symptom posts

  ind_data : list[pd.DataFrame]
      List of individual depression symptom DataFrames

  control_data : pd.DataFrame
      Data of all control posts
  """
  # NOTE: 1 day is 86400 seconds, so 180 days is 15552000 seconds
  ind_data = []
  symptom_data = raw_data[raw_data['subreddit'].isin(depression_subreddits)] # isolate entire symptom_data

  symptom_data['symptom'] = symptom_data['subreddit'].map(subreddit_dict).dropna()
  symptom_process = symptom_data.copy
  symptom_groups = symptom_data.explode('symptom').groupby('symptom') # split up the whatsbotheringyou cases
  ind_data = [symptom_groups.get_group(symptom) for symptom in symptom_groups.groups] # individual symptom data

  earliest_posts = symptom_data.groupby('author')['created_utc'].min() # get the earliest symptom post for each author
  authors = set(symptom_data['author'].unique()) # get the list of authors

  prelim_control_data = raw_data[~raw_data['subreddit'].isin(depression_subreddits)]
  prelim_control_data = prelim_control_data[prelim_control_data['author'].isin(authors)] # filter control_data to only include posts from authors who have symptom_data posts
  prelim_control_data['earliest_post'] = prelim_control_data['author'].map(earliest_posts)
  prelim_control_data['time_diff'] = prelim_control_data['earliest_post'] - prelim_control_data['created_utc'] # calculate the time difference between earliest symptom post and given control post
  prelim_control_data = prelim_control_data[prelim_control_data['time_diff'] > 15552000]
  control_data = prelim_control_data[~prelim_control_data['author'].isin(['AutoModerator', '[deleted]'])] # get rid of AutoModerator and [deleted] posts (originally a personal choice but realized it resulted in exact same data size as in handout)

  stopwords = stop_words(control_data) # retrieve stop_words

  symptom_data, ind_data, control_data = tokenize(symptom_data, ind_data, control_data, stopwords) # tokenize

  return symptom_data, ind_data, control_data

pass



In [9]:
# FILE GENERATION BENCHMARK

# Generate datasets
symptom_data, ind_data, control_data = dataset_generation(WHOLE_DATASET)

# Save generated datasets to filepath
with open(FILEPATH + 'symptom_data.pkl', 'wb') as symptom_file:
    pickle.dump(symptom_data, symptom_file)

for i, df in enumerate(ind_data):
    with open(FILEPATH + 'ind_data_' + str(i) + '.pkl', 'wb') as ind_file:
        pickle.dump(df, ind_file)

with open(FILEPATH + 'control_data.pkl', 'wb') as control_file:
    pickle.dump(control_data, control_file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  symptom_data['symptom'] = symptom_data['subreddit'].map(subreddit_dict).dropna()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['text'].apply(lambda x: [word for word in tokenizer.tokenize(x) if word.lower() not in stopwords and word.isalnum()])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [10]:
# FILE LOADING BENCHMARK

# Load generated datasets from filepath (note same variable names)
with open(FILEPATH + 'symptom_data.pkl', 'rb') as symptom_file:
    symptom_data = load(symptom_file)
    symptom_data = symptom_data.reset_index(drop=True)

ind_data = []
for i in range(13):
    with open(FILEPATH + 'ind_data_' + str(i) + '.pkl', 'rb') as ind_file:
        ind_data.append(load(ind_file))
        ind_data[i] = ind_data[i].reset_index(drop=True)

with open(FILEPATH + 'control_data.pkl', 'rb') as control_file:
    control_data = load(control_file)
    control_data = control_data.reset_index(drop=True)

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

In [10]:
# We highly recommend you using the LdaMulticore interface, but feel free to use any other implementations if you prefer.
from gensim.models import LdaMulticore
from gensim.matutils import Sparse2Corpus
from scipy.sparse import lil_matrix

# TODO: Your LDA code!

def binary_term_doc_matrix(combined_data) -> tuple[np.ndarray[np.float64], dict[int, str]]:
  """
  Generate binary_term_doc_matrix from combined individual data and control_data (since we are training LDA on the entire dataset)

  Parameters
  ----------
  combined_data : pd.DataFrame
      Combined ind_data and control_data

  Returns
  -------
  M : lil_matrix
      Binary term-document matrix

  idx2word : dict[int, str]
      Dictionary mapping index to word
  """
  # From my assignment 2 work
  word2idx = {}
  idx2word = {}
  vocab = Counter()

  for text in combined_data['tokens']:
    vocab.update(text)

  sorted_vocab = vocab.most_common()
  print(sorted_vocab[10:])

  for i, word_tuple in enumerate(sorted_vocab):
      word = word_tuple[0]
      word2idx[word] = i
      idx2word[i] = word

  M = lil_matrix((len(combined_data), len(vocab)))

  doc_index = 0

  for text in combined_data['tokens']:
    for word in text:
      M[doc_index, word2idx[word]] += 1
    doc_index += 1

  return M, idx2word

def extract_lda_embeddings(combined_data) -> np.ndarray[np.float64]:
  """
  Extract embeddings from combined_data using Latent Dirichlet Allocation

  Parameters
  ----------
  combined_data : pd.DataFrame
      Combined ind_data and control_data

  Returns
  -------
  lda_features : np.ndarray[np.float64]
      LDA embeddings
  """

  M, idx2word = binary_term_doc_matrix(combined_data)
  print(f"binary term-document matrix shape: {M.shape}")
  print(f"vocab size: {len(idx2word)}")

  corpus = Sparse2Corpus(M, documents_columns=False) # Convert the matrix into a corpus

  lda = LdaMulticore(corpus, num_topics=200, id2word=idx2word, workers=4, passes=2) # Perform LDA multicore on the corpus

  lda_features = np.zeros((len(corpus), 200)) # Initialize the matrix

  for i, doc in enumerate(corpus):
      doc_topics = lda.get_document_topics(doc)
      for topic, prob in doc_topics:
          lda_features[i, topic] = prob

  return lda_features


## RoBERTa Embeddings

In [14]:
# TODO: Your RoBERTa code!

from transformers import AutoTokenizer, AutoModel
import torch

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
model = AutoModel.from_pretrained('distilroberta-base', output_hidden_states=True)

def extract_roberta_embeddings(data: pd.DataFrame, control: pd.DataFrame, model, tokenizer, batch_size=8):
  """
  Extract embeddings from a given individual symptom data and control_data using RoBERTa

  Parameters
  ----------
  data : pd.DataFrame
      Individual symptom Dataframe
  control_data : pd.DataFrame
      Control_data
  model
      RoBERTa model
  tokenizer
      RoBERTa tokenizer
  batch_size : int
      Batch size for embedding extraction

  Returns
  -------
  symptom_data : pd.DataFrame
      Data of all depression symptom posts

  ind_data : list[pd.DataFrame]
      List of individual depression symptom DataFrames

  control_data : pd.DataFrame
      Data of all control posts
  """

  model.eval()
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model = model.to(device)

  embeddings_list = []

  combined_data = pd.concat([data, control]) # Concat the individual symptom data and the control_data
  combined_data.reset_index(drop=True, inplace=True)

  for i in range(0, len(combined_data), batch_size): # I also ran out of RAM for this part so I had to do batching again
    batch = combined_data.iloc[i : i + batch_size]
    inputs = tokenizer([" ".join(tokens) for tokens in batch['tokens']], return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.hidden_states # outputs.last_hidden_state
        embeddings = hidden_states[4] # retrieve 5th layer for embeddings

    averaged_embeddings = torch.mean(embeddings, dim=1)
    embeddings_list.append(averaged_embeddings.cpu().numpy())

  return np.concatenate(embeddings_list, axis=0)

## Main

In [11]:
# Prepare combined_data (used for both LDA and RoBERTa embedding extraction)
combined_data = pd.concat(ind_data + [control_data])
combined_data.reset_index(drop=True, inplace=True)

In [12]:
# Prepare loading all of the saved embeddings

# Dictionary that maps an int to a symptom
symptom_dict = {0: 'anger',
                1: 'anhedonia',
                2: 'anxiety',
                3: 'disordered_eating',
                4: 'loneliness',
                5: 'sad_mood',
                6: 'self-loathing',
                7: 'sleep_problem',
                8: 'somatic_complaint',
                9: 'worthlessness',
                10: 'CONTROL'}

# Dictionary that maps each int (mapped above) to a list of its indices in the combined_data
indices_dict = {
    0: combined_data.index[combined_data['symptom'] == "ANGER"].tolist(),
    1: combined_data.index[combined_data['symptom'] == "ANHEDONIA"].tolist(),
    2: combined_data.index[combined_data['symptom'] == "ANXIETY"].tolist(),
    3: combined_data.index[combined_data['symptom'] == "DISORDERED EATING"].tolist(),
    4: combined_data.index[combined_data['symptom'] == "LONELINESS"].tolist(),
    5: combined_data.index[combined_data['symptom'] == "SAD MOOD"].tolist(),
    6: combined_data.index[combined_data['symptom'] == "SELF LOATHING"].tolist(),
    7: combined_data.index[combined_data['symptom'] == "SLEEP PROBLEM"].tolist(),
    8: combined_data.index[combined_data['symptom'] == "SOMATIC COMPLAINT"].tolist(),
    9: combined_data.index[combined_data['symptom'] == "WORTHLESSNESS"].tolist(),
    10: combined_data.index[~combined_data['subreddit'].isin(depression_subreddits)].tolist()
}

# Define indices
anger_indices = indices_dict[0]
anhedonia_indices = indices_dict[1]
anxiety_indices = indices_dict[2]
disordered_eating_indices = indices_dict[3]
loneliness_indices = indices_dict[4]
sad_mood_indices = indices_dict[5]
self_loathing_indices = indices_dict[6]
sleep_problem_indices = indices_dict[7]
somatic_complaint_indices = indices_dict[8]
worthlessness_indices = indices_dict[9]
control_indices = indices_dict[10]

# Add to ind_indices list
ind_indices = [anger_indices, anhedonia_indices, anxiety_indices, disordered_eating_indices, loneliness_indices, sad_mood_indices, self_loathing_indices, sleep_problem_indices, somatic_complaint_indices, worthlessness_indices]


In [13]:
# FILE GENERATION BENCHMARK

# Generate LDA embeddings
lda_embeddings = extract_lda_embeddings(combined_data)
np.save(FILEPATH + 'lda_embeddings.npy', lda_embeddings)

binary term-document matrix shape: (99006, 77321)
vocab size: 77321


In [15]:
# FILE GENERATION BENCHMARK

excluded_indxs = [3, 5, 11] # Don't include these indices because not enough data; [CONCENTRATION DEFICIT, FATIGUE, SUICIDAL]

for i, symptom_df in enumerate([data for idx, data in enumerate(ind_data) if idx not in excluded_indxs]):
  roberta_embeddings = extract_roberta_embeddings(symptom_df, control_data, model, tokenizer)
  np.save(FILEPATH + 'roberta_embeddings_' + symptom_dict[i] + '.npy', roberta_embeddings)

In [14]:
# FILE LOADING BENCHMARK

# # Load LDA embeddings and prep for iteration with main function
lda_embeddings = np.load(FILEPATH + 'lda_embeddings.npy')
X_lda_embeddings = []
y_lda = []

for i in range(10):
  X_lda_embeddings.append(np.vstack((lda_embeddings[indices_dict[i]], lda_embeddings[control_indices]))) # extract desired indices of the entire lda_embeddings matrix
  y_lda.append(np.hstack((np.ones(len(indices_dict[i])), np.zeros(len(control_indices)))))

# Load RoBERTa embeddings and prep for iteration with main function
X_roberta_embeddings = []
y_roberta = []
for i in range(10):
  X_roberta_embeddings.append(np.load(FILEPATH + 'roberta_embeddings_' + symptom_dict[i] + '.npy'))
  y_roberta.append(np.hstack((np.ones(len(indices_dict[i])), np.zeros(len(control_indices)))))


In [19]:
def main(X, y):
  """
  Here's the basic structure of the main block! It should run
  5-fold cross validation with random forest to evaluate your RoBERTa and LDA
  performance.
  """
  rf_classifier = RandomForestClassifier()
  cv = KFold(n_splits=5, shuffle=True)
  results_dict = {}
  results = cross_validate(rf_classifier, X=X, y=y, cv=cv, scoring='roc_auc', return_train_score=True)

  # TODO: Print your training and testing scores!
  print("Training AUC:", results['train_score'])
  print("Testing AUC:", results['test_score'])
  print(f"Average Testing AUC: {np.mean(results['test_score'])}")
  pass

print("LDA accuracies:")
for i, lda_embedding in enumerate(X_lda_embeddings):
  print("\n")
  print(f"Symptom - {symptom_dict[i]}:")
  main(X_lda_embeddings[i], y_lda[i])

print("\n")

print("RoBERTa accuracies:")
for i, roberta_embedding in enumerate(X_roberta_embeddings):
  print("\n")
  print(f"Symptom - {symptom_dict[i]}:")
  main(X_roberta_embeddings[i], y_roberta[i])


LDA accuracies:


Symptom - anger:
Training AUC: [0.99998093 0.99998819 0.99989175 0.99999619 0.99990401]
Testing AUC: [0.83609098 0.80768304 0.80630883 0.80954582 0.77657038]
Average Testing AUC: 0.8072398137999945


Symptom - anhedonia:
Training AUC: [0.99896354 0.99893515 0.99890629 0.99893924 0.99892111]
Testing AUC: [0.93848141 0.93410769 0.94232365 0.93176425 0.93842407]
Average Testing AUC: 0.937020213563953


Symptom - anxiety:
Training AUC: [0.99955619 0.99949234 0.99953514 0.99959409 0.99951654]
Testing AUC: [0.87643744 0.87736957 0.88123034 0.88010123 0.87051183]
Average Testing AUC: 0.877130079455265


Symptom - disordered_eating:
Training AUC: [0.99829876 0.99857178 0.99846814 0.99859836 0.99852256]
Testing AUC: [0.94192776 0.92771787 0.93534824 0.93598072 0.92996673]
Average Testing AUC: 0.9341882623330825


Symptom - loneliness:
Training AUC: [0.99907016 0.99914043 0.99918004 0.99909527 0.99913545]
Testing AUC: [0.83208367 0.82915461 0.85362572 0.84275566 0.83817563]
Ave