<a href="https://colab.research.google.com/github/gmiserani/mentalhealth/blob/main/mentalhealth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Hardware selection flags (1 = enabled, 0 = disabled).
USE_TPU, USE_GPU = 0, 1

# The two accelerators are mutually exclusive: at most one flag may be non-zero.
assert not (USE_TPU != 0 and USE_GPU != 0)

In [None]:
# Print the current working directory (sanity check before changing into the project folder).
!pwd

/content/drive/My Drive/MentalHealthShared


In [None]:
# Switch the kernel's working directory to the shared project folder on the mounted Drive.
# Relative paths used later (requirements.txt, the src package, os.getcwd()) resolve against it.
%cd /content/drive/MyDrive/MentalHealthShared

/content/drive/MyDrive/MentalHealthShared


In [None]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Path configuration.
# ON_COLAB=True  -> mount Google Drive and use the shared project folder.
# ON_COLAB=False -> use the current working directory as the project root.
# All *DIR constants end with a slash because later code builds file paths by
# plain string concatenation (e.g. f'{DATADIR}{reddit}_post2data...').
ON_COLAB=False
if ON_COLAB:
    import sys
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    # The trailing slash matters: without it the sub-directories below would be
    # glued onto the folder name ('...MentalHealthShareddata/').
    BASEDIR='/content/drive/MyDrive/MentalHealthShared/'
    PYTHONDIR=BASEDIR+'src/'
    RESULTSDIR=BASEDIR+'results/'
    MODELSDIR=BASEDIR+'model/'
    DATADIR=BASEDIR+'data/'
    # The helper modules are imported by bare name in this branch, so their
    # folder has to be importable before the import statements run.
    if PYTHONDIR not in sys.path:
        sys.path.append(PYTHONDIR)
    import utils 
    from pre_processing import findLast, convert_utc, check_missing_threads
else:
    import os
    BASEDIR = os.getcwd() + "/"
    dirs = ["results","model","data"]
    for dirc in dirs:
        # exist_ok avoids the separate "does it exist" lookup and is safe on re-runs.
        os.makedirs(os.path.join(BASEDIR,dirc), exist_ok=True)
    PYTHONDIR=BASEDIR+'src/'
    RESULTSDIR=BASEDIR+'results/'
    MODELSDIR=BASEDIR+'model/'
    DATADIR=BASEDIR+'data/'
    from src import utils
    from src.pre_processing import findLast, convert_utc, check_missing_threads

In [None]:
# Select the compute device used by the rest of the notebook and bind it to `device`.
# TPU path: download and run the PyTorch/XLA setup script, then use the XLA device.
# Otherwise: use CUDA when torch reports it as available, else fall back to the CPU.
if USE_TPU:
      # Fetch and execute the torch_xla environment setup script (requires network access).
      !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
      !python pytorch-xla-env-setup.py #--version $VERSION

      # imports pytorch
      import torch
      # imports the torch_xla package
      import torch_xla
      import torch_xla.core.xla_model as xm

      device = xm.xla_device()
      # NOTE(review): _XLAC._xla_set_default_device is a private torch_xla entry point
      # (leading underscores) and may not exist in every torch_xla release - confirm.
      torch_xla._XLAC._xla_set_default_device(str(device))

else:
      import torch
      device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
      # Show which device was picked so the run log records it.
      print(device)

cuda


In [None]:
import sys
sys.path.append(PYTHONDIR)
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle
import time
import pdb
import importlib
import pprint
import re

import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

# Fixed seed so that repeated runs produce the same random numbers.
SEED = 1234

# Seed every random number generator the notebook uses.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seeds all visible CUDA devices (silently ignored when CUDA is unavailable).
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# loading DistilBERT to obtain embeddings
!pip install -q transformers
# Use DistilBERT to represent each post and comment
from transformers import DistilBertModel, DistilBertTokenizer

# Pretrained 'distilbert-base-uncased' tokenizer and encoder (downloaded on first use).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
# Move the encoder to the device selected in the device-selection cell.
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# VADER sentiment analysis setup
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the NLTK resources this notebook relies on (no-op when already up to date).
for _nltk_resource in ('vader_lexicon', 'punkt'):
    nltk.download(_nltk_resource)

# Shared VADER analyzer instance used for scoring text.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from datetime import timedelta

def areIntervalsShort(post, seqlen_series):
    """Tell whether every gap between consecutive thread timestamps is under one day.

    Parameters
    ----------
    post : pandas.Series
        One row of the post dataframe. ``post.name`` is the post id and
        ``post.created_utc`` is the sequence of timestamps of the thread
        (the post first, followed by its comments).
    seqlen_series : pandas.Series
        Maps post id -> number of leading timestamps to take into account.

    Returns
    -------
    numpy.bool_ or None
        ``True`` when all consecutive gaps among the first ``seqlen`` timestamps
        are shorter than 24 hours, ``False`` otherwise. ``None`` when the post id
        is not present in ``seqlen_series`` (callers can ``dropna()`` the result).

    Notes
    -----
    Timestamps may be datetime-like objects or plain numbers. Numeric timestamps
    are interpreted as seconds (epoch seconds), so the one-day limit is converted
    to seconds before comparing; comparing numbers against a ``timedelta``
    directly would raise ``TypeError``.
    """
    postid = post.name
    if postid not in seqlen_series.index:
        return None

    seqlen = seqlen_series.loc[postid]
    # from post (created_utc[0]) up to the seqlen-th entry of the thread
    timestamps = np.array(post.created_utc[:seqlen])
    timedeltas = timestamps[1:] - timestamps[:-1]

    max_gap = timedelta(days=1)
    # dtype.kind is 'i'/'u'/'f' for plain numbers; timedelta64 ('m') and object
    # arrays of datetime.timedelta keep the timedelta threshold.
    if timedeltas.dtype.kind in 'iuf':
        max_gap = max_gap.total_seconds()
    return np.all(timedeltas < max_gap)


In [None]:
def create_subreddit_df(reddit, infile_extension, suffix=''):
    """Build, filter and save the per-thread dataframe of one subreddit.

    The function loads three pickled dictionaries from ``DATADIR``
    (``{reddit}_post2data``, ``{reddit}_post2comments`` and
    ``{reddit}_comment2data``), turns them into a post dataframe and a comment
    dataframe, filters out unusable posts/comments, computes sentiment scores
    and DistilBERT features, concatenates the per-comment values into list
    columns on the post dataframe and writes the result to
    ``{DATADIR}{reddit}{suffix}_distilbert_filtered_posts.parquet``.

    Parameters
    ----------
    reddit : str
        Subreddit name; used as the prefix of the input and output file names.
    infile_extension : str
        Extension of the pickle files. A leading dot is optional.
    suffix : str, optional
        Extra text inserted into the input and output file names.

    Returns
    -------
    pandas.DataFrame
        The final post dataframe (the same frame that is written to parquet).

    Notes
    -----
    Relies on names defined elsewhere in the notebook: ``DATADIR``, ``utils``,
    ``check_missing_threads``, ``findLast``, ``sid``, ``tokenizer``, ``model``
    and ``device``.
    """
    infile_extension = infile_extension.lstrip('.')

    print('Loading dicts...')
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(f'{DATADIR}{reddit}_post2data{suffix}.{infile_extension}','rb') as infile:
        post2data = pickle.load(infile)
    with open(f'{DATADIR}{reddit}_post2comments{suffix}.{infile_extension}','rb') as infile:
        post2comments = pickle.load(infile)
    with open(f'{DATADIR}{reddit}_comment2data{suffix}.{infile_extension}','rb') as infile:
        comment2data = pickle.load(infile)

    outpath=f'{DATADIR}{reddit}{suffix}_distilbert_filtered_posts.parquet'
    print(f'Dataframe will be saved in {outpath}')

    print('creating df from post2data...')
    post_df = pd.DataFrame(list(post2data.values()), index=post2data.keys())
    post_df = post_df.rename(columns={'selftext':'text'})
    print(1, len(post_df))

    # join with list of comments
    print('joining with post2comments...')
    post_df = post_df.join(pd.Series(post2comments,name='comments'), how='inner')
    print(2, len(post_df))

    print('creating df from comment2data...')
    comment_df = pd.DataFrame(list(comment2data.values()), index=comment2data.keys())
    comment_df = comment_df.rename(columns={'body':'text'})
    # If link_id / parent_id carry a 3-character type prefix in the input data,
    # strip it here, e.g. comment_df["link_id"].apply(lambda x: x[3:]).
    print(3, len(comment_df))

    print('remove comments that correspond to inexistent threads...')
    # Keep only the comments that are listed in some post's comment list.
    listed_comment_ids = {cid for comments in post_df.comments for cid in comments}
    comment_df = comment_df[comment_df.index.isin(listed_comment_ids)]
    print(4, len(comment_df))

    # Link each comment to its post through the post's comment list, so that we
    # can tell whether the thread author wrote the comment.
    print('compute is_post_author...')
    exploded_posts = post_df.explode('comments')
    # The exploded post frame is the LEFT side of the join, so the '_post' tag
    # must be the left suffix. (With rsuffix the comment columns were the ones
    # renamed, leaving the post's text/created_utc in the unsuffixed columns.)
    # The result is indexed by post id and keeps the comment id in 'comments'.
    comment_df = exploded_posts.join(comment_df, on='comments', how='right', lsuffix='_post')
    comment_df['is_post_author'] = (comment_df.author == comment_df.author_post)
    comment_df = comment_df.drop('author_post', axis=1)
    # Keep posts that own at least one comment. A boolean mask keeps one row
    # per post; selecting by the (repeated) comment_df index would duplicate
    # every post once per comment.
    post_df = post_df[post_df.index.isin(comment_df.index)]

    print('remove  posts without text...')
    post_df = post_df.dropna().copy()
    print(5, len(post_df))

    print('remove posts with less than 2 comments...')
    # remove posts with less than 2 comments
    post_df['num_comments'] = post_df['comments'].apply(len) # use this as num_comments
    post_df = post_df[post_df['num_comments'] >= 2]
    print(6, len(post_df))

    print('remove posts with small text...')
    post_df = post_df[post_df['text'].apply(len) >= 2]
    print(7, len(post_df))

    print('remove removed or deleted posts...')
    post_df = post_df[(post_df['text'] != '[removed]') & (post_df['text'] != '[deleted]')]
    print(8, len(post_df))

    print('remove deleted authors...')
    post_df = post_df[(post_df['author'] != '[deleted]')].copy()
    print(9, len(post_df))

    print('calculate sequence length when there are other commenters followed by post author...')
    # Re-index the comments by comment id and keep the owning post id as a column.
    comment_df['post_id'] = comment_df.index
    comment_df = comment_df.set_index('comments')
    check_missing_threads(post_df,comment_df)
    # NOTE(review): the result is not used below; the call is kept in case
    # utils.getThreadLen has side effects - confirm and remove if it has none.
    temp = post_df.apply(lambda x: utils.getThreadLen(x, comment_df),axis=1)

    print('remove invalid threads...')
    post_df = post_df.dropna().copy()
    print(10, len(post_df))

    print('remove comments without body...')
    # NOTE: no comment is actually dropped in this step (the dropna() on
    # comment_df is disabled); only the current count is reported.
    print(11,len(comment_df))

    print('remove comments that correspond to deleted threads...')
    post_ids = set(post_df.index)
    comment_df = comment_df[comment_df.post_id.isin(post_ids)].copy()
    print(12,len(comment_df))

    print('computing score...')
    post_df['score']    =    post_df['text'].apply(lambda x: utils.calculateParagraphScore(x,sid))
    comment_df['score'] = comment_df['text'].apply(lambda x: utils.calculateParagraphScore(x,sid))

    print("remove posts whose scores and posts whose comments' scores couldn't be computed...")
    # post_id (rather than link_id) is used for the comments because it is
    # guaranteed to match the post index even when link_id carries a prefix.
    posts_to_remove = set(post_df[post_df.score.isna()].index)
    posts_to_remove |= set(comment_df.loc[comment_df.score.isna(), 'post_id'])
    post_df = post_df.drop(index=list(posts_to_remove))
    # Safety net: one row per post id. (De-duplicating on created_utc could
    # drop distinct posts that happen to share a timestamp.)
    post_df = post_df[~post_df.index.duplicated(keep='first')].copy()
    print(13,len(post_df))

    print('remove comments whose posts were removed...')
    comment_df = comment_df[~comment_df.post_id.isin(posts_to_remove)].copy()
    print(14,len(comment_df))

    print('concatenating post and comment columns...')
    # Each of these columns becomes a list: the post's own value first, then the
    # values of its comments in the order given by the 'comments' column.
    post_df['is_post_author'] = post_df.apply(lambda x: [True] + [comment_df.loc[cid, 'is_post_author'] for cid in x.comments], axis=1)
    post_df['score'] = post_df.apply(lambda x: [x.score] + [comment_df.loc[cid, 'score'] for cid in x.comments], axis=1)
    post_df['created_utc'] = post_df.apply(lambda x: [x.created_utc] + [comment_df.loc[cid, 'created_utc'] for cid in x.comments], axis=1)

    # ATTENTION! seq_len is derived from the is_post_author list, which includes
    # the post itself at position 0: seq_len = findLast(is_post_author) + 1.
    print('computing seq_len...')
    post_df['seq_len'] = post_df.is_post_author.apply(findLast)+1
    post_df['seq_len'] = post_df.seq_len.astype(int)

    print('computing DistilBERT features...')
    post_df['features'] = utils.extractFeatures(post_df, tokenizer, model, device, batch_size = 512)
    comment_df['features'] = utils.extractFeatures(comment_df, tokenizer, model, device, batch_size = 512)

    print('concatenating post and comment features...')
    post_df['features'] = post_df.apply(lambda x: [x.features] + [comment_df.loc[cid, 'features'] for cid in x.comments], axis=1)

    print('computing additional features for pre-filtering...')
    seqlen_series = utils.getNonOverlappingThreads(post_df)
    # Only the minimum-length filter is applied here; the optional
    # "all gaps shorter than one day" filter is currently disabled.
    seqlen_series = seqlen_series[(seqlen_series>=2)]
    seqlen_series.name = 'filtered_seqlen'
    post_df = post_df.join(seqlen_series)

    validbranches_series = post_df.apply(utils.getValidBranches,axis=1)
    validbranches_series.name = 'valid_branches'
    post_df = post_df.join(validbranches_series)

    columns_to_drop = [column for column in ['pos','neg','neu','compound','num_comments','score_noavg'] if column in post_df.columns]
    if columns_to_drop:
        print('Dropping superfluous columns...')
        post_df = post_df.drop(columns=columns_to_drop)

    post_df.to_parquet(outpath)

    return post_df 

In [None]:

# Driver cell: build and save the filtered dataframe for every subreddit in SUBREDDITS.
import pdb
# NOTE: this rebinds `tqdm` to the plain console progress bar, replacing the
# notebook variant imported earlier from tqdm.notebook.
from tqdm import tqdm
# Subreddits to process; each needs its three pickle files in DATADIR.
SUBREDDITS = ["SuicideWatch"]
for subreddit in SUBREDDITS:
    # 'pkl' is the extension of the input pickle files; the returned dataframe
    # is not kept here (the function also writes it to parquet).
    create_subreddit_df(subreddit,'pkl')

Loading dicts...
Dataframe will be saved in /content/drive/MyDrive/MentalHealthShared/data/SuicideWatch_distilbert_filtered_posts.parquet
creating df from post2data...
1 250
joining with post2comments...
2 233
creating df from comment2data...
3 1475
remove comments that correspond to inexistent threads...
4 1475
compute is_post_author...
remove  posts without text...
5 1475
remove posts with less than 2 comments...
6 1432
remove posts with small text...
7 1401
remove removed or deleted posts...
8 854
remove deleted authors...
9 770
calculate sequence length when there are other commenters followed by post author...
Checking for missing threads in comment_df...


60it [00:00, 596.87it/s]

Comment:dbv0gei is missing from comment_df... Deleting thread
Comment:dbv0p23 is missing from comment_df... Deleting thread
Comment:dbv192h is missing from comment_df... Deleting thread
Comment:dbv1hxo is missing from comment_df... Deleting thread
Comment:dbv23cf is missing from comment_df... Deleting thread
Comment:dbv2741 is missing from comment_df... Deleting thread
Comment:dbv6ioo is missing from comment_df... Deleting thread
Comment:dbvgjdu is missing from comment_df... Deleting thread
Comment:dbvut35 is missing from comment_df... Deleting thread


120it [00:00, 557.92it/s]

Comment:dbuzq9e is missing from comment_df... Deleting thread
Comment:dbv0dp3 is missing from comment_df... Deleting thread
Comment:dbv1zr9 is missing from comment_df... Deleting thread


176it [00:00, 526.24it/s]

Comment:dbv0gei is missing from comment_df... Deleting thread
Comment:dbv0p23 is missing from comment_df... Deleting thread
Comment:dbv192h is missing from comment_df... Deleting thread
Comment:dbv1hxo is missing from comment_df... Deleting thread
Comment:dbv23cf is missing from comment_df... Deleting thread
Comment:dbv2741 is missing from comment_df... Deleting thread
Comment:dbv6ioo is missing from comment_df... Deleting thread
Comment:dbvgjdu is missing from comment_df... Deleting thread
Comment:dbvut35 is missing from comment_df... Deleting thread
Comment:dbv0gei is missing from comment_df... Deleting thread
Comment:dbv0p23 is missing from comment_df... Deleting thread
Comment:dbv192h is missing from comment_df... Deleting thread
Comment:dbv1hxo is missing from comment_df... Deleting thread
Comment:dbv23cf is missing from comment_df... Deleting thread
Comment:dbv2741 is missing from comment_df... Deleting thread
Comment:dbv6ioo is missing from comment_df... Deleting thread
Comment:

229it [00:00, 492.35it/s]

Comment:dbv0gei is missing from comment_df... Deleting thread
Comment:dbv0p23 is missing from comment_df... Deleting thread
Comment:dbv192h is missing from comment_df... Deleting thread
Comment:dbv1hxo is missing from comment_df... Deleting thread
Comment:dbv23cf is missing from comment_df... Deleting thread
Comment:dbv2741 is missing from comment_df... Deleting thread
Comment:dbv6ioo is missing from comment_df... Deleting thread
Comment:dbvgjdu is missing from comment_df... Deleting thread
Comment:dbvut35 is missing from comment_df... Deleting thread


451it [00:00, 506.47it/s]

Comment:dbv0gei is missing from comment_df... Deleting thread
Comment:dbv0p23 is missing from comment_df... Deleting thread
Comment:dbv192h is missing from comment_df... Deleting thread
Comment:dbv1hxo is missing from comment_df... Deleting thread
Comment:dbv23cf is missing from comment_df... Deleting thread
Comment:dbv2741 is missing from comment_df... Deleting thread
Comment:dbv6ioo is missing from comment_df... Deleting thread
Comment:dbvgjdu is missing from comment_df... Deleting thread
Comment:dbvut35 is missing from comment_df... Deleting thread
Comment:dbv0gei is missing from comment_df... Deleting thread
Comment:dbv0p23 is missing from comment_df... Deleting thread
Comment:dbv192h is missing from comment_df... Deleting thread
Comment:dbv1hxo is missing from comment_df... Deleting thread
Comment:dbv23cf is missing from comment_df... Deleting thread
Comment:dbv2741 is missing from comment_df... Deleting thread
Comment:dbv6ioo is missing from comment_df... Deleting thread
Comment:

652it [00:01, 616.03it/s]

Comment:dbxq8d5 is missing from comment_df... Deleting thread
Comment:dbxq8d5 is missing from comment_df... Deleting thread
Comment:dbxq8d5 is missing from comment_df... Deleting thread
Comment:dbxq8d5 is missing from comment_df... Deleting thread
Comment:dbxq8d5 is missing from comment_df... Deleting thread


770it [00:01, 548.11it/s]


Comment:dbxq8d5 is missing from comment_df... Deleting thread
Comment:dbxq8d5 is missing from comment_df... Deleting thread
Comment:dbxq8d5 is missing from comment_df... Deleting thread
Comment:dbxq8d5 is missing from comment_df... Deleting thread
remove invalid threads...
10 751
remove comments without body...
             author  created_utc  num_comments  \
comments                                         
dbwc0z7   Sbliguess   1483343719             7   

                                                       text  \
comments                                                      
dbwc0z7   I've Always prefered being alone most of the t...   

                                                  text_post  created_utc_post  \
comments                                                                        
dbwc0z7   Yeah, I also realized one day that I was contr...        1483345228   

            link_id  parent_id  is_post_author post_id  
comments                                     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


remove posts whose scores and posts whose comments' scores couldn't be computed...
13 102
remove comments whose posts were removed...
14 751
concatenating post and comment columns...
                      author  \
5lk5tv             Sbliguess   
5ljqgf            Scutigera_   
5ldbdm      throw_away_posts   
5lhnvr        letsplayyatzee   
5ldpz3          SleepyEnigma   
...                      ...   
5lmxx3       autisticshit124   
5lnbh0          Chrisboy1990   
5ln97q  PsyCosocIaL_Stranger   
5lo4jn         glittershines   
5lohqb       throwawaywwwwwe   

                                              created_utc  num_comments  \
5lk5tv  [1483343719, 1483343719, 1483343719, 148334371...             7   
5ljqgf  [1483336883, 1483336883, 1483336883, 148333688...             5   
5ldbdm  [1483241256, 1483241256, 1483241256, 148324125...             8   
5lhnvr  [1483310407, 1483310407, 1483310407, 148331040...            30   
5ldpz3  [1483248019, 1483248019, 1483248019, 148324801...

100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
100%|██████████| 2/2 [00:05<00:00,  2.87s/it]


concatenating post and comment features...
computing additional features for pre-filtering...


100%|██████████| 97/97 [00:00<00:00, 522.37it/s]


AttributeError: ignored

In [None]:
# Mount Google Drive at /content/drive. The google.colab package is only
# available inside a Colab runtime, so this cell fails elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Experiments on the effect of etiquette words
THE NEXT SECTION IS DEVOTED TO THE EXPERIMENTS ON THE EFFECT THAT ETIQUETTE WORDS HAVE ON VADER. IT IS NOT NECESSARY FOR THE MAIN MODEL.

In [None]:
# Toggle for the optional etiquette-word experiments; set to True to run them.
RUN_ETTIQUETE_EXPS = False

In [None]:
def _strip_etiquette(text, etiquette_list):
    """Remove etiquette words from ``text`` and return ``(cleaned_text, found)``."""
    found = False
    for etiquette_word in etiquette_list:
        if etiquette_word in text:
            found = True
            text = text.replace(etiquette_word, "")
    # "ty" / "Ty" are removed only as whole words, so e.g. "empty" is untouched.
    text, num_ty = re.subn(r"\b[tT]y\b", "", text)
    return text, found or num_ty > 0


def filter_etiquette(subreddits,return_filtered_subreddits=False,etiquette_list = ["Thank You","Thank you","Thanks","Thx","Thankyou", "thank you", "thanks", "thx", "thankyou"],file_type = '_comment2data',file_extension='p'):
    """Strip etiquette words ("thanks", "thx", "ty", ...) from pickled comment data.

    For every subreddit the file
    ``{DATADIR}no_thx_2017/{subreddit}_{file_type}.{file_extension}`` is loaded,
    the etiquette words are removed from the ``'text'`` field of each entry, and
    the share of entries that contained at least one etiquette word is printed.
    Nothing is written back to disk.

    Parameters
    ----------
    subreddits : str or iterable of str
        One subreddit name or several of them.
    return_filtered_subreddits : bool, optional
        When true, return the filtered data; otherwise return ``None``.
    etiquette_list : list of str, optional
        Case-sensitive substrings to remove. The list is only read, never modified.
    file_type : str, optional
        Middle part of the file name. A leading underscore is optional, so both
        ``'_comment2data'`` and ``'comment2data'`` address the same file.
    file_extension : str, optional
        File extension; a leading dot is optional.

    Returns
    -------
    dict or None
        ``{subreddit: {entry_id: entry_dict}}`` with cleaned texts when
        ``return_filtered_subreddits`` is true, else ``None``.
    """
    if isinstance(subreddits, str):
        subreddits = [subreddits]
    else:
        subreddits = list(subreddits)
    # The path template already inserts the '_' and '.' separators.
    file_type = file_type.lstrip('_')
    file_extension = file_extension.lstrip('.')

    print("Initializing...")
    total_perc = []
    filtered_subs = {}
    for subreddit in subreddits:
        print(f"Getting r/{subreddit} data...")
        # NOTE: pickle.load can execute arbitrary code; only load files you trust.
        with open(f'{DATADIR}no_thx_2017/{subreddit}_{file_type}.{file_extension}','rb') as infile:
            curr_sub_comments = pickle.load(infile)
        num_comments_with_etiquette = 0
        total_comments = len(curr_sub_comments)
        for comment in tqdm(curr_sub_comments.values()):
            cleaned_text, found = _strip_etiquette(comment['text'], etiquette_list)
            if found:
                # Count each entry once, however many etiquette words it has,
                # so the reported percentage can never exceed 100%.
                num_comments_with_etiquette += 1
                comment['text'] = cleaned_text

        percentage = (num_comments_with_etiquette/total_comments) * 100 if total_comments else 0.0
        print(f"Percentage of posts or comments filtered in r/{subreddit}: {percentage}%")
        total_perc.append([num_comments_with_etiquette,total_comments])
        # The filtered data is only kept in memory; it is not written to disk.
        print("Saving changes...")
        filtered_subs[subreddit] = curr_sub_comments
        print(f"Done with r/{subreddit}")

    total_with_etiquette = sum(with_etiquette for with_etiquette, _ in total_perc)
    total_entries = sum(entries for _, entries in total_perc)
    overall_percentage = total_with_etiquette/total_entries * 100 if total_entries else 0.0
    print(f"Total Percentage of posts or comments with etiquette words: {overall_percentage}%")
    print("Done!")
    if(return_filtered_subreddits):
        return filtered_subs

In [None]:
# Optional experiment: strip etiquette words from the comments of each subreddit.
if RUN_ETTIQUETE_EXPS:
    subreddits = ['Anxiety','bipolar','depression','SuicideWatch']
    # Keep the returned dict: filter_etiquette does not write anything to disk,
    # so discarding the return value would lose the filtered data.
    filtered_subreddits = filter_etiquette(subreddits,file_type="comment2data",return_filtered_subreddits=True)