<a href="https://colab.research.google.com/github/georgiepayne/reddit_mental_health_sentimentanalysis/blob/master/reddit_mental_health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install nltk  # Package used for prepocessing data
!pip install pyspellchecker # Package used to correct spelling



In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import string
import nltk
from nltk.corpus import stopwords  # To help get a list of stopwords to exclude
nltk.download('stopwords') # The word bank we references to delete stopwords
from collections import Counter # To help count the frequent words
from nltk.stem.porter import PorterStemmer  # To help stem words
from spellchecker import SpellChecker  # To help correct spelling
import re  # To help remove urls

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Download the dataset from the Hugging Face Hub and print the type of its 'train' split
ds_load = load_dataset("solomonk/reddit_mental_health_posts")
print(type(ds_load['train']))

# Convert the 'train' split into a pandas DataFrame
df = pd.DataFrame(ds_load['train'])

# Blank out the body of any post whose text is a deleted/removed placeholder.
# NOTE(review): the original intent may have been to substitute the title for
# the body; the code below writes an empty string instead — confirm which is wanted.

# Strip surrounding whitespace from the `body` column
body_stripped = df['body'].str.strip()

# Flag rows whose stripped `body` is exactly `[deleted]` or `[removed]`
is_deleted = body_stripped.isin(['[deleted]', '[removed]'])

# Overwrite the flagged bodies with an empty string
df.loc[is_deleted, 'body'] = ""

# Preview the first 10 rows
df.head(10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Repo card metadata block was not found. Setting CardData to empty.


<class 'datasets.arrow_dataset.Dataset'>


Unnamed: 0,author,body,created_utc,id,num_comments,score,subreddit,title,upvote_ratio,url
0,HotConversation1273,A few months ago I was accepted into this full...,2021-12-22T18:32:56.000Z,rmbjwb,1,1,ADHD,I get extremely anxious if I’m not working 24/7,1.0,https://www.reddit.com/r/ADHD/comments/rmbjwb/...
1,snorefestt,"Hey guys, I was curious if anyone else has the...",2021-12-22T18:24:25.000Z,rmbd1y,3,5,ADHD,"I can't will myself to clean my own house, but...",1.0,https://www.reddit.com/r/ADHD/comments/rmbd1y/...
2,etyf12,\n\ni have 6 exams in the next 2 weeks one of...,2021-12-22T18:22:52.000Z,rmbbvu,1,2,ADHD,i need some help,1.0,https://www.reddit.com/r/ADHD/comments/rmbbvu/...
3,GetHairOrDieTryin,Is there anyone out there that is struggling w...,2021-12-22T18:20:35.000Z,rmba1t,3,2,ADHD,Anyone up for a chat?,1.0,https://www.reddit.com/r/ADHD/comments/rmba1t/...
4,ZeroTransPat,"Whenever I get hungry, I never eat because I d...",2021-12-22T18:18:47.000Z,rmb8lm,2,1,ADHD,Figuring out what to eat sucks,1.0,https://www.reddit.com/r/ADHD/comments/rmb8lm/...
5,[deleted],,2021-12-22T18:18:19.000Z,rmb88p,1,1,ADHD,Watching movies at x1.5 playback speed,1.0,https://www.reddit.com/r/ADHD/comments/rmb88p/...
6,Used_Inspection2618,I’m on 20 mg of Lexapro and 50 mg Vyvanse and ...,2021-12-22T18:15:52.000Z,rmb6f2,4,1,ADHD,Drinking while on meds?,1.0,https://www.reddit.com/r/ADHD/comments/rmb6f2/...
7,TheToastyToad,"I've recently had a big lifestyle change, with...",2021-12-22T18:13:15.000Z,rmb47u,1,2,ADHD,Using Christmas to take a break,1.0,https://www.reddit.com/r/ADHD/comments/rmb47u/...
8,Field-cave1519,"I'm a newly diagnosed 42 yr old female, who st...",2021-12-22T18:11:07.000Z,rmb2i8,1,1,ADHD,Does everyone get the euphoria feeling when th...,1.0,https://www.reddit.com/r/ADHD/comments/rmb2i8/...
9,Away_Entertainment29,TL;DR - rough time titrating on concerta for t...,2021-12-22T18:09:50.000Z,rmb1ib,1,1,ADHD,Xaggatin: zoned out and angry,1.0,https://www.reddit.com/r/ADHD/comments/rmb1ib/...


In [None]:
# Helper functions that the preprocess function calls

# Deletion table for ASCII punctuation, built once instead of on every call
_ASCII_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with punctuation characters removed.

    Every character in `string.punctuation` is deleted. In addition, any
    remaining character whose Unicode general category starts with "P"
    (for example the typographic apostrophe U+2019 or curly quotes) is
    deleted, so that "it’s" is treated the same way as "it's".

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is left untouched.
    """
    stripped = text.translate(_ASCII_PUNCTUATION_TABLE)

    # Fast path: pure-ASCII text is fully handled by the translation table.
    if stripped.isascii():
        return stripped

    import unicodedata  # local import: only needed for non-ASCII text

    return "".join(
        ch for ch in stripped
        if not unicodedata.category(ch).startswith("P")
    )

# Cache for the NLTK English stopword set; filled on first use so the word
# list is loaded once rather than once per row.
_ENGLISH_STOPWORDS = None


def remove_stopwords(text, stop_words=None):
  """Return `text` with stopwords removed.

  The input is converted with `str()`, split on whitespace, and every token
  found in the stopword set is dropped. The remaining tokens are re-joined
  with single spaces. Matching is exact (case-sensitive).

  Args:
    text: The value to clean.
    stop_words: Optional collection of words to drop. When omitted, the
      English stopword list from `nltk.corpus.stopwords` is used.

  Returns:
    The filtered string.
  """
  global _ENGLISH_STOPWORDS

  if stop_words is None:
    if _ENGLISH_STOPWORDS is None:
      _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    stop_words = _ENGLISH_STOPWORDS

  return " ".join(word for word in str(text).split() if word not in stop_words)

def remove_frequent_and_rare(data, top_n=10, min_frequency=10, columns=("body", "title")):
  """Drop the most frequent and the rarest words from the text columns.

  Word counts are pooled across all of `columns`. A word is removed when it
  is among the `top_n` most common words overall, or when it occurs fewer
  than `min_frequency` times overall.

  The columns are overwritten on `data` itself, and `data` is returned.

  Args:
    data: DataFrame containing the text columns.
    top_n: Number of most common words to remove.
    min_frequency: Words with a total count below this value are removed.
    columns: Names of the text columns to count and filter.

  Returns:
    The same DataFrame, with the text columns filtered.
  """
  counter = Counter()

  # Count words across every text column
  for col in columns:
    # Fill missing values first: astype(str) on its own would turn them into
    # the literal tokens "None"/"nan", which would then be counted as words.
    data[col] = data[col].fillna("").astype(str)

    for text in data[col]:
      counter.update(text.split())

  # The top N most frequent words
  most_frequent_words = {word for word, _ in counter.most_common(top_n)}

  # The rare words (fewer than min_frequency occurrences)
  rare_words = {word for word, count in counter.items() if count < min_frequency}

  words_to_filter = most_frequent_words | rare_words

  # Rewrite each text column without the filtered words
  for col in columns:
    data[col] = [
      " ".join(word for word in text.split() if word not in words_to_filter)
      for text in data[col]
    ]

  return data


def stem_words(text):
  """Stem each whitespace-separated token of `text` with a PorterStemmer.

  Returns the stemmed tokens joined back together with single spaces.
  """
  porter = PorterStemmer()
  stemmed_tokens = (porter.stem(token) for token in text.split())
  return " ".join(stemmed_tokens)

# Parsed abbreviation tables keyed by file path, so each file is read and
# parsed once instead of once per processed row.
_ABBREVIATION_CACHE = {}


def _load_abbreviations(path):
  """Return the {abbreviation: expansion} dict parsed from the file at `path`."""
  if path not in _ABBREVIATION_CACHE:
    # Each line is expected to look like `abbreviation=expanded text`.
    # dtype=str keeps every entry a string so it can be joined back into text.
    table = pd.read_csv(
      path, sep="=", names=["abbreviation", "expanded"], header=None, dtype=str
    )
    # Rows with a missing side cannot be used for substitution
    table = table.dropna()
    _ABBREVIATION_CACHE[path] = dict(zip(table["abbreviation"], table["expanded"]))
  return _ABBREVIATION_CACHE[path]


def expand_abbreviations(text, path="abbreviations.txt"):
  """Replace abbreviations in `text` with their expanded form.

  `text` is split on whitespace; each token that exactly matches an
  abbreviation from the file at `path` is replaced by its expansion, and
  the tokens are re-joined with single spaces.

  The parsed file is cached per path, so later edits to the file are only
  picked up after this definition is re-run.

  Args:
    text: The string to expand.
    path: Location of the `abbreviation=expansion` file.

  Returns:
    The expanded string.
  """
  abbreviations_dictionary = _load_abbreviations(path)
  return " ".join(abbreviations_dictionary.get(word, word) for word in text.split())


def remove_urls(text):
  """Delete every `http://`, `https://` or `www.` URL from `text`.

  A URL runs up to the next whitespace character; the surrounding
  whitespace is left in place.
  """
  return re.sub(r'https?://\S+|www\.\S+', '', text)

In [None]:
# Preprocessing function that cleans the input data
def preprocess(data):
  """Clean the `body` and `title` columns of `data`.

  Steps, applied to both columns in this order:
    1. lowercase
    2. remove URLs
    3. remove punctuation
    4. remove stopwords
    5. remove the most frequent and the rarest words
    6. stem
    7. expand abbreviations

  URL removal runs before punctuation removal on purpose: once `://` and
  `.` have been stripped, the URL pattern can no longer match anything.

  The columns are assigned back onto the frame that was passed in, and that
  frame is returned.

  NOTE(review): stopword removal runs after punctuation removal, so
  contractions such as "don't" have already become "dont"; abbreviation
  expansion runs on stemmed tokens. Confirm both orderings are intended.

  Args:
    data: DataFrame with `body` and `title` text columns.

  Returns:
    The cleaned DataFrame.
  """
  text_columns = ["body", "title"]

  # Set everything to lowercase
  for col in text_columns:
    data[col] = data[col].str.lower()

  # Steps that work on one string at a time
  _apply_to_text_columns(data, remove_urls, text_columns)
  _apply_to_text_columns(data, remove_punctuation, text_columns)
  _apply_to_text_columns(data, remove_stopwords, text_columns)

  # Remove frequent words and rare words (needs counts over the whole frame)
  data = remove_frequent_and_rare(data)

  _apply_to_text_columns(data, stem_words, text_columns)
  _apply_to_text_columns(data, expand_abbreviations, text_columns)

  return data


def _apply_to_text_columns(data, func, columns):
  """Apply `func` to every non-None entry of each column in `columns`."""
  for col in columns:
    data[col] = data[col].apply(lambda text: func(text) if text is not None else text)


# Run the cleaning pipeline. preprocess() assigns the cleaned columns onto the
# frame it is given, so `df` itself is modified by this call.
preprocessed_df = preprocess(df)
# Plain-text preview of the first rows of the cleaned frame
print(preprocessed_df.head())

                author                                               body  \
0  HotConversation1273  month ago accept full softwar engin it’ made r...   
1           snorefestt  hey guy curiou anyon els issu apart fuck exagg...   
2               etyf12  6 exam next 2 week one monday havent studi ove...   
3    GetHairOrDieTryin  anyon struggl addadhd that’ interest chat bit ...   
4         ZeroTransPat  whenev hungri never eat eat end pizza easi obv...   

                created_utc      id  num_comments  score subreddit  \
0  2021-12-22T18:32:56.000Z  rmbjwb             1      1      ADHD   
1  2021-12-22T18:24:25.000Z  rmbd1y             3      5      ADHD   
2  2021-12-22T18:22:52.000Z  rmbbvu             1      2      ADHD   
3  2021-12-22T18:20:35.000Z  rmba1t             3      2      ADHD   
4  2021-12-22T18:18:47.000Z  rmb8lm             2      1      ADHD   

                                               title  upvote_ratio  \
0                             extrem anxiou wo

In [None]:
# Save the cleaned frame to preprocessed_data.csv; index=False leaves out the row index
preprocessed_df.to_csv('preprocessed_data.csv', index=False)

In [None]:
# Build the features from the cleaned frame explicitly, rather than from `df`,
# which only holds cleaned text because preprocess() modifies it in place.
X = preprocessed_df[['body', 'title']]   # Input the model uses to make a prediction
y = preprocessed_df['subreddit']         # Mental health issue the model should predict

# Hold out 30% of the posts for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# bag of words - sklearn count vectorizer