In [1]:
pip install kagglehub pandas nltk torch scikit-learn transformers

Note: you may need to restart the kernel to use updated packages.


# Dataset preparation

In [9]:
import kagglehub
import pandas as pd

# Download via Kaggle's API
path = kagglehub.dataset_download("suchintikasarkar/sentiment-analysis-for-mental-health")
print("Path to dataset files:", path)

Path to dataset files: /home/chengyi/.cache/kagglehub/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/versions/1


In [10]:
# Drop NA values & duplicates
df = pd.read_csv(path + "/Combined Data.csv")
print("Raw data length:", len(df))
print("After dropping na values:", len(df.dropna()))
print("After dropping duplicates:", len(df.dropna().drop_duplicates())) # No duplicates present
df = df.dropna().drop_duplicates().drop(columns=['Unnamed: 0'])

Raw data length: 53043
After dropping na values: 52681
After dropping duplicates: 52681


In [11]:
# Handle casing
df["statement"] = df["statement"].str.lower()
df["status"] = df["status"].str.lower()
df.head()

Unnamed: 0,statement,status
0,oh my gosh,anxiety
1,"trouble sleeping, confused mind, restless hear...",anxiety
2,"all wrong, back off dear, forward doubt. stay ...",anxiety
3,i've shifted my focus to something else but i'...,anxiety
4,"i'm restless and restless, it's been a month n...",anxiety


In [12]:
# Handle data types
df["statement"] = df["statement"].astype("string")
df["status"] = df["status"].astype("category")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52681 entries, 0 to 53042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   statement  52681 non-null  string  
 1   status     52681 non-null  category
dtypes: category(1), string(1)
memory usage: 874.9 KB


In [13]:
# Trailing / beginning whitespaces
df["statement"] = df["statement"].str.strip()
df["status"] = df["status"].str.strip()
df["statement"].str.len().describe()

count       52681.0
mean       578.6781
std      846.248914
min             2.0
25%            80.0
50%           317.0
75%           752.0
max         32759.0
Name: statement, dtype: Float64

In [14]:
# Non-alphanumeric characters only
df["statement"] = df["statement"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
df["statement"].str.len().describe()

count       52681.0
mean     564.987548
std      827.183457
min             2.0
25%            77.0
50%           308.0
75%           735.0
max         31499.0
Name: statement, dtype: Float64

In [15]:
# Normalize spacing
df["statement"] = df["statement"].str.replace(r"\s+", " ", regex=True)
df["statement"].str.len().describe()

count       52681.0
mean     564.154705
std      826.122235
min             1.0
25%            77.0
50%           308.0
75%           734.0
max         31499.0
Name: statement, dtype: Float64

In [35]:
df.to_csv('dataset.csv', index=False)

In [17]:
df = pd.read_csv('dataset.csv')
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52681 entries, 0 to 52680
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  52681 non-null  object
 1   status     52681 non-null  object
dtypes: object(2)
memory usage: 823.3+ KB
None


Unnamed: 0,statement,status
0,oh my gosh,anxiety
1,trouble sleeping confused mind restless heart ...,anxiety
2,all wrong back off dear forward doubt stay in ...,anxiety
3,ive shifted my focus to something else but im ...,anxiety
4,im restless and restless its been a month now ...,anxiety


# Split data

This was done by Ariel and imported into Google Drive, let's load it.

In [None]:
import pandas as pd


# CHANGE ME
train_path = 'raw_train.csv'
val_path = 'raw_val.csv'
test_path = 'raw_test.csv'

# Read into df
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

# Create features

I made some helper functions to make the features

In [77]:
import make_features_helper as helper

dir(helper)

['CountVectorizer',
 'DEFAULT_MODEL',
 'DEFAULT_TOKENIZER',
 'DEVICE',
 'DistilBertModel',
 'DistilBertTokenizer',
 'SIA',
 'SentimentIntensityAnalyzer',
 'TfidfVectorizer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'avg_sentence_length_in_characters',
 'avg_sentence_length_in_words',
 'avg_word_length',
 'character_count',
 'countvec',
 'get_embeddings',
 'nltk',
 'reps_ratio',
 'sia_sentiment',
 'tfidfvec',
 'torch',
 'word_count',
 'word_ratio']

In [79]:
from importlib import reload
reload(helper)

[nltk_data] Downloading package punkt to /home/chengyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/chengyi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


<module 'make_features_helper' from '/home/chengyi/Desktop/Classes/ECS171/ECS171-Final-Project/make_features_helper.py'>

For the vectorization features, we only want to fit on the training data, but still transform on the other splits for easy access

In [36]:
# Initialize vectorizers
train_tfidf, tfidfvec = helper.tfidfvec(train_df['statement'])
train_count, countvec = helper.countvec(train_df['statement'])

In [None]:
import numpy as np

# Put into dataframe.
# For later use: np.vstack(df['tfidf_vec'])
train_df['tfidf_vec'] = list(train_tfidf.toarray())
train_df['count_vec'] = list(train_count.toarray())
val_df['tfidf_vec'] = list(tfidfvec.transform(val_df['statement']).toarray())
val_df['count_vec'] = list(countvec.transform(val_df['statement']).toarray())
test_df['tfidf_vec'] = list(tfidfvec.transform(test_df['statement']).toarray())
test_df['count_vec'] = list(countvec.transform(test_df['statement']).toarray())

For pretrained embedding features, we can iterate through the whole dataset no problem.

In [70]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

DEFAULT_TOKENIZER = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
DEFAULT_MODEL = DistilBertModel.from_pretrained('distilbert-base-uncased')
DEFAULT_MODEL.to('cpu')

# 2. Tokenize input
text = "Replace me with any text you'd like."
encoded_input = DEFAULT_TOKENIZER(text, return_tensors='pt', padding=True, truncation=True, max_length=15000)

# 3. Get raw model outputs
with torch.no_grad():
    outputs = DEFAULT_MODEL(**encoded_input)
outputs.last_hidden_state[:, 0, :]

tensor([[-1.4453e-02, -2.7182e-01, -1.4040e-01, -6.6006e-02,  4.6744e-02,
         -3.6045e-01,  1.9694e-01,  5.6017e-01,  1.7628e-01, -3.7279e-01,
         -4.5718e-02, -1.4598e-01, -2.5549e-01,  2.5197e-01,  2.3573e-02,
          3.6898e-02, -4.4133e-02,  2.6953e-01, -6.9140e-03,  1.4042e-01,
         -6.6925e-02, -1.7396e-01, -2.6942e-01,  5.3961e-02,  2.6116e-02,
         -2.0509e-02,  2.1273e-02, -1.2333e-01,  4.3591e-02, -2.3034e-01,
          7.1838e-02,  1.8205e-01, -3.1158e-01, -2.8259e-01, -7.8816e-02,
          2.0568e-01,  5.6633e-02, -1.0214e-01, -1.4757e-01, -3.1617e-02,
         -5.3188e-01,  7.5297e-02,  1.4844e-01,  8.5746e-03,  3.0512e-01,
         -1.6850e-01, -2.6342e+00, -1.2698e-01, -2.5662e-01, -4.0931e-01,
          1.6262e-02,  1.2963e-01,  3.2765e-01,  4.0754e-01,  6.3290e-01,
          3.3945e-01, -2.4992e-01,  2.2208e-01, -7.5843e-03,  4.0662e-01,
          1.1378e-01,  6.3623e-02, -2.5635e-01, -1.5354e-01, -1.0807e-02,
          4.6236e-02,  2.0253e-02,  6.

In [None]:
bert_train, bert_val, bert_test = [], [], []
for statement in train_df['statement']:
  bert_train.append(helper.get_embeddings(statement))
for statement in val_df['statement']:
  bert_val.append(helper.get_embeddings(statement))
for statement in test_df['statmeent']:
  bert_test.append(helper.get_embeddings(statement))

# For later use: np.vstack(df['bert_features'])
train_df['bert_features'] = bert_train
val_df['bert_features'] = bert_val
test_df['bert_features'] = bert_test

For the rest of the features, we'll do the same iterative process

In [None]:
# For word ratios
first_person_pronouns = ['i', 'me', 'my', 'mine', 'myself']
negatives = ['no', 'not', 'never', 'nothing', 'wrong', 'nope']
suicide_words = ['die', 'end', 'forever', 'leave', 'gone', 'suicide', 'kill']
ratio_items = {'first_person_ratio' : first_person_pronouns, 'negatives_ratio': negatives, 'suicide_ratio': suicide_words}

In [None]:
from nltk.tokenize import sent_tokenize

# List all functions
feature2function = {
  'word_ratio' : helper.word_ratio,
  'reps_ratio' : helper.reps_ratio,
  'character_count' : helper.character_count,
  'word_count' : helper.word_count,
  'avg_word_length' : helper.avg_word_length,
  'avg_sentence_length_in_words' : helper.avg_sentence_length_in_words,
  'avg_sentence_length_in_characters' : helper.avg_sentence_length_in_characters,
  'sia_sentiment' : helper.sia_sentiment,
}

for column in feature2function.keys():

  function = feature2function[column]

  if column == 'word_ratio':
    for ratio in ratio_items.keys():

      ratio_item = ratio_items[ratio]
      new_col_train, new_col_val, new_col_test = [], [], []

      for statement in train_df['statement']:
        new_col_train.append(function(statement, ratio_item))
      for statement in val_df['statement']:
        new_col_val.append(function(statement, ratio_item))
      for statement in test_df['statmeent']:
        new_col_test.append(function(statement, ratio_item))

      train_df[ratio] = new_col_train
      val_df[ratio] = new_col_val
      test_df[ratio] = new_col_test

  elif 'avg_sentence' in column:

    new_col_train, new_col_val, new_col_test = [], [], []

    for statement in train_df['statement']:
      sentences = sent_tokenize(statement)
      new_col_train.append(function(sentences))
    for statement in val_df['statement']:
      sentences = sent_tokenize(statement)
      new_col_val.append(function(sentences))
    for statement in test_df['statmeent']:
      sentences = sent_tokenize(statement)
      new_col_test.append(function(sentences))

    train_df[column] = new_col_train
    val_df[column] = new_col_val
    test_df[column] = new_col_test

  else:
    pass



In [None]:
# Save
train_df.to_csv('features_train.csv')
val_df.to_csv('features_val.csv')
test_df.to_csv('features_test.csv')