In [1]:
pip install kagglehub pandas nltk torch scikit-learn transformers

Note: you may need to restart the kernel to use updated packages.


# Dataset preparation

In [9]:
import kagglehub
import pandas as pd

# Download via Kaggle's API
path = kagglehub.dataset_download("suchintikasarkar/sentiment-analysis-for-mental-health")
print("Path to dataset files:", path)

Path to dataset files: /home/chengyi/.cache/kagglehub/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/versions/1


In [10]:
# Drop NA values & duplicates
df = pd.read_csv(path + "/Combined Data.csv")
print("Raw data length:", len(df))
print("After dropping na values:", len(df.dropna()))
print("After dropping duplicates:", len(df.dropna().drop_duplicates())) # No duplicates present
df = df.dropna().drop_duplicates().drop(columns=['Unnamed: 0'])

Raw data length: 53043
After dropping na values: 52681
After dropping duplicates: 52681


In [11]:
# Handle casing
df["statement"] = df["statement"].str.lower()
df["status"] = df["status"].str.lower()
df.head()

Unnamed: 0,statement,status
0,oh my gosh,anxiety
1,"trouble sleeping, confused mind, restless hear...",anxiety
2,"all wrong, back off dear, forward doubt. stay ...",anxiety
3,i've shifted my focus to something else but i'...,anxiety
4,"i'm restless and restless, it's been a month n...",anxiety


In [12]:
# Handle data types
df["statement"] = df["statement"].astype("string")
df["status"] = df["status"].astype("category")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52681 entries, 0 to 53042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   statement  52681 non-null  string  
 1   status     52681 non-null  category
dtypes: category(1), string(1)
memory usage: 874.9 KB


In [13]:
# Trailing / beginning whitespaces
df["statement"] = df["statement"].str.strip()
df["status"] = df["status"].str.strip()
df["statement"].str.len().describe()

count       52681.0
mean       578.6781
std      846.248914
min             2.0
25%            80.0
50%           317.0
75%           752.0
max         32759.0
Name: statement, dtype: Float64

In [14]:
# Non-alphanumeric characters only
df["statement"] = df["statement"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
df["statement"].str.len().describe()

count       52681.0
mean     564.987548
std      827.183457
min             2.0
25%            77.0
50%           308.0
75%           735.0
max         31499.0
Name: statement, dtype: Float64

In [15]:
# Normalize spacing
df["statement"] = df["statement"].str.replace(r"\s+", " ", regex=True)
df["statement"].str.len().describe()

count       52681.0
mean     564.154705
std      826.122235
min             1.0
25%            77.0
50%           308.0
75%           734.0
max         31499.0
Name: statement, dtype: Float64

In [35]:
df.to_csv('dataset.csv', index=False)

In [17]:
df = pd.read_csv('dataset.csv')
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52681 entries, 0 to 52680
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  52681 non-null  object
 1   status     52681 non-null  object
dtypes: object(2)
memory usage: 823.3+ KB
None


Unnamed: 0,statement,status
0,oh my gosh,anxiety
1,trouble sleeping confused mind restless heart ...,anxiety
2,all wrong back off dear forward doubt stay in ...,anxiety
3,ive shifted my focus to something else but im ...,anxiety
4,im restless and restless its been a month now ...,anxiety


# Split data

This was done by Ariel and imported into Google Drive, let's load it.

In [2]:
!pip install pandas transformers numpy nltk scikit-learn torch transformers

Collecting pandas
  Downloading pandas-3.0.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Collecting numpy
  Downloading numpy-2.4.2-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting nltk
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting torch
  Downloading torch-2.10.0-cp314-cp314-manylinux_2_28_x86_64.whl.metadata (31 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloa

In [1]:
import pandas as pd


# CHANGE ME
train_path = 'raw_train.csv'
val_path = 'raw_val.csv'
test_path = 'raw_test.csv'

# Read into df
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

# Create features

I made some helper functions to make the features

In [2]:
import make_features_helper as helper

dir(helper)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/chengyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/chengyi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1331.25it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_transform.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


['CountVectorizer',
 'DEFAULT_MODEL',
 'DEFAULT_TOKENIZER',
 'DEVICE',
 'DistilBertModel',
 'DistilBertTokenizer',
 'SIA',
 'SentimentIntensityAnalyzer',
 'TfidfVectorizer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'avg_sentence_length_in_characters',
 'avg_sentence_length_in_words',
 'avg_word_length',
 'character_count',
 'countvec',
 'get_embeddings',
 'nltk',
 'reps_ratio',
 'sia_sentiment',
 'tfidfvec',
 'torch',
 'word_count',
 'word_ratio']

In [16]:
from importlib import reload
reload(helper)

[nltk_data] Downloading package punkt to /home/chengyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/chengyi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1151.04it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_transform.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


<module 'make_features_helper' from '/home/chengyi/Projects/ECS171-Final-Project/make_features_helper.py'>

For the vectorization features, we only want to fit on the training data, but still transform on the other splits for easy access

In [7]:
# Initialize vectorizers
train_tfidf, tfidfvec = helper.tfidfvec(train_df['statement'])
train_count, countvec = helper.countvec(train_df['statement'])

In [8]:
import numpy as np

# Put into dataframe.
# For later use: np.vstack(df['tfidf_vec'])
train_df['tfidf_vec'] = list(train_tfidf.toarray())
train_df['count_vec'] = list(train_count.toarray())
val_df['tfidf_vec'] = list(tfidfvec.transform(val_df['statement']).toarray())
val_df['count_vec'] = list(countvec.transform(val_df['statement']).toarray())
test_df['tfidf_vec'] = list(tfidfvec.transform(test_df['statement']).toarray())
test_df['count_vec'] = list(countvec.transform(test_df['statement']).toarray())

For pretrained embedding features, we can iterate through the whole dataset no problem.

In [5]:
from tqdm import tqdm

bert_train = []
bert_val = []
bert_test = []
for statement in tqdm(train_df['statement']):
  bert_train.append(helper.get_embeddings(statement))
for statement in tqdm(val_df['statement']):
  bert_val.append(helper.get_embeddings(statement))
for statement in tqdm(test_df['statement']):
  bert_test.append(helper.get_embeddings(statement))

# For later use: np.vstack(df['bert_features'])
train_df['bert_features'] = bert_train
val_df['bert_features'] = bert_val
test_df['bert_features'] = bert_test

100%|██████████| 7903/7903 [00:30<00:00, 258.80it/s]


For the rest of the features, we'll do the same iterative process

In [10]:
# For word ratios
first_person_pronouns = ['i', 'me', 'my', 'mine', 'myself']
negatives = ['no', 'not', 'never', 'nothing', 'wrong', 'nope']
suicide_words = ['die', 'end', 'forever', 'leave', 'gone', 'suicide', 'kill']
ratio_items = {'first_person_ratio' : first_person_pronouns, 'negatives_ratio': negatives, 'suicide_ratio': suicide_words}

In [18]:
from nltk.tokenize import sent_tokenize

# List all functions
feature2function = {
  'word_ratio' : helper.word_ratio,
  'reps_ratio' : helper.reps_ratio,
  'character_count' : helper.character_count,
  'word_count' : helper.word_count,
  'avg_word_length' : helper.avg_word_length,
  'avg_sentence_length_in_words' : helper.avg_sentence_length_in_words,
  'avg_sentence_length_in_characters' : helper.avg_sentence_length_in_characters,
  'sia_sentiment' : helper.sia_sentiment,
}

for column in feature2function.keys():

  function = feature2function[column]

  if column == 'word_ratio':
    for ratio in ratio_items.keys():

      ratio_item = ratio_items[ratio]
      new_col_train, new_col_val, new_col_test = [], [], []

      for statement in train_df['statement']:
        new_col_train.append(function(statement, ratio_item))
      for statement in val_df['statement']:
        new_col_val.append(function(statement, ratio_item))
      for statement in test_df['statement']:
        new_col_test.append(function(statement, ratio_item))

      train_df[ratio] = new_col_train
      val_df[ratio] = new_col_val
      test_df[ratio] = new_col_test

  elif 'avg_sentence' in column:

    new_col_train, new_col_val, new_col_test = [], [], []

    for statement in train_df['statement']:
      sentences = sent_tokenize(statement)
      new_col_train.append(function(sentences))
    for statement in val_df['statement']:
      sentences = sent_tokenize(statement)
      new_col_val.append(function(sentences))
    for statement in test_df['statement']:
      sentences = sent_tokenize(statement)
      new_col_test.append(function(sentences))

    train_df[column] = new_col_train
    val_df[column] = new_col_val
    test_df[column] = new_col_test

  else:
    pass



In [21]:
display(train_df)
display(val_df)
display(test_df)

Unnamed: 0,statement,status,bert_features,tfidf_vec,count_vec,first_person_ratio,negatives_ratio,suicide_ratio,avg_sentence_length_in_words,avg_sentence_length_in_characters
0,to put some context i have been though some th...,suicidal,"[tensor(0.1133), tensor(-0.0328), tensor(0.096...","[0.048380833746359736, 0.0, 0.0, 0.02265680514...","[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...",0.154242,0.007712,0.017995,389.0,1871.0
1,michaelgrainger scyranth gigglessssssss yall a...,normal,"[tensor(-0.2632), tensor(0.0368), tensor(-0.01...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,0.000000,0.000000,6.0,61.0
2,so right now they are planning to meet at the ...,suicidal,"[tensor(0.1105), tensor(0.0768), tensor(0.1931...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.120690,0.043103,0.017241,116.0,517.0
3,being broke is no fun,normal,"[tensor(-0.0191), tensor(-0.0373), tensor(-0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,0.200000,0.000000,5.0,21.0
4,you dont need to worry about that,normal,"[tensor(-0.0658), tensor(0.0414), tensor(-0.11...","[0.0, 0.0, 0.0, 0.26469062066395976, 0.0, 0.0,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,0.000000,0.000000,7.0,33.0
...,...,...,...,...,...,...,...,...,...,...
36871,things were better for a little bit but now ye...,suicidal,"[tensor(0.0296), tensor(0.0840), tensor(0.2714...","[0.0, 0.0, 0.0, 0.06630158047757449, 0.0, 0.11...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.126214,0.029126,0.019417,103.0,511.0
36872,just curious because everything just says they...,depression,"[tensor(-0.0084), tensor(0.1656), tensor(0.072...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",0.061728,0.012346,0.000000,81.0,428.0
36873,sweating when hypomanic diagnosed and medicate...,bipolar,"[tensor(0.0761), tensor(0.0824), tensor(-0.026...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.042553,0.000000,0.000000,47.0,273.0
36874,admitting to yourself that you have been doing...,depression,"[tensor(0.1545), tensor(0.1348), tensor(-0.156...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.066667,0.026667,0.000000,75.0,364.0


Unnamed: 0,statement,status,bert_features,tfidf_vec,count_vec,first_person_ratio,negatives_ratio,suicide_ratio,avg_sentence_length_in_words,avg_sentence_length_in_characters
0,marthastewart proof,normal,"[tensor(-0.4538), tensor(-0.0184), tensor(-0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,0.000000,0.000000,2.0,19.0
1,when i get told i am a good guy it sometimes d...,depression,"[tensor(0.0117), tensor(0.1366), tensor(-0.026...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.160000,0.024000,0.000000,125.0,575.0
2,kuans our tp will start noworench lang sakalam,normal,"[tensor(-0.1735), tensor(-0.0358), tensor(0.04...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,0.000000,0.000000,8.0,46.0
3,sense of fear have you ever felt a sense of fe...,stress,"[tensor(-0.0940), tensor(0.1250), tensor(-0.04...","[0.0, 0.0, 0.0, 0.04309118446287753, 0.0, 0.07...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,0.000000,0.000000,155.0,911.0
4,i wont be able to take my latuda for the weeke...,bipolar,"[tensor(0.0041), tensor(0.1421), tensor(0.3476...","[0.0, 0.13908048873273154, 0.13953907255749148...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0.113924,0.012658,0.000000,79.0,396.0
...,...,...,...,...,...,...,...,...,...,...
7897,it s tuesday evening and i haven t been able t...,depression,"[tensor(0.1243), tensor(0.2234), tensor(0.1163...","[0.0, 0.03951822034079036, 0.03964852198695735...","[0, 1, 1, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, ...",0.038934,0.014344,0.000000,488.0,2533.0
7898,i figure i will leave a note here and in my no...,suicidal,"[tensor(0.0980), tensor(0.0908), tensor(0.2472...","[0.0, 0.0, 0.0, 0.06124285758120328, 0.0, 0.0,...","[0, 0, 0, 4, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, ...",0.105263,0.028822,0.002506,798.0,3779.0
7899,suicidal thoughts are nothing new to me howeve...,suicidal,"[tensor(0.1655), tensor(0.1871), tensor(0.1810...","[0.0, 0.05699396505818979, 0.05718188868948248...","[0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",0.140496,0.024793,0.000000,242.0,1162.0
7900,if this is not allowed then please removei am ...,suicidal,"[tensor(0.1321), tensor(0.1497), tensor(0.0008...","[0.0, 0.0, 0.0, 0.07920137744002184, 0.0, 0.0,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.038961,0.025974,0.000000,77.0,408.0


Unnamed: 0,statement,status,bert_features,tfidf_vec,count_vec,first_person_ratio,negatives_ratio,suicide_ratio,avg_sentence_length_in_words,avg_sentence_length_in_characters
0,i feel a colon cancer meltdown coming on i pro...,anxiety,"[tensor(0.0504), tensor(0.3085), tensor(0.1517...","[0.0, 0.0, 0.0, 0.2325254594824028, 0.09922218...","[0, 0, 0, 5, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.085427,0.010050,0.000000,199.0,1062.0
1,perghh lame crazy xopen twitter,normal,"[tensor(-0.0395), tensor(-0.0696), tensor(-0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,0.000000,0.000000,5.0,31.0
2,jing my eyes are swollen from being bitten is ...,normal,"[tensor(0.1794), tensor(0.1304), tensor(-0.069...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.066667,0.000000,0.000000,15.0,72.0
3,my parents really want me to go to college but...,suicidal,"[tensor(0.0960), tensor(0.1084), tensor(0.0376...","[0.0, 0.06736962333709838, 0.0675917581586256,...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.109195,0.045977,0.000000,174.0,841.0
4,i have been suffering for eight long month aft...,depression,"[tensor(0.0799), tensor(0.0757), tensor(-0.018...","[0.0, 0.0, 0.0, 0.019489295076261143, 0.0, 0.0...","[0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...",0.075901,0.017078,0.011385,527.0,2545.0
...,...,...,...,...,...,...,...,...,...,...
7898,well hold on a little longer,normal,"[tensor(0.0020), tensor(-0.1211), tensor(0.077...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,0.000000,0.000000,6.0,28.0
7899,this feels very weird to do this but maybe i j...,suicidal,"[tensor(0.2397), tensor(0.1970), tensor(0.0407...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.116105,0.029963,0.003745,267.0,1308.0
7900,should i tell my professor im sick and miss la...,bipolar,"[tensor(-0.0355), tensor(0.1879), tensor(0.264...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0.116022,0.000000,0.000000,181.0,896.0
7901,i am going through a huge rough patch right no...,depression,"[tensor(0.2818), tensor(0.0192), tensor(0.1761...","[0.15686161134447255, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.069444,0.027778,0.000000,72.0,339.0


In [None]:
# Save (8 mins to save)
train_df.to_csv('features_train.csv')
val_df.to_csv('features_val.csv')
test_df.to_csv('features_test.csv')

In [24]:
# Load
import pandas as pd

features_train = pd.read_csv('features_train.csv')
features_val = pd.read_csv('features_val.csv')
features_test = pd.read_csv('features_test.csv')

display(features_train)
display(features_val)
display(features_test)

Unnamed: 0.1,Unnamed: 0,statement,status,bert_features,tfidf_vec,count_vec,first_person_ratio,negatives_ratio,suicide_ratio,avg_sentence_length_in_words,avg_sentence_length_in_characters
0,0,to put some context i have been though some th...,suicidal,"tensor([ 1.1332e-01, -3.2757e-02, 9.6535e-02,...",[0.04838083 0. 0. 0.02265681 0...,[ 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 ...,0.154242,0.007712,0.017995,389.0,1871.0
1,1,michaelgrainger scyranth gigglessssssss yall a...,normal,"tensor([-2.6322e-01, 3.6765e-02, -1.6883e-02,...",[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.000000,0.000000,0.000000,6.0,61.0
2,2,so right now they are planning to meet at the ...,suicidal,"tensor([ 1.1049e-01, 7.6769e-02, 1.9305e-01,...",[0. 0. 0. 0. 0...,[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0.120690,0.043103,0.017241,116.0,517.0
3,3,being broke is no fun,normal,"tensor([-1.9146e-02, -3.7258e-02, -4.3136e-03,...",[0. 0. 0. 0. 0...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.000000,0.200000,0.000000,5.0,21.0
4,4,you dont need to worry about that,normal,"tensor([-6.5769e-02, 4.1372e-02, -1.1422e-01,...",[0. 0. 0. 0.26469062 0...,[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.000000,0.000000,0.000000,7.0,33.0
...,...,...,...,...,...,...,...,...,...,...,...
36871,36871,things were better for a little bit but now ye...,suicidal,"tensor([ 2.9636e-02, 8.3965e-02, 2.7136e-01,...",[0. 0. 0. 0.06630158 0...,[0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0...,0.126214,0.029126,0.019417,103.0,511.0
36872,36872,just curious because everything just says they...,depression,"tensor([-8.3549e-03, 1.6555e-01, 7.2433e-02,...",[0. 0. 0. 0. 0...,[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0...,0.061728,0.012346,0.000000,81.0,428.0
36873,36873,sweating when hypomanic diagnosed and medicate...,bipolar,"tensor([ 7.6064e-02, 8.2418e-02, -2.6254e-02,...",[0. 0. 0. 0. 0...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.042553,0.000000,0.000000,47.0,273.0
36874,36874,admitting to yourself that you have been doing...,depression,"tensor([ 1.5449e-01, 1.3482e-01, -1.5622e-01,...",[0. 0. 0. 0. 0...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.066667,0.026667,0.000000,75.0,364.0


Unnamed: 0.1,Unnamed: 0,statement,status,bert_features,tfidf_vec,count_vec,first_person_ratio,negatives_ratio,suicide_ratio,avg_sentence_length_in_words,avg_sentence_length_in_characters
0,0,marthastewart proof,normal,"tensor([-4.5381e-01, -1.8423e-02, -4.4112e-02,...",[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.000000,0.000000,0.000000,2.0,19.0
1,1,when i get told i am a good guy it sometimes d...,depression,"tensor([ 1.1682e-02, 1.3658e-01, -2.6794e-02,...",[0. 0. 0. 0. 0...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0...,0.160000,0.024000,0.000000,125.0,575.0
2,2,kuans our tp will start noworench lang sakalam,normal,"tensor([-1.7350e-01, -3.5781e-02, 4.1317e-02,...",[0. 0. 0. 0. 0...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.000000,0.000000,0.000000,8.0,46.0
3,3,sense of fear have you ever felt a sense of fe...,stress,"tensor([-9.4029e-02, 1.2503e-01, -4.5114e-02,...",[0. 0. 0. 0.04309118 0...,[0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.000000,0.000000,0.000000,155.0,911.0
4,4,i wont be able to take my latuda for the weeke...,bipolar,"tensor([ 4.1228e-03, 1.4209e-01, 3.4761e-01,...",[0. 0.13908049 0.13953907 0. 0...,[0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0...,0.113924,0.012658,0.000000,79.0,396.0
...,...,...,...,...,...,...,...,...,...,...,...
7897,7897,it s tuesday evening and i haven t been able t...,depression,"tensor([ 1.2434e-01, 2.2341e-01, 1.1633e-01,...",[0. 0.03951822 0.03964852 0.09504927 0...,[ 0 1 1 4 0 0 0 0 1 0 0 0 0 0 3 ...,0.038934,0.014344,0.000000,488.0,2533.0
7898,7898,i figure i will leave a note here and in my no...,suicidal,"tensor([ 9.7973e-02, 9.0796e-02, 2.4725e-01,...",[0. 0. 0. 0.06124286 0...,[ 0 0 0 4 0 0 3 0 0 0 0 0 0 0 2 ...,0.105263,0.028822,0.002506,798.0,3779.0
7899,7899,suicidal thoughts are nothing new to me howeve...,suicidal,"tensor([ 1.6552e-01, 1.8709e-01, 1.8102e-01,...",[0. 0.05699397 0.05718189 0.06854098 0...,[ 0 1 1 2 0 0 0 0 0 0 0 0 0 1 0 ...,0.140496,0.024793,0.000000,242.0,1162.0
7900,7900,if this is not allowed then please removei am ...,suicidal,"tensor([ 1.3215e-01, 1.4967e-01, 7.9939e-04,...",[0. 0. 0. 0.07920138 0...,[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.038961,0.025974,0.000000,77.0,408.0


Unnamed: 0.1,Unnamed: 0,statement,status,bert_features,tfidf_vec,count_vec,first_person_ratio,negatives_ratio,suicide_ratio,avg_sentence_length_in_words,avg_sentence_length_in_characters
0,0,i feel a colon cancer meltdown coming on i pro...,anxiety,"tensor([ 5.0419e-02, 3.0852e-01, 1.5171e-01,...",[0. 0. 0. 0.23252546 0...,[0 0 0 5 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0...,0.085427,0.010050,0.000000,199.0,1062.0
1,1,perghh lame crazy xopen twitter,normal,"tensor([-3.9505e-02, -6.9615e-02, -8.4033e-03,...",[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.000000,0.000000,0.000000,5.0,31.0
2,2,jing my eyes are swollen from being bitten is ...,normal,"tensor([ 1.7944e-01, 1.3035e-01, -6.9125e-02,...",[0. 0. 0. 0. 0...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.066667,0.000000,0.000000,15.0,72.0
3,3,my parents really want me to go to college but...,suicidal,"tensor([ 9.5975e-02, 1.0840e-01, 3.7629e-02,...",[0. 0.06736962 0.06759176 0. 0...,[ 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 ...,0.109195,0.045977,0.000000,174.0,841.0
4,4,i have been suffering for eight long month aft...,depression,"tensor([ 7.9945e-02, 7.5668e-02, -1.8524e-02,...",[0. 0. 0. 0.0194893 0...,[ 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 ...,0.075901,0.017078,0.011385,527.0,2545.0
...,...,...,...,...,...,...,...,...,...,...,...
7898,7898,well hold on a little longer,normal,"tensor([ 2.0368e-03, -1.2113e-01, 7.7899e-02,...",[0. 0. 0. 0. 0...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.000000,0.000000,0.000000,6.0,28.0
7899,7899,this feels very weird to do this but maybe i j...,suicidal,"tensor([ 2.3974e-01, 1.9701e-01, 4.0718e-02,...",[0. 0. 0. 0. 0...,[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0.116105,0.029963,0.003745,267.0,1308.0
7900,7900,should i tell my professor im sick and miss la...,bipolar,"tensor([-3.5477e-02, 1.8785e-01, 2.6438e-01,...",[0. 0. 0. 0. 0...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0...,0.116022,0.000000,0.000000,181.0,896.0
7901,7901,i am going through a huge rough patch right no...,depression,"tensor([ 2.8183e-01, 1.9205e-02, 1.7606e-01,...",[0.15686161 0. 0. 0. 0...,[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0.069444,0.027778,0.000000,72.0,339.0
