In [1]:

import pandas as pd
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

In [2]:
# --------------------
# Load and Clean Data
# --------------------
df = pd.read_csv("data/Depression_Severity_Levels_Dataset.csv")
df = df.dropna(subset=['text']).copy()

In [3]:
df['label'] = df['label'].str.lower().str.strip().str.replace(")", "", regex=False)

In [4]:
print(df['label'])

0            mild
1         minimum
2         minimum
3            mild
4        moderate
           ...   
41868     minimum
41869     minimum
41870     minimum
41871     minimum
41872     minimum
Name: label, Length: 41859, dtype: object


In [5]:
# Binary classification
yes_labels = ['mild', 'moderate', 'severe', 'extreme']
no_labels = ['minimum', 'none']
df['binary_label'] = df['label'].apply(lambda x: 'YES' if x in yes_labels else ('NO' if x in no_labels else 'UNKNOWN'))
df_cleaned = df[df['binary_label'] != 'UNKNOWN'].copy()

In [6]:
df_cleaned.head()

Unnamed: 0,text,label,binary_label
0,"He said he had not felt that way before, sugge...",mild,YES
1,"Hey there r/assistance, Not sure if this is th...",minimum,NO
2,My mom then hit me with the newspaper and it s...,minimum,NO
3,"until i met my new boyfriend, he is amazing, h...",mild,YES
4,October is Domestic Violence Awareness Month a...,moderate,YES


In [7]:
# ----------------------------
# Psycholinguistic Features
# ----------------------------
def extract_features(text):
    text = text.lower()
    blob = TextBlob(text)
    sentiment = blob.sentiment
    polarity = sentiment.polarity
    subjectivity = sentiment.subjectivity
    pronouns = len(re.findall(r'\b(i|me|my|mine|we|us|our|ours)\b', text))
    negations = len(re.findall(r'\b(not|no|never|none|nothing|nobody|neither|nowhere|hardly|scarcely|barely)\b', text))
    temporal = len(re.findall(r'\b(today|yesterday|tomorrow|always|never|often|sometimes|now|then)\b', text))
    word_count = len(text.split())
    char_count = len(text)
    avg_word_length = char_count / word_count if word_count else 0
    return pd.Series([polarity, subjectivity, pronouns, negations, temporal, word_count, avg_word_length])

In [8]:
df_features = df_cleaned['text'].apply(extract_features)
df_features.columns = [
    'polarity', 'subjectivity', 'pronoun_count',
    'negation_count', 'temporal_count', 'word_count',
    'avg_word_length'
]

In [9]:
df_features.head()

Unnamed: 0,polarity,subjectivity,pronoun_count,negation_count,temporal_count,word_count,avg_word_length
0,-0.002742,0.426613,11.0,2.0,1.0,113.0,5.053097
1,0.292857,0.574956,5.0,1.0,0.0,108.0,5.444444
2,0.011894,0.594924,18.0,1.0,4.0,166.0,5.10241
3,0.141671,0.555249,41.0,3.0,2.0,273.0,4.677656
4,-0.204167,0.441667,7.0,1.0,0.0,89.0,5.988764


In [10]:
# ----------------------------
# Text Preprocessing
# ----------------------------
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [11]:
df_cleaned['clean_text'] = df_cleaned['text'].apply(preprocess_text)

In [12]:
# ----------------------------
# TF-IDF Vectorization
# ----------------------------
tfidf_vectorizer = TfidfVectorizer(max_features=300, stop_words='english')
tfidf_features = tfidf_vectorizer.fit_transform(df_cleaned['clean_text'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [13]:
print(tfidf_features)

  (np.int32(0), np.int32(215))	0.22820538102717897
  (np.int32(0), np.int32(80))	0.2164765517902374
  (np.int32(0), np.int32(286))	0.18021868591398965
  (np.int32(0), np.int32(151))	0.1186143047793595
  (np.int32(0), np.int32(159))	0.2369519494095514
  (np.int32(0), np.int32(78))	0.26211522806587517
  (np.int32(0), np.int32(171))	0.22220410466544738
  (np.int32(0), np.int32(92))	0.2033004195987493
  (np.int32(0), np.int32(246))	0.2626534310895545
  (np.int32(0), np.int32(187))	0.19484982289696925
  (np.int32(0), np.int32(4))	0.28854693944572657
  (np.int32(0), np.int32(65))	0.29116080691032425
  (np.int32(0), np.int32(223))	0.28734403573755546
  (np.int32(0), np.int32(114))	0.25183493639391624
  (np.int32(0), np.int32(115))	0.27668709541599706
  (np.int32(0), np.int32(106))	0.2725284598876204
  (np.int32(0), np.int32(125))	0.25914004769812143
  (np.int32(1), np.int32(252))	0.27833125627368127
  (np.int32(1), np.int32(212))	0.2391668727168646
  (np.int32(1), np.int32(198))	0.29219498195

In [14]:
tfidf_df.head()

Unnamed: 0,able,actually,advice,afraid,age,ago,alive,angry,anxiety,anymore,...,wish,work,working,world,worse,worst,worth,wrong,year,years
0,0.0,0.0,0.0,0.0,0.288547,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.308047,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116404,0.0
3,0.0,0.089713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.626266,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191802,0.17894


In [15]:
# ----------------------------
# Combine All Features
# ----------------------------
df_full = pd.concat([df_features.reset_index(drop=True), tfidf_df], axis=1)
df_full['binary_label'] = df_cleaned['binary_label'].values

In [16]:
df_full.head()

Unnamed: 0,polarity,subjectivity,pronoun_count,negation_count,temporal_count,word_count,avg_word_length,able,actually,advice,...,work,working,world,worse,worst,worth,wrong,year,years,binary_label
0,-0.002742,0.426613,11.0,2.0,1.0,113.0,5.053097,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,YES
1,0.292857,0.574956,5.0,1.0,0.0,108.0,5.444444,0.0,0.0,0.0,...,0.0,0.308047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NO
2,0.011894,0.594924,18.0,1.0,4.0,166.0,5.10241,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116404,0.0,NO
3,0.141671,0.555249,41.0,3.0,2.0,273.0,4.677656,0.0,0.089713,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,YES
4,-0.204167,0.441667,7.0,1.0,0.0,89.0,5.988764,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191802,0.17894,YES


In [None]:
df_full.to_csv("data/feature_extracted_dataset.csv", index=False)