In [1]:
pip install kagglehub pandas

Note: you may need to restart the kernel to use updated packages.


# Dataset preparation

In [9]:
import kagglehub
import pandas as pd

# Download via Kaggle's API
path = kagglehub.dataset_download("suchintikasarkar/sentiment-analysis-for-mental-health")
print("Path to dataset files:", path)

Path to dataset files: /home/chengyi/.cache/kagglehub/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/versions/1


In [10]:
# Drop NA values & duplicates
df = pd.read_csv(path + "/Combined Data.csv")
print("Raw data length:", len(df))
print("After dropping na values:", len(df.dropna()))
print("After dropping duplicates:", len(df.dropna().drop_duplicates())) # No duplicates present
df = df.dropna().drop_duplicates().drop(columns=['Unnamed: 0'])

Raw data length: 53043
After dropping na values: 52681
After dropping duplicates: 52681


In [11]:
# Handle casing
df["statement"] = df["statement"].str.lower()
df["status"] = df["status"].str.lower()
df.head()

Unnamed: 0,statement,status
0,oh my gosh,anxiety
1,"trouble sleeping, confused mind, restless hear...",anxiety
2,"all wrong, back off dear, forward doubt. stay ...",anxiety
3,i've shifted my focus to something else but i'...,anxiety
4,"i'm restless and restless, it's been a month n...",anxiety


In [12]:
# Handle data types
df["statement"] = df["statement"].astype("string")
df["status"] = df["status"].astype("category")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52681 entries, 0 to 53042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   statement  52681 non-null  string  
 1   status     52681 non-null  category
dtypes: category(1), string(1)
memory usage: 874.9 KB


In [13]:
# Trailing / beginning whitespaces
df["statement"] = df["statement"].str.strip()
df["status"] = df["status"].str.strip()
df["statement"].str.len().describe()

count       52681.0
mean       578.6781
std      846.248914
min             2.0
25%            80.0
50%           317.0
75%           752.0
max         32759.0
Name: statement, dtype: Float64

In [14]:
# Non-alphanumeric characters only
df["statement"] = df["statement"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
df["statement"].str.len().describe()

count       52681.0
mean     564.987548
std      827.183457
min             2.0
25%            77.0
50%           308.0
75%           735.0
max         31499.0
Name: statement, dtype: Float64

In [15]:
# Normalize spacing
df["statement"] = df["statement"].str.replace(r"\s+", " ", regex=True)
df["statement"].str.len().describe()

count       52681.0
mean     564.154705
std      826.122235
min             1.0
25%            77.0
50%           308.0
75%           734.0
max         31499.0
Name: statement, dtype: Float64

In [35]:
df.to_csv('dataset.csv', index=False)

In [17]:
df = pd.read_csv('dataset.csv')
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52681 entries, 0 to 52680
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  52681 non-null  object
 1   status     52681 non-null  object
dtypes: object(2)
memory usage: 823.3+ KB
None


Unnamed: 0,statement,status
0,oh my gosh,anxiety
1,trouble sleeping confused mind restless heart ...,anxiety
2,all wrong back off dear forward doubt stay in ...,anxiety
3,ive shifted my focus to something else but im ...,anxiety
4,im restless and restless its been a month now ...,anxiety
