Import needed libraries

In [1]:
from huggingface_hub import HfApi
from dotenv import load_dotenv
import pandas as pd
import glob
import os

# Load variables from a local .env file into the environment so that HF_TOKEN can be read with os.getenv below.
load_dotenv()

True

In [2]:
# Hugging Face Hub client, authenticated with the secret stored in the HF_TOKEN environment variable.
api = HfApi(token=os.environ.get("HF_TOKEN"))

Get downloaded data

In [3]:
# Get current notebook directory (notebooks/).
# `__file__` only exists when this code runs as a script; inside a notebook kernel we fall
# back to the current working directory.
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()

# Paths to data folders relative to notebook
downloaded_data_path = os.path.join(current_dir, "..", "data", "downloaded_data")
zenodo_methalhealth_data_path = os.path.join(downloaded_data_path, "zenodo_data", "mental_health_issues")
zenodo_nomethalhealth_data_path = os.path.join(downloaded_data_path, "zenodo_data", "no_mental_health_issues")
kaggle_data_path = os.path.join(downloaded_data_path, "kaggle_data")

# Find all CSVs.
# glob returns matches in arbitrary (filesystem-dependent) order, so the paths are sorted:
# later cells index into these lists ([0]) and draw seeded samples from the concatenated
# frames, which is only reproducible when the files are always loaded in the same order.
zenodo_mentalhealth_csvs = sorted(glob.glob(os.path.join(zenodo_methalhealth_data_path, "*.csv")))
zenodo_nomentalhealth_csvs = sorted(glob.glob(os.path.join(zenodo_nomethalhealth_data_path, "*.csv")))
kaggle_csvs = sorted(glob.glob(os.path.join(kaggle_data_path, "*.csv")))

# Load every CSV into its own DataFrame (one frame per file, same order as the sorted paths).
zenodo_mentalhealth_dfs = [pd.read_csv(f) for f in zenodo_mentalhealth_csvs]
zenodo_nomentalhealth_dfs = [pd.read_csv(f) for f in zenodo_nomentalhealth_csvs]
kaggle_dfs = [pd.read_csv(f) for f in kaggle_csvs]

In [4]:
# Sanity-check that every expected source file was found and loaded.
# Each assertion reports the actual count so a missing or extra download is easy to diagnose.
assert len(kaggle_dfs) == 1, (
    f"Expected 1 kaggle CSV, found {len(kaggle_dfs)}"
)  # Only one file is coming from kaggle
assert len(zenodo_mentalhealth_dfs) == 15, (
    f"Expected 15 zenodo mental health CSVs, found {len(zenodo_mentalhealth_dfs)}"
)  # 15 specific mental health issues were diagnosed by zenodo
assert len(zenodo_nomentalhealth_dfs) == 11, (
    f"Expected 11 zenodo no mental health CSVs, found {len(zenodo_nomentalhealth_dfs)}"
)  # 11 specific no mental health issues were diagnosed by zenodo

A quick look at the downloaded data before processing it

In [5]:
# Peek at the first row of the first Zenodo mental-health file.
# Of all these columns, only 'post' and 'subreddit' are used later on.
zenodo_mentalhealth_dfs[0].head(n=1)

Unnamed: 0,subreddit,author,date,post,automated_readability_index,coleman_liau_index,flesch_kincaid_grade_level,flesch_reading_ease,gulpease_index,gunning_fog_index,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,addiction,MushroomEagle,2020/01/01,Hadn’t even made it a day Just relapsed for th...,-1.57539,0.344164,0.73342,103.405519,94.238095,3.530736,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13558


In [6]:
# Peek at the first row of the first Zenodo "no mental health issue" file.
# The 'post' column is what matters here: the exact subreddit of a non-mental-health post
# is not really of interest, but 'subreddit' is kept anyway.
zenodo_nomentalhealth_dfs[0].head(n=1)

Unnamed: 0,subreddit,author,date,post,automated_readability_index,coleman_liau_index,flesch_kincaid_grade_level,flesch_reading_ease,gulpease_index,gunning_fog_index,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,conspiracy,Playaguy,2020/01/01,Time to talk Solutions. Shungite does provide ...,10.913765,15.789921,7.715588,54.885147,64.647059,10.458824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Peek at the first row of the Kaggle file.
# Both columns are kept; this data is only partially labeled:
#   label 0 -> no mental health issue
#   label 1 -> a mental health issue, but of unknown kind
kaggle_dfs[0].head(n=1)

Unnamed: 0,text,label
0,dear american teens question dutch person hear...,0


Quick processing of the Kaggle and Zenodo data before saving it

In [8]:
# Build a single (text, label) frame from all Zenodo mental-health files.
# Every file contributes its 'post' column as 'text' and its 'subreddit' column as 'label'.
# The frames are concatenated once rather than appended one by one inside a loop: repeated
# concat re-copies all accumulated rows on each iteration and starts from an empty frame.
zenodo_final_mentalhealth_df = pd.concat(
    [
        df[['post', 'subreddit']].rename(columns={'post': 'text', 'subreddit': 'label'})
        for df in zenodo_mentalhealth_dfs
    ],
    axis=0,
)

# At least, remove any empty comment. Missing values (NaN) pass this filter and are
# removed by dropna() below.
zenodo_final_mentalhealth_df = zenodo_final_mentalhealth_df[zenodo_final_mentalhealth_df['text'] != '']
zenodo_final_mentalhealth_df = zenodo_final_mentalhealth_df.drop_duplicates().dropna(axis=0)

# We need to make sure that all 15 specific mental health issues info are being kept
assert zenodo_final_mentalhealth_df['label'].nunique() == 15, (
    f"Expected 15 distinct labels, found {zenodo_final_mentalhealth_df['label'].nunique()}"
)
print(f"We have in total {len(zenodo_final_mentalhealth_df)} samples of mental health issues from zenodo.")

We have in total 124758 samples of mental health issues from zenodo.


In [9]:
# Show five random rows; the fixed seed keeps the preview stable as long as the data is unchanged.
zenodo_final_mentalhealth_df.sample(random_state=123, n=5)

Unnamed: 0,text,label
27357,Tonight some asshole went and downvoted a bunc...,depression
536,Hello all! Helpful resource here Have been dea...,healthanxiety
2702,I feel stable until I get into a relationship ...,bpd
894,"When high school ends, I might just too Life i...",suicidewatch
21812,I feel like I'm losing myself I found this sub...,depression


In [10]:
# Build a single (text, label) frame from all Zenodo "no mental health issue" files.
# Every file contributes its 'post' column as 'text' and its 'subreddit' column as 'label'.
# The frames are concatenated once rather than appended one by one inside a loop: repeated
# concat re-copies all accumulated rows on each iteration and starts from an empty frame.
zenodo_final_nomentalhealth_df = pd.concat(
    [
        df[['post', 'subreddit']].rename(columns={'post': 'text', 'subreddit': 'label'})
        for df in zenodo_nomentalhealth_dfs
    ],
    axis=0,
)

# At least, remove any empty comment. Missing values (NaN) pass this filter and are
# removed by dropna() below.
zenodo_final_nomentalhealth_df = zenodo_final_nomentalhealth_df[zenodo_final_nomentalhealth_df['text'] != '']
zenodo_final_nomentalhealth_df = zenodo_final_nomentalhealth_df.drop_duplicates().dropna(axis=0)

# We need to make sure that all 11 specific no mental health issues info are being kept
assert zenodo_final_nomentalhealth_df['label'].nunique() == 11, (
    f"Expected 11 distinct labels, found {zenodo_final_nomentalhealth_df['label'].nunique()}"
)
print(f"We have in total {len(zenodo_final_nomentalhealth_df)} samples of no mental health issues from zenodo.")

We have in total 180875 samples of no mental health issues from zenodo.


In [11]:
# Show five random rows; the fixed seed keeps the preview stable as long as the data is unchanged.
zenodo_final_nomentalhealth_df.sample(random_state=123, n=5)

Unnamed: 0,text,label
27127,I haven't had a working toilet in over a week ...,legaladvice
2810,North american arms Hello all! I am looking to...,guns
48374,Builder next door has dumped rubbish on my lan...,legaladvice
4811,The Importance of Eating Big I’m 24 and I’ve b...,fitness
39256,Some background to my problem I'm 14 male. My ...,legaladvice


In [12]:
# Clean the single Kaggle frame without touching the loaded original:
# drop rows with an empty 'text', then exact duplicate rows, then rows with any missing value.
kaggle_final_df = (
    kaggle_dfs[0]
    .loc[lambda frame: frame['text'] != '']  # At least, remove any empty comment
    .drop_duplicates()
    .dropna(axis=0)
)

print(f"We have in total {len(kaggle_final_df)} samples partially labeled from kaggle.")

We have in total 27972 samples partially labeled from kaggle.


In [13]:
# Show five random rows; the fixed seed keeps the preview stable as long as the data is unchanged.
kaggle_final_df.sample(random_state=123, n=5)

Unnamed: 0,text,label
557,dont even know say girlfriend last night party...,0
17016,wish could end all tired living fake life arou...,1
4347,miss someone miss someone isnt valentines day ...,0
15336,pointthats really dont get it whats point livi...,1
25878,impossible avoid punishment hispanic parents s...,0


Save files as .feather files

In [14]:
# Convert every column of the three final frames to the pandas 'category' dtype before saving.
# NOTE(review): this also casts the 'text' column, whose values are presumably almost all
# distinct; a categorical with about as many categories as rows is unlikely to save space.
# Consider casting only 'label' (this would change the schema of the saved files) - TODO confirm.
zenodo_final_mentalhealth_df, zenodo_final_nomentalhealth_df, kaggle_final_df = (
    frame.astype("category")
    for frame in (zenodo_final_mentalhealth_df, zenodo_final_nomentalhealth_df, kaggle_final_df)
)

In [15]:
# Output folders, resolved relative to the notebook directory.
raw_data_dir = os.path.join(current_dir, '..', 'data', 'raw_data')
labeled_out_dir = os.path.join(raw_data_dir, 'labeled_data')
unlabeled_out_dir = os.path.join(raw_data_dir, 'unlabeled_data')

# Make sure both folders exist (no error if they are already there).
for out_dir in (labeled_out_dir, unlabeled_out_dir):
    os.makedirs(out_dir, exist_ok=True)

# Write each frame as a zstd-compressed .feather file (compression level 22) so the files
# are as small as possible before being uploaded to HuggingFace.
# Each entry is: (frame to save, destination folder, file name).
feather_exports = (
    (zenodo_final_mentalhealth_df, labeled_out_dir, 'zenodo_mentalhealth_data.feather'),
    (zenodo_final_nomentalhealth_df, labeled_out_dir, 'zenodo_nomentalhealth_data.feather'),
    (kaggle_final_df, unlabeled_out_dir, 'kaggle_data.feather'),
)
for frame, out_dir, file_name in feather_exports:
    frame.to_feather(os.path.join(out_dir, file_name), compression="zstd", compression_level=22)

In [16]:
# Push the three .feather files to the public HuggingFace dataset repo.
# Each entry is: (local folder holding the file, file name). The file keeps the same name
# at the root of the repo.
hf_repo_id = "pfacouetey/DSTI_Deep_Learning_Project_2025"
feather_uploads = (
    (unlabeled_out_dir, "kaggle_data.feather"),
    (labeled_out_dir, "zenodo_mentalhealth_data.feather"),
    (labeled_out_dir, "zenodo_nomentalhealth_data.feather"),
)
for local_dir, file_name in feather_uploads:
    commit_info = api.upload_file(
        path_or_fileobj=os.path.join(local_dir, file_name),
        path_in_repo=file_name,
        repo_id=hf_repo_id,
        repo_type="dataset",
    )

# Display the result of the last upload as the cell output.
commit_info

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/pfacouetey/DSTI_Deep_Learning_Project_2025/commit/7978feefce1f8b7c5d2d5053d3af953914a425a7', commit_message='Upload zenodo_nomentalhealth_data.feather with huggingface_hub', commit_description='', oid='7978feefce1f8b7c5d2d5053d3af953914a425a7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/pfacouetey/DSTI_Deep_Learning_Project_2025', endpoint='https://huggingface.co', repo_type='dataset', repo_id='pfacouetey/DSTI_Deep_Learning_Project_2025'), pr_revision=None, pr_num=None)