Import needed libraries

In [1]:
from huggingface_hub import HfApi
from dotenv import load_dotenv
import pandas as pd
import glob
import os

# Load variables from a local .env file into the environment (HF_TOKEN is read from it further down)
load_dotenv()

True

In [2]:
# Hugging Face Hub client, authenticated with the secret stored in the HF_TOKEN environment variable
api = HfApi(token=os.environ.get("HF_TOKEN"))

Get downloaded data

In [3]:
# Get current notebook directory (notebooks/): use __file__ when it is defined
# (code executed as a script), otherwise fall back to the kernel's working directory.
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()

# Paths to data folders relative to notebook
labeled_data_path = os.path.join(current_dir, "..", "data", "downloaded_data", "labeled_data")
unlabeled_data_path = os.path.join(current_dir, "..", "data", "downloaded_data", "unlabeled_data")


def load_csv_folder(folder):
    """Read every ``*.csv`` file located directly inside ``folder``.

    ``glob.glob`` returns matches in arbitrary order, so the paths are sorted
    to keep positional access (e.g. ``unlabeled_dfs[0]``) and the later
    concatenation order reproducible from one machine to another.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive).

    Returns
    -------
    tuple[list[str], list[pandas.DataFrame]]
        The sorted CSV paths and one DataFrame per path, in the same order.
        Both lists are empty when the folder holds no CSV file.
    """
    csv_paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
    return csv_paths, [pd.read_csv(csv_path) for csv_path in csv_paths]


# Find and load all CSVs in labeled_data_path
labeled_csvs, labeled_dfs = load_csv_folder(labeled_data_path)

# Find and load all CSVs in unlabeled_data_path
unlabeled_csvs, unlabeled_dfs = load_csv_folder(unlabeled_data_path)

In [4]:
# Sanity-check the download before processing: exactly one labeled CSV and
# four unlabeled CSVs are expected. The messages report what was actually found.
assert len(labeled_dfs) == 1, f"Expected exactly 1 labeled CSV file, found {len(labeled_dfs)}"
assert len(unlabeled_dfs) == 4, f"Expected exactly 4 unlabeled CSV files, found {len(unlabeled_dfs)}"

A quick look at the downloaded data before processing it

In [5]:
# Preview the first two rows of the labeled dataset
labeled_dfs[0].head(n=2)

Unnamed: 0,text,label
0,dear american teens question dutch person hear...,0
1,nothing look forward lifei dont many reasons k...,1


In [6]:
# Preview the first row of the first unlabeled dataset.
# All unlabeled datasets share the same column names; 'post' is the only column we need.
unlabeled_dfs[0].head(n=1)

Unnamed: 0,subreddit,author,date,post,automated_readability_index,coleman_liau_index,flesch_kincaid_grade_level,flesch_reading_ease,gulpease_index,gunning_fog_index,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,mentalhealth,LilUziVertsAutotune,2018/01/01,Any idea what this is? So I came here for awns...,1.198856,2.857999,2.614963,96.068711,79.778689,6.075708,...,0.0,0.0,0.0,0.0,0.0,0.0,0.156102,0.120129,0.0,0.070755


In [7]:
# Same one-row preview for the fourth unlabeled dataset
unlabeled_dfs[3].head(n=1)

Unnamed: 0,subreddit,author,date,post,automated_readability_index,coleman_liau_index,flesch_kincaid_grade_level,flesch_reading_ease,gulpease_index,gunning_fog_index,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,mentalhealth,Crowzur,2018/12/17,How do I stop a compulsion that I know negativ...,8.1165,7.121214,8.73,68.095,62.5,14.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Quick preprocessing of the labeled and unlabeled data before saving them

In [8]:
def clean_text_frame(frame):
    """Drop unusable rows from a frame that has a 'text' column.

    Removes, in this order: rows whose 'text' is the empty string, fully
    duplicated rows, and rows containing any missing value. The index labels
    of the surviving rows are left untouched.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with at least a 'text' column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame.
    """
    non_empty = frame[frame['text'] != '']  # At least, remove any empty comment
    return non_empty.drop_duplicates().dropna(axis=0)


# Keep only the 'post' column of every unlabeled dataset, renamed to 'text'
unlabeled_text_frames = [df[['post']].rename(columns={'post': 'text'}) for df in unlabeled_dfs]

# Concatenate once rather than growing a frame inside a loop: repeated concat
# re-copies the accumulated rows on every iteration, and seeding it with an
# empty frame triggers pandas' empty-entry concat deprecation warning.
if unlabeled_text_frames:
    unlabeled_concat_df = clean_text_frame(pd.concat(unlabeled_text_frames, axis=0))
else:
    # pd.concat refuses an empty list; fall back to an empty 'text' frame
    unlabeled_concat_df = pd.DataFrame(columns=['text'])

labeled_df = clean_text_frame(labeled_dfs[0])

print(f"We have in total {len(labeled_df)} labeled samples and {len(unlabeled_concat_df)} unlabeled samples.")

We have in total 27972 labeled samples and 40467 unlabeled samples.


In [9]:
# Reproducible random peek at five cleaned unlabeled texts
unlabeled_concat_df.sample(n=5, random_state=123)

Unnamed: 0,text
3086,Need advice on self-loathing Hi everyone! \n\n...
8558,Extreme paranoia and instrusive thoughts? I wa...
1747,"Low energy, no motivation to do much after goi..."
1753,Social Anxiety? Hi whenever I’m around people ...
2873,How to act around a self-destructive person? I...


In [10]:
# Reproducible random peek at five cleaned labeled rows
labeled_df.sample(n=5, random_state=123)

Unnamed: 0,text,label
557,dont even know say girlfriend last night party...,0
17016,wish could end all tired living fake life arou...,1
4347,miss someone miss someone isnt valentines day ...,0
15336,pointthats really dont get it whats point livi...,1
25878,impossible avoid punishment hispanic parents s...,0


Save files as .feather files

In [11]:
# Cast every column of both frames to the pandas categorical dtype before saving.
# NOTE(review): this also converts the free-text 'text' column; categorical usually
# pays off for low-cardinality columns only - confirm it really shrinks these files.
labeled_df, unlabeled_concat_df = (
    frame.astype("category") for frame in (labeled_df, unlabeled_concat_df)
)

In [12]:
# Output folders live under data/raw_data/, resolved relative to the notebook directory
raw_data_root = os.path.join(current_dir, '..', 'data', 'raw_data')
labeled_out_dir = os.path.join(raw_data_root, 'labeled_data')
unlabeled_out_dir = os.path.join(raw_data_root, 'unlabeled_data')

# Make sure both local folders exist (no error when they already do)
for out_dir in (labeled_out_dir, unlabeled_out_dir):
    os.makedirs(out_dir, exist_ok=True)

# Write both frames as .feather files with zstd compression at level 22,
# so the files uploaded to HuggingFace are as small as possible
feather_options = {"compression": "zstd", "compression_level": 22}
labeled_df.to_feather(os.path.join(labeled_out_dir, 'labeled_data.feather'), **feather_options)
unlabeled_concat_df.to_feather(os.path.join(unlabeled_out_dir, 'unlabeled_data.feather'), **feather_options)

In [13]:
# Push both feather files to the HuggingFace dataset repo.
# The second call stays the last expression of the cell so its result is displayed.
hub_destination = {
    "repo_id": "pfacouetey/DSTI_Deep_Learning_Project_2025",
    "repo_type": "dataset",
}
api.upload_file(
    path_or_fileobj=os.path.join(labeled_out_dir, 'labeled_data.feather'),
    path_in_repo="labeled_data.feather",
    **hub_destination,
)
api.upload_file(
    path_or_fileobj=os.path.join(unlabeled_out_dir, 'unlabeled_data.feather'),
    path_in_repo="unlabeled_data.feather",
    **hub_destination,
)

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/pfacouetey/DSTI_Deep_Learning_Project_2025/commit/e4ffffe12918e8c85924b6c158c403875a8910b5', commit_message='Upload unlabeled_data.feather with huggingface_hub', commit_description='', oid='e4ffffe12918e8c85924b6c158c403875a8910b5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/pfacouetey/DSTI_Deep_Learning_Project_2025', endpoint='https://huggingface.co', repo_type='dataset', repo_id='pfacouetey/DSTI_Deep_Learning_Project_2025'), pr_revision=None, pr_num=None)