In [None]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

DATA_PATH = r"data\processed\thefinal_dataset.csv" 

df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()


(474399, 2)


Unnamed: 0,text,status
0,"""My mind is a never-ending cycle of worry, and...",anxiety
1,Despite the sun shining and birds singing outs...,bipolar disorder
2,"I'm drowning in responsibilities, each one dem...",stress
3,"""My emotions shift like the wind, leaving me u...",personality disorder
4,"I'm trapped in a whirlwind of thoughts, unable...",anxiety


1) Comprendre la structure (colonnes + types)

In [4]:
df.info()
print("\nColonnes:", df.columns.tolist())
df.describe(include="all").T.head(30)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474399 entries, 0 to 474398
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    474398 non-null  object
 1   status  474399 non-null  object
dtypes: object(2)
memory usage: 7.2+ MB

Colonnes: ['text', 'status']


Unnamed: 0,count,unique,top,freq
text,474398,474398,"""My mind is a never-ending cycle of worry, and...",1
status,474399,14,depression,144271


2) Détecter la colonne texte et la colonne label (auto)

In [None]:
def guess_text_col(df):
    # colonnes candidates texte
    candidates = [c for c in df.columns if df[c].dtype == "object"]
    if not candidates:
        return None
    # choisir celle avec moyenne de longueur la plus grande
    lens = {c: df[c].astype(str).str.len().mean() for c in candidates}
    return max(lens, key=lens.get)

def guess_label_col(df, text_col):
    # label = colonne avec peu de valeurs uniques
    cols = [c for c in df.columns if c != text_col]
    scores = []
    for c in cols:
        nunique = df[c].nunique(dropna=True)
        if nunique <= 50: 
            scores.append((c, nunique))
    return sorted(scores, key=lambda x: x[1])[0][0] if scores else None

text_col = guess_text_col(df)
label_col = guess_label_col(df, text_col)

print("text_col =", text_col)
print("label_col =", label_col)
df[[text_col, label_col]].head()


text_col = text
label_col = status


Unnamed: 0,text,status
0,"""My mind is a never-ending cycle of worry, and...",anxiety
1,Despite the sun shining and birds singing outs...,bipolar disorder
2,"I'm drowning in responsibilities, each one dem...",stress
3,"""My emotions shift like the wind, leaving me u...",personality disorder
4,"I'm trapped in a whirlwind of thoughts, unable...",anxiety


3) Valeurs manquantes + doublons

In [7]:
# Missing
missing = df.isna().mean().sort_values(ascending=False)
display(missing[missing > 0])

# Doublons complets
print("Doublons (lignes identiques):", df.duplicated().sum())

# Doublons sur le texte uniquement
print("Doublons sur texte:", df.duplicated(subset=[text_col]).sum())



text    0.000002
dtype: float64

Doublons (lignes identiques): 0
Doublons sur texte: 0


4) Vérifier les labels (répartition, valeurs bizarres)

In [8]:
df[label_col].value_counts(dropna=False).head(50)

status
depression              144271
suicidal                 72892
adhd                     61705
bipolar disorder         36350
normal                   30201
ocd                      25367
ptsd                     21314
anxiety                  18693
stress                   16051
personality disorder     14810
aspergers                13418
schizophrenia             7682
addiction                 6561
alcoholism                5084
Name: count, dtype: int64

Pourcentage par classe

In [9]:
vc = df[label_col].value_counts(dropna=False)
(vc / len(df) * 100).round(2)

status
depression              30.41
suicidal                15.37
adhd                    13.01
bipolar disorder         7.66
normal                   6.37
ocd                      5.35
ptsd                     4.49
anxiety                  3.94
stress                   3.38
personality disorder     3.12
aspergers                2.83
schizophrenia            1.62
addiction                1.38
alcoholism               1.07
Name: count, dtype: float64

5) Analyse longueur du texte (très important en NLP)

In [10]:
s = df[text_col].astype(str)

df["char_len"] = s.str.len()
df["word_len"] = s.str.split().str.len()

df[["char_len","word_len"]].describe(percentiles=[.5,.75,.9,.95,.99]).T


Unnamed: 0,count,mean,std,min,50%,75%,90%,95%,99%,max
char_len,474399.0,821.1546,965.043988,1.0,549.0,1042.0,1808.0,2481.0,4479.0,40007.0
word_len,474399.0,155.799645,183.517995,0.0,103.0,199.0,344.0,473.0,852.0,8403.0


Voir exemples extrêmes

In [11]:
# très courts
df.sort_values("word_len").head(10)[[text_col, label_col, "word_len"]]

# très longs
df.sort_values("word_len", ascending=False).head(10)[[text_col, label_col, "word_len"]]

Unnamed: 0,text,status,word_len
361481,"I hate myself I want to die, I hate myself, I ...",depression,8403
412982,I can't. I can't. I can't. I can't. I can't...,suicidal,8002
131515,**General post disclaimer:** I know it's weird...,aspergers,7198
123528,"My ADHD journey has been long, meandering, and...",adhd,6692
155656,I was actually one of Robert Bray’s moderators...,ocd,6281
76120,please help me understand what i went through ...,personality disorder,5419
106723,This is going to be lengthy and ahead of time ...,personality disorder,5411
36963,i have only 1 person i can somewhat open to bu...,suicidal,5248
193340,This is a really long story and I don't know i...,ptsd,5183
178049,"I am making this post to share a great evil, a...",ocd,5028


6) Qualité du texte (liens, emojis, ponctuation, caractères non alphabétiques)

In [12]:
def text_quality_features(series: pd.Series):
    s = series.astype(str)
    return pd.DataFrame({
        "has_url": s.str.contains(r"http|www", regex=True).astype(int),
        "has_mention": s.str.contains(r"@\w+", regex=True).astype(int),
        "has_hashtag": s.str.contains(r"#\w+", regex=True).astype(int),
        "digit_ratio": s.str.count(r"\d") / (s.str.len().replace(0, np.nan)),
        "punct_ratio": s.str.count(r"[^\w\s]") / (s.str.len().replace(0, np.nan)),
        "non_ascii_ratio": s.apply(lambda x: sum(ord(ch) > 127 for ch in x)) / (s.str.len().replace(0, np.nan)),
        "upper_ratio": s.str.count(r"[A-Z]") / (s.str.len().replace(0, np.nan)),
    }).fillna(0)

qf = text_quality_features(df[text_col])
qf.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
has_url,474399.0,0.017917,0.132651,0.0,0.0,0.0,0.0,1.0
has_mention,474399.0,0.002818,0.053013,0.0,0.0,0.0,0.0,1.0
has_hashtag,474399.0,0.025196,0.15672,0.0,0.0,0.0,0.0,1.0
digit_ratio,474399.0,0.00257,0.008582,0.0,0.0,0.0,0.002837,0.9
punct_ratio,474399.0,0.030993,0.020555,0.0,0.02214,0.029526,0.037688,1.0
non_ascii_ratio,474399.0,0.003162,0.011261,0.0,0.0,0.0,0.004073,1.0
upper_ratio,474399.0,0.021261,0.024703,0.0,0.005051,0.022005,0.029356,1.0


Par label 

In [13]:
tmp = pd.concat([df[[label_col]].reset_index(drop=True), qf.reset_index(drop=True)], axis=1)
tmp.groupby(label_col).mean().round(3)

Unnamed: 0_level_0,has_url,has_mention,has_hashtag,digit_ratio,punct_ratio,non_ascii_ratio,upper_ratio
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
addiction,0.06,0.006,0.046,0.005,0.032,0.004,0.028
adhd,0.029,0.001,0.046,0.004,0.031,0.003,0.031
alcoholism,0.018,0.001,0.035,0.005,0.032,0.004,0.027
anxiety,0.005,0.0,0.008,0.001,0.039,0.001,0.004
aspergers,0.04,0.001,0.025,0.003,0.032,0.003,0.025
bipolar disorder,0.011,0.001,0.026,0.002,0.033,0.003,0.02
depression,0.011,0.001,0.025,0.002,0.03,0.003,0.023
normal,0.028,0.022,0.011,0.003,0.021,0.003,0.0
ocd,0.027,0.002,0.021,0.002,0.031,0.005,0.029
personality disorder,0.01,0.0,0.002,0.001,0.044,0.001,0.008


7) Vérifier “Data leakage” potentiel (si tu as une colonne source / split / id)

In [14]:
suspects = [c for c in df.columns if any(k in c.lower() for k in ["split","source","user","id","date","time"])]
print("Colonnes suspectes:", suspects)
df[suspects].head() if suspects else "No suspects"


Colonnes suspectes: []


'No suspects'

8) Top mots (rapide) + stopwords (avant TF-IDF/BERT)

Simple et utile pour sanity check.

In [None]:
from collections import Counter

def basic_tokenize(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"[^a-zàâäéèêëîïôöùûüç0-9\s]", " ", text)  
    text = re.sub(r"\s+", " ", text).strip()
    return text.split()

tokens = []
for t in df[text_col].sample(min(20000, len(df)), random_state=42): 
    tokens.extend(basic_tokenize(t))

counter = Counter(tokens)
counter.most_common(30)


[('i', 223907),
 ('to', 103584),
 ('and', 96511),
 ('the', 69715),
 ('my', 64418),
 ('a', 62924),
 ('it', 51883),
 ('of', 47330),
 ('me', 39729),
 ('t', 38118),
 ('that', 37930),
 ('in', 34184),
 ('but', 30550),
 ('m', 29887),
 ('for', 28937),
 ('have', 28470),
 ('is', 28401),
 ('this', 24905),
 ('just', 24591),
 ('with', 24108),
 ('was', 22069),
 ('so', 21646),
 ('like', 21022),
 ('s', 20161),
 ('not', 18579),
 ('on', 17968),
 ('do', 17233),
 ('can', 17157),
 ('be', 17138),
 ('you', 16547)]

Par classe (mots qui apparaissent souvent dans chaque label)

In [16]:
def top_words_per_label(df, text_col, label_col, n=15, sample_per_class=5000):
    result = {}
    for lab in df[label_col].dropna().unique():
        sub = df[df[label_col] == lab]
        sub = sub.sample(min(sample_per_class, len(sub)), random_state=42)
        toks = []
        for t in sub[text_col]:
            toks.extend(basic_tokenize(t))
        result[lab] = Counter(toks).most_common(n)
    return result

top_by_label = top_words_per_label(df, text_col, label_col, n=15)
top_by_label


{'anxiety': [('i', 21179),
  ('and', 14506),
  ('my', 12431),
  ('the', 11831),
  ('a', 11234),
  ('to', 10503),
  ('of', 9508),
  ('it', 5694),
  ('me', 5495),
  ('is', 5442),
  ('in', 5364),
  ('m', 4441),
  ('t', 4264),
  ('that', 3666),
  ('can', 3452)],
 'bipolar disorder': [('i', 48926),
  ('and', 24364),
  ('to', 22245),
  ('the', 16787),
  ('a', 15191),
  ('my', 13068),
  ('of', 11884),
  ('it', 10624),
  ('me', 9926),
  ('that', 8430),
  ('in', 8421),
  ('m', 8114),
  ('t', 8092),
  ('with', 7369),
  ('but', 6635)],
 'stress': [('i', 13596),
  ('and', 11681),
  ('to', 10026),
  ('the', 9770),
  ('my', 8392),
  ('of', 7596),
  ('a', 6181),
  ('in', 4732),
  ('me', 4628),
  ('m', 3664),
  ('feeling', 2896),
  ('responsibilities', 2831),
  ('it', 2793),
  ('with', 2497),
  ('t', 2391)],
 'personality disorder': [('i', 19631),
  ('the', 9980),
  ('and', 9422),
  ('a', 9100),
  ('to', 8310),
  ('my', 8192),
  ('of', 7927),
  ('me', 6411),
  ('in', 4480),
  ('m', 4078),
  ('who', 34

9) Mini “rapport” automatique

In [17]:
report = {
    "rows": len(df),
    "cols": df.shape[1],
    "text_col": text_col,
    "label_col": label_col,
    "missing_any": int(df.isna().any(axis=1).sum()),
    "duplicates_rows": int(df.duplicated().sum()),
    "duplicates_text": int(df.duplicated(subset=[text_col]).sum()),
    "label_counts": df[label_col].value_counts(dropna=False).to_dict(),
    "word_len_mean": float(df["word_len"].mean()),
    "word_len_p95": float(df["word_len"].quantile(0.95)),
}
report


{'rows': 474399,
 'cols': 4,
 'text_col': 'text',
 'label_col': 'status',
 'missing_any': 1,
 'duplicates_rows': 0,
 'duplicates_text': 0,
 'label_counts': {'depression': 144271,
  'suicidal': 72892,
  'adhd': 61705,
  'bipolar disorder': 36350,
  'normal': 30201,
  'ocd': 25367,
  'ptsd': 21314,
  'anxiety': 18693,
  'stress': 16051,
  'personality disorder': 14810,
  'aspergers': 13418,
  'schizophrenia': 7682,
  'addiction': 6561,
  'alcoholism': 5084},
 'word_len_mean': 155.7996454461329,
 'word_len_p95': 473.0}

In [18]:
df["status"].value_counts()

status
depression              144271
suicidal                 72892
adhd                     61705
bipolar disorder         36350
normal                   30201
ocd                      25367
ptsd                     21314
anxiety                  18693
stress                   16051
personality disorder     14810
aspergers                13418
schizophrenia             7682
addiction                 6561
alcoholism                5084
Name: count, dtype: int64