In [44]:
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from deep_translator import GoogleTranslator
from sentence_transformers import SentenceTransformer, util
from nlpaug.augmenter.word import SynonymAug
import nltk

# Fetch the NLTK resources SynonymAug relies on: the English averaged-perceptron
# POS tagger and the Open Multilingual WordNet (omw-1.4) data.
# NOTE(review): aug_src='wordnet' presumably also needs the base 'wordnet'
# corpus — confirm it is already installed in the target environment.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [None]:
# Paths: the training split to augment and a directory for augmented outputs.
# NOTE(review): OUTPUT_DIR is not referenced by any cell visible in this
# notebook — confirm whether the save step is meant to write there.
ORIG_CSV ="../data/cleaned/ml-methods-splits/train_set.csv"
OUTPUT_DIR ="../data/cleaned/ml-methods-splits/augmented_simple"

# Seed both Python's and NumPy's global random generators.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# utf-8-sig strips a leading byte-order mark if the CSV carries one.
df = pd.read_csv(ORIG_CSV, encoding="utf-8-sig")
print("Original class counts:", df['label'].value_counts())



Original class counts: label
2    298
0     94
1     39
Name: count, dtype: int64


In [46]:
# Sentence-embedding model used by is_semantic_sim to compare an original
# text with its augmented variant.
embed_model = SentenceTransformer('dangvantuan/sentence-camembert-base')
# WordNet-backed synonym replacement configured for French ('fra');
# aug_p is the share of words nlpaug considers for replacement.
syn_aug = SynonymAug(aug_src='wordnet', lang='fra', aug_p=0.1)

In [47]:
def back_translate_fr(text, pivot='en'):
    """Paraphrase a French text by translating it to ``pivot`` and back.

    Args:
        text: French source text.
        pivot: Language code of the intermediate language (default ``'en'``).

    Returns:
        The back-translated French string, or ``None`` when:
        * ``text`` is not a non-empty string,
        * the intermediate translation is empty or shorter than 3 characters,
        * the round trip is empty or reproduces the input (ignoring
          surrounding whitespace),
        * the translation service raises (the failure is logged).
    """
    import logging  # function-scope import keeps this cell self-contained

    # Nothing to translate: avoid two pointless network round trips.
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        inter = GoogleTranslator(source='fr', target=pivot).translate(text)
        if not inter or len(inter.strip()) < 3:
            return None
        back = GoogleTranslator(source=pivot, target='fr').translate(inter)
        if not back or back.strip() == "" or back.strip() == text.strip():
            return None
        return back
    except Exception as exc:
        # Still best-effort (one failed sample must not abort the whole
        # augmentation run), but surface the reason instead of hiding it so
        # that a dead network or exhausted quota is noticeable.
        logging.getLogger(__name__).warning(
            "Back-translation via %r failed: %s", pivot, exc
        )
        return None

In [48]:
def is_semantic_sim(orig, aug, threshold=0.80):
    """Tell whether ``aug`` stays semantically close to ``orig``.

    Both inputs are embedded with the notebook-level ``embed_model`` and
    compared by cosine similarity. Returns True when the score is at least
    ``threshold``.
    """
    original_vec, augmented_vec = (
        embed_model.encode(sentence, convert_to_tensor=True)
        for sentence in (orig, aug)
    )
    similarity = util.cos_sim(original_vec, augmented_vec).item()
    return similarity >= threshold

In [49]:
def augment_text(text, n_aug=1, pivots=('en', 'de', 'es'),
                 bt_threshold=0.80, syn_threshold=0.90):
    """Generate up to ``n_aug`` distinct augmentations of one text.

    Back-translation is tried first, once per pivot language; synonym
    replacement fills any remaining slots and stops at the first rejected
    candidate.

    Args:
        text: Source text to augment.
        n_aug: Maximum number of augmentations to return.
        pivots: Pivot languages tried, in order, for back-translation.
        bt_threshold: Minimum similarity for a back-translation to be kept.
        syn_threshold: Minimum similarity for a synonym variant to be kept.

    Returns:
        A list of at most ``n_aug`` strings, each different from ``text``
        and from one another. May be shorter than ``n_aug`` (or empty).
    """
    augmented = []

    def _is_new(candidate):
        # Different pivots can return the same sentence; keeping it twice
        # would use up a slot without adding any data.
        return candidate != text and candidate not in augmented

    # Back-translation through each pivot language.
    for pivot in pivots:
        if len(augmented) >= n_aug:
            break
        bt = back_translate_fr(text, pivot)
        if bt and _is_new(bt) and is_semantic_sim(text, bt, threshold=bt_threshold):
            augmented.append(bt)

    # Synonym replacement for any slots still open.
    while len(augmented) < n_aug:
        syn = syn_aug.augment(text)
        # Recent nlpaug releases return a list even for a single input;
        # unwrap it so only plain strings are compared and stored.
        if isinstance(syn, (list, tuple)):
            syn = syn[0] if syn else None
        if syn and _is_new(syn) and is_semantic_sim(text, syn, threshold=syn_threshold):
            augmented.append(syn)
        else:
            break
    return augmented

In [None]:
# --- BALANCING / AUGMENTATION ---

# Desired number of rows per label. Classes already at or above their target
# are kept unchanged (this cell never down-samples).
TARGET = {0: 250, 1: 250, 2: 250}

augmented_rows = []
for lbl, group in df.groupby('label'):
    originals = group['text_clean'].tolist()

    # Always keep every original row.
    augmented_rows.extend(
        {'text_clean': text, 'label': lbl, 'is_aug': False} for text in originals
    )

    needed = TARGET.get(lbl, len(originals)) - len(originals)
    if needed <= 0:
        continue

    # Ask for slightly more per sample than strictly required, because some
    # augmentation attempts fail or get rejected by the similarity filter.
    per_sample = max(1, needed // len(originals) + 1)

    seen = set(originals)  # texts already present for this label
    added = 0
    for text in tqdm(originals, desc=f"Augmenting label {lbl}"):
        if added >= needed:
            break  # target reached: do not overshoot (and skip extra API calls)
        if not isinstance(text, str):
            continue  # NaN / non-text cells cannot be augmented
        for candidate in augment_text(text, n_aug=min(per_sample, needed - added)):
            # Only count unique string candidates toward the quota.
            if not isinstance(candidate, str) or candidate in seen:
                continue
            seen.add(candidate)
            augmented_rows.append({'text_clean': candidate, 'label': lbl, 'is_aug': True})
            added += 1

# Final safety net: drop texts duplicated across labels.
df_aug = pd.DataFrame(augmented_rows).drop_duplicates(subset=['text_clean']).reset_index(drop=True)
print("Augmented class counts:", df_aug['label'].value_counts())

# NOTE(review): the cleaning cells further down read
# ".../augmented_simple/train_augmented.csv", which is not the path written
# here — confirm which location is the intended hand-off.
out_path = "../data/cleaned/ml-methods-splits/train_augmented_simple.csv"
df_aug.to_csv(out_path, index=False, encoding="utf-8-sig")
print("Saved augmented data to:", out_path)

Augmented class counts: label
2    275
0    247
1    201
Name: count, dtype: int64
Saved augmented data to: /kaggle/working/train_augmented_simple.csv


In [56]:
# Check augmented samples quality
# The 'is_aug' flag is created on df_aug by the balancing cell; `df` is the
# raw training split, so filtering it on 'is_aug' fails on a fresh kernel.
augmented_samples = df_aug[df_aug['is_aug']].head(20)
for _, row in augmented_samples.iterrows():
    print(f"\nLabel {row['label']}:")
    print(f"Original length: {len(row['text_clean'])} chars")
    print(f"Text: {row['text_clean'][:150]}...")


Label 0:
Original length: 251 chars
Text: ne répondez jamais au téléphone _exclamation_ je suis allé au magasin pendant les heures d'ouverture fermées _exclamation_ j'ai laissé un message à la...

Label 0:
Original length: 244 chars
Text: ne réponds jamais au téléphone _exclamation_ est entré dans le magasin pendant les heures d'ouverture fermées _exclamation_ a laissé un mot sur la por...

Label 0:
Original length: 166 chars
Text: vraiment trop bruyant on entend tout voisinage de l'autre côté de la rue chambre pas normale non équipée de système d'insonorisation correct. bref qua...

Label 0:
Original length: 170 chars
Text: vraiment trop fort on entend n'importe quel quartier de l'autre côté de la rue pas une pièce normale non équipée du bon système d'insonorisation. bref...

Label 0:
Original length: 14 chars
Text: _emoji_not_ok_...

Label 0:
Original length: 342 chars
Text: Cette salle de sport a une atmosphère très désagréable à cause de clients peu recommandables. toujours bondé,

In [1]:
import pandas as pd
import re
import os

In [2]:
# Location of the augmented training set to inspect.
DATA_PATH = "../data/cleaned/ml-methods-splits/augmented_simple/train_augmented.csv"

if os.path.exists(DATA_PATH):
    # File is present: read it and report how many rows came back.
    df = pd.read_csv(DATA_PATH, encoding='utf-8-sig')
    print(f"Successfully loaded {len(df):,} reviews")
else:
    print(f"Error: File not found at {DATA_PATH}")

Successfully loaded 658 reviews


In [3]:
import pandas as pd
import re
import numpy as np
import os
from nltk.corpus import stopwords

# DEEP CLEANING FOR AUGMENTED TRAIN SET

# 1. Load your augmented data
DATA_PATH = "../data/cleaned/ml-methods-splits/augmented_simple/train_augmented.csv"
if not os.path.exists(DATA_PATH):
    # Fail fast: merely printing here would let the cell continue and either
    # raise a confusing NameError below or silently reuse a stale
    # `df_augmented` left over in the kernel.
    raise FileNotFoundError(f"Error: File not found at {DATA_PATH}")

df_augmented = pd.read_csv(DATA_PATH, encoding='utf-8-sig')
print(f"Successfully loaded {len(df_augmented):,} augmented reviews")

# Check initial state
print("\nInitial state:")
print(f"Columns: {df_augmented.columns.tolist()}")
print(f"Shape: {df_augmented.shape}")
print(f"Distribution:\n{df_augmented['label'].value_counts()}")

# 2. Remove 'is_aug' column and keep only text_clean and label
df_augmented = df_augmented[['text_clean', 'label']]

# 3. Basic cleaning: remove NaN and empty strings
df_augmented = df_augmented.dropna(subset=['text_clean'])
df_augmented = df_augmented[df_augmented['text_clean'].str.strip() != '']
print(f"\nAfter removing empty rows: {len(df_augmented):,} samples")

# 4. Define cleaning functions 
def clean_text_ml(text):
    """Clean text for TF-IDF / classical ML.

    Steps, in order: lowercase; strip URLs, @mentions and #hashtags; replace
    characters outside word characters, whitespace and ``.!?,;:`` with a
    space; turn runs of ``!`` / ``?`` into ``_exclamation_`` / ``_question_``
    tokens; collapse whitespace.

    Args:
        text: Raw text. Non-string or blank input yields ``""``.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str) or text.strip() == "":
        return ""

    # lowercase
    text = text.lower()

    # remove URLs, mentions, hashtags
    # This must run BEFORE the character filter below: that filter replaces
    # '/', '@' and '#' with spaces, after which these patterns can no longer
    # match and URL fragments / handle names would stay in the text.
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#\w+", " ", text)

    # remove unwanted chars but keep basic punctuation
    text = re.sub(r"[^\w\sàâäéèêëîïôöùûüç.!?,;:]", " ", text)

    # mark punctuation (important for sentiment)
    text = re.sub(r"(!+)", " _exclamation_ ", text)
    text = re.sub(r"(\?+)", " _question_ ", text)

    # normalize spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Download French stopwords if not already available
# nltk.data.find raises LookupError when the corpus is missing locally, so the
# download only happens on first use.
import nltk
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Stored as a set so the per-word membership test in remove_stopwords is cheap.
french_stop = set(stopwords.words('french'))

def remove_stopwords(text):
    """Drop French stopwords and one-character tokens from ``text``.

    Tokens are split on whitespace. Marker tokens beginning with
    ``_emoji_``, ``_exclamation_`` or ``_question_`` are always kept; any
    other token is kept only when it is not in ``french_stop`` and is longer
    than one character. Returns the kept tokens joined by single spaces.
    """
    # NOTE(review): NLTK's French list appears to include negations such as
    # "ne" / "pas" — confirm that dropping them is intended for sentiment data.
    protected_prefixes = ('_emoji_', '_exclamation_', '_question_')
    kept = [
        token
        for token in text.split()
        if token.startswith(protected_prefixes)
        or (token not in french_stop and len(token) > 1)
    ]
    return " ".join(kept)

def preprocess_dataframe(df):
    """Run the full text-cleaning pipeline on a copy of ``df``.

    The ``text_clean`` column is passed through ``clean_text_ml`` and then
    ``remove_stopwords``. Rows whose text ends up empty are dropped, followed
    by rows whose text duplicates an earlier row (first occurrence wins).
    The number of rows removed at each stage is printed.

    Returns:
        A new DataFrame holding only the ``text_clean`` and ``label``
        columns; the input frame is left untouched.
    """
    cleaned = df.copy()

    # Clean, then strip stopwords, in a single chained pass.
    cleaned["text_clean"] = (
        cleaned["text_clean"].apply(clean_text_ml).apply(remove_stopwords)
    )

    # Empty strings become NaN so dropna can discard them.
    n_start = len(cleaned)
    cleaned["text_clean"] = cleaned["text_clean"].replace("", np.nan)
    cleaned = cleaned.dropna(subset=["text_clean"])
    print(f"Removed {n_start - len(cleaned)} empty rows after preprocessing.")

    # Exact duplicates on the cleaned text.
    n_nonempty = len(cleaned)
    cleaned = cleaned.drop_duplicates(subset=["text_clean"])
    print(f"Removed {n_nonempty - len(cleaned)} duplicate rows.")

    # Expose only the two columns downstream code expects.
    return cleaned[["text_clean", "label"]]

# 5. Run the cleaning pipeline on the augmented training set
print("\nPreprocessing augmented train set...")
df_augmented_cleaned = preprocess_dataframe(df_augmented)

# 6. Report size and class balance of the cleaned data
print("FINAL CLEANED AUGMENTED DATA:")
print(f"Total samples: {df_augmented_cleaned.shape[0]:,}")
print("Class distribution:")
print(df_augmented_cleaned['label'].value_counts())

# 7. Persist the cleaned data next to the augmented input
output_path = "../data/cleaned/ml-methods-splits/augmented_simple/train_augmented_cleaned.csv"
df_augmented_cleaned.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"\nSaved cleaned augmented data to: {output_path}")

# 8. Preview the first two cleaned texts of each class
print("SAMPLE CLEANED AUGMENTED TEXTS:")

for label in (0, 1, 2):
    label_mask = df_augmented_cleaned['label'] == label
    preview_texts = df_augmented_cleaned.loc[label_mask, 'text_clean'].head(2)
    print(f"\nLabel {label} samples:")
    for cleaned_text in preview_texts:
        print(f"  {cleaned_text[:100]}...")



Successfully loaded 658 augmented reviews

Initial state:
Columns: ['text_clean', 'label', 'is_aug']
Shape: (658, 3)
Distribution:
label
2    257
0    220
1    181
Name: count, dtype: int64

After removing empty rows: 658 samples

Preprocessing augmented train set...
Removed 0 empty rows after preprocessing.
Removed 12 duplicate rows.
FINAL CLEANED AUGMENTED DATA:
Total samples: 646
Class distribution:
label
2    257
0    212
1    177
Name: count, dtype: int64

Saved cleaned augmented data to: ../data/cleaned/ml-methods-splits/augmented_simple/train_augmented_cleaned.csv
SAMPLE CLEANED AUGMENTED TEXTS:

Label 0 samples:
  répondent jamais téléphone _exclamation_ allé magasin pendant heures ouverture fermés _exclamation_ ...
  vraiment trop bruyant entend tout quartier autre côté rue normal salle équipée système insonorisatio...

Label 1 samples:
  lieu confirmé...
  california gym is my go to place for great workout _exclamation_ the trainers are skilled and offer ...

Label 2 samples: