In [None]:
!pip -q install transformers sentencepiece torch tqdm
!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm

In [35]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import (
    PegasusForConditionalGeneration, PegasusTokenizer,
    MarianMTModel, MarianTokenizer)
import spacy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from scipy.sparse import hstack
import seaborn as sns
from collections import Counter

In [None]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
!kaggle datasets download -d thedevastator/belgian-statutory-article-retrieval-dataset-bsar
!unzip belgian-statutory-article-retrieval-dataset-bsar.zip


 ***Overview of the dataset***


In [None]:

df_train = pd.read_csv("train.csv", encoding="utf-8" , sep=',')
df_test=pd.read_csv("test.csv", encoding="utf-8" , sep=',')
df= pd.concat([df_train, df_test ], ignore_index=True)


In [None]:
df.to_csv('fr_data.csv', index=False)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108 entries, 0 to 1107
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1108 non-null   int64 
 1   question           1108 non-null   object
 2   article_ids        1108 non-null   object
 3   category           1108 non-null   object
 4   subcategory        1108 non-null   object
 5   extra_description  990 non-null    object
dtypes: int64(1), object(5)
memory usage: 52.1+ KB


In [None]:
print(df['category'].unique())

['Travail' 'Argent' 'Famille' 'Logement' 'Protection sociale' 'Justice'
 'Etrangers']


In [None]:
len(df['subcategory'].unique())

50

In [None]:
df=df.drop(['subcategory', 'article_ids','extra_description','id'], axis=1)


In [None]:
df.head()

Unnamed: 0,question,category
0,Je suis travailleur salarié(e). Puis-je refuse...,Travail
1,Peut-on saisir tous mes revenus ?,Argent
2,Je suis marié(e). Nous sommes mariés. Dois-je ...,Famille
3,Je mets un kot en location (bail de droit comm...,Logement
4,Suis-je payé pendant la procédure du trajet de...,Travail


In [None]:
print(df['category'].value_counts())

category
Famille               339
Logement              304
Argent                177
Justice               151
Etrangers              63
Protection sociale     39
Travail                35
Name: count, dtype: int64


***Renaming the categories***

In [None]:
category_to_supercat = {
    "Justice": "Droit pénal",
    "Travail": "Droit du travail",
    "Logement": "Droit immobilier",
    "Argent": "Droit financier",
    "Famille": "Droit de la famille",
    "Protection sociale": "Droit de la protection sociale",
    "Etrangers": "Droit des étrangers"
}

In [None]:
df['super_category'] = df['category'].map(category_to_supercat)

In [None]:
df.head()

Unnamed: 0,question,category,super_category
0,Je suis travailleur salarié(e). Puis-je refuse...,Travail,Droit du travail
1,Peut-on saisir tous mes revenus ?,Argent,Droit financier
2,Je suis marié(e). Nous sommes mariés. Dois-je ...,Famille,Droit de la famille
3,Je mets un kot en location (bail de droit comm...,Logement,Droit immobilier
4,Suis-je payé pendant la procédure du trajet de...,Travail,Droit du travail


In [None]:
df=df.drop(['category'], axis=1)


In [None]:
df.head()

Unnamed: 0,question,super_category
0,Je suis travailleur salarié(e). Puis-je refuse...,Droit du travail
1,Peut-on saisir tous mes revenus ?,Droit financier
2,Je suis marié(e). Nous sommes mariés. Dois-je ...,Droit de la famille
3,Je mets un kot en location (bail de droit comm...,Droit immobilier
4,Suis-je payé pendant la procédure du trajet de...,Droit du travail


***Workflow for Data Augmentation:***

*   ***Translate the French queries to English***
*   ***Paraphrase the translated queries in English***
*   ***Translate the paraphrased back to French***




In [None]:


device = "cuda" if torch.cuda.is_available() else "cpu"
df["question"] = df["question"].fillna("").astype(str)

***Loading traslation models***

In [None]:
#FR->EN
fr_en_model_name = "Helsinki-NLP/opus-mt-fr-en"
fr_en_tok = MarianTokenizer.from_pretrained(fr_en_model_name)
fr_en_model = MarianMTModel.from_pretrained(fr_en_model_name).to(device)

def translate_fr_to_en_batch(texts, max_length=256):
    enc = fr_en_tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
    with torch.no_grad():
        out = fr_en_model.generate(
            **enc,
            max_length=max_length,
            num_beams=6,
            do_sample=False
        )
    decoded = fr_en_tok.batch_decode(out, skip_special_tokens=True)
    return decoded

#EN->FR
en_fr_model_name = "Helsinki-NLP/opus-mt-en-fr"
en_fr_tok = MarianTokenizer.from_pretrained(en_fr_model_name)
en_fr_model = MarianMTModel.from_pretrained(en_fr_model_name).to(device)

def translate_en_to_fr_batch(texts, max_length=256):
    enc = en_fr_tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
    with torch.no_grad():
        out = en_fr_model.generate(
            **enc,
            max_length=max_length,
            num_beams=6,
            do_sample=False
        )
    decoded = en_fr_tok.batch_decode(out, skip_special_tokens=True)
    return decoded

***Loading Pegasus model for paraphrasing***

In [None]:
pegasus_name = "tuner007/pegasus_paraphrase"
peg_tok = PegasusTokenizer.from_pretrained(pegasus_name)
peg_model = PegasusForConditionalGeneration.from_pretrained(pegasus_name).to(device)

def paraphrase_en_batch(texts, max_length=60):
    prompts = ["paraphrase: " + (t or "") for t in texts]
    enc = peg_tok(prompts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)

    with torch.no_grad():
        out = peg_model.generate(
            **enc,
            max_length=max_length,     # cap length of paraphrase
            num_beams=10,              # search quality
            num_return_sequences=1,    # exactly one paraphrase per input
            do_sample=False,           # deterministic
            no_repeat_ngram_size=3     # avoid trivial repeats
        )

    decoded = peg_tok.batch_decode(out, skip_special_tokens=True)
    return decoded

***Using a batched Pipeline to increase speed***

In [None]:

batch_size = 25
french_inputs = df["question"].tolist()

eng_trans = []
eng_paras = []
fr_paras = []

for start in tqdm(range(0, len(french_inputs), batch_size), desc="Paraphrasing & Back-translating"):
    # Slice a batch
    batch = french_inputs[start:start+batch_size]
    # Translate the french inputs
    batch_eng_trans = translate_fr_to_en_batch(batch, max_length=256)

    # English paraphrase
    batch_eng_paras = paraphrase_en_batch(batch_eng_trans, max_length=60)

    # Translate that paraphrase back to French
    batch_fr_paras = translate_en_to_fr_batch(batch_eng_paras, max_length=256)

    # Accumulate results
    eng_trans.extend(batch_eng_trans)
    eng_paras.extend(batch_eng_paras)
    fr_paras.extend(batch_fr_paras)

# Attach to dataframe
df["English_translated"] = eng_trans
df["English_Paraphrase"] = eng_paras
df["French_Paraphrase"]  = fr_paras


#save the dataset
out_path = "fr_english_trans_paras_data.csv"
df.to_csv(out_path, index=False)
print("done!")



***Organizing and saving the datasets***

In [28]:
df=pd.read_csv("fr_english_trans_paras_data.csv", encoding="utf-8" , sep=',')

In [29]:
df_melted = pd.melt(df,
                       id_vars=['super_category'],
                       value_vars=['question', 'English_translated', 'English_Paraphrase', 'French_Paraphrase'],
                       var_name='langage_column',
                       value_name='query')

In [30]:
df_melted.head()

Unnamed: 0,super_category,langage_column,query
0,Droit du travail,question,Je suis travailleur salarié(e). Puis-je refuse...
1,Droit financier,question,Peut-on saisir tous mes revenus ?
2,Droit de la famille,question,Je suis marié(e). Nous sommes mariés. Dois-je ...
3,Droit immobilier,question,Je mets un kot en location (bail de droit comm...
4,Droit du travail,question,Suis-je payé pendant la procédure du trajet de...


In [31]:
english_df = df_melted[df_melted['langage_column'].isin(['English_Paraphrase', 'English_translated'])][['query', 'super_category']]
french_df = df_melted[df_melted['langage_column'].isin(['French_Paraphrase', 'question'])][['query', 'super_category']]

# Reset indexes
english_df = english_df.reset_index(drop=True)
french_df = french_df.reset_index(drop=True)
#shuffling everything
english_df = english_df.sample(frac=1).reset_index(drop=True)
french_df = french_df.sample(frac=1).reset_index(drop=True)

***Preprocessing the datasets***

In [37]:

nlp = spacy.load("fr_core_news_sm")
def preprocess_text(text):
    doc= nlp(text)
    tokens=[]
    for token in doc:
        if not token.is_stop and not token.is_punct:
            tokens.append(token.lemma_.lower().strip())
    return " ".join(tokens)

french_df['processed_text'] = french_df['query'].astype(str).apply(preprocess_text)

In [38]:

nlp = spacy.load("en_core_web_sm")
def preprocess_text(text):
    doc= nlp(text)
    tokens=[]
    for token in doc:
        if not token.is_stop and not token.is_punct:
            tokens.append(token.lemma_.lower().strip())
    return " ".join(tokens)

english_df['processed_text'] = english_df['query'].astype(str).apply(preprocess_text)

***Treating the marié(e and such parentheses problems with regex***

In [39]:
import re
pattern = r"\w+\(\w+"  # a word, then '(', then another word/letter

matches = french_df[french_df["processed_text"].str.contains(pattern, regex=True, na=False)]

unique_matches = set()
for text in french_df["processed_text"].dropna():
    found = re.findall(pattern, text)
    unique_matches.update(found)

print(unique_matches)

{'le(s', 'seul(e', 'domicilié(e', 'présent(e', 'endetté(e', 'salarié(e', 'marié(e', 'époux(se'}


In [40]:
import re
def strip_orphan_paren_suffix(text: str) -> str:
    text = str(text)
    # remove "(" + any non-whitespace that follows, but only when it comes right after a letter
    cleaned = re.sub(r"(?<=[A-Za-zÀ-ÖØ-öø-ÿ])\([^ \t\n\r\f\v]+(?=\s|$)", "", text)
    # collapse extra spaces from deletions
    cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
    return cleaned

In [41]:
french_df["processed_text_regex"] = french_df["processed_text"].apply(strip_orphan_paren_suffix)

In [42]:
#just to make sure everything is working
unique_matches = set()
for text in french_df["processed_text_regex"].dropna():
    found = re.findall(pattern, text)
    unique_matches.update(found)

print(unique_matches)

set()


***Encoding the categories to numerical data using one hot encoding***

In [43]:
#just renaming and dropping unnecessary columns for easier manipulation
english_df=english_df.rename(columns={'super_category':'category'})

french_df=french_df.drop(['processed_text','query'],axis=1)

french_df=french_df.rename(columns={'super_category':'category','processed_text_regex':'processed_text'})


In [44]:
#applying the one-hot encoding to langage column

english_df['language'] = 'en'
french_df['language'] = 'fr'

combined_df = pd.concat([english_df, french_df], ignore_index=True)
df_encoded = pd.get_dummies(combined_df, columns=['language'], prefix=['lang'])

language_cols = [col for col in df_encoded.columns if col.startswith('lang_')]
categorical_features =language_cols

In [45]:
combined_df.category.value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Droit de la famille,1356
Droit immobilier,1216
Droit financier,708
Droit pénal,604
Droit des étrangers,252
Droit de la protection sociale,156
Droit du travail,140


***Training a bilingual model on the concatenated data***

In [47]:
vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),  # unigrams and bigrams
        min_df=2,  # ignore terms that appear in less than 2 documents
        max_df=0.8,  # ignore terms that appear in more than 80% of documents
        stop_words=None)

text_features = vectorizer.fit_transform(df_encoded['processed_text'])

In [48]:
y = combined_df['category'].astype('category').cat.codes
category_names = combined_df['category'].astype('category').cat.categories.tolist()
print("=== TARGET ===")
print(f"Target categories: {category_names}")
print(f"Target distribution: {Counter(y)}")
categorical_features_array = df_encoded[categorical_features].values
X_combined = hstack([text_features, categorical_features_array])
print(f"\n===FEATURES ===")
print(f"Combined shape: {X_combined.shape}")
print(f"Components: TF-IDF ({text_features.shape[1]}) + Language ({len(categorical_features)})")
print(f"Total features: {X_combined.shape[1]}")

=== TARGET ===
Target categories: ['Droit de la famille', 'Droit de la protection sociale', 'Droit des étrangers', 'Droit du travail', 'Droit financier', 'Droit immobilier', 'Droit pénal']
Target distribution: Counter({0: 1356, 5: 1216, 4: 708, 6: 604, 2: 252, 1: 156, 3: 140})

===FEATURES ===
Combined shape: (4432, 5002)
Components: TF-IDF (5000) + Language (2)
Total features: 5002


In [49]:
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y,
    test_size=0.2,
    random_state=2025,
    stratify=y  # maintain category distribution
)

***The data is imbalanced , so SMOTE is applied***

In [50]:
smote = SMOTE(random_state=42, k_neighbors=3)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [51]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

print("=== TRAINING RANDOM FOREST ===")
rf_model.fit(X_train_balanced, y_train_balanced)

# Make predictions
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, rf_predictions, target_names=category_names))

=== TRAINING RANDOM FOREST ===
Random Forest Accuracy: 0.7306

Classification Report:
                                precision    recall  f1-score   support

           Droit de la famille       0.89      0.73      0.80       271
Droit de la protection sociale       0.56      0.45      0.50        31
           Droit des étrangers       0.93      0.84      0.89        51
              Droit du travail       0.81      0.79      0.80        28
               Droit financier       0.77      0.47      0.59       142
              Droit immobilier       0.93      0.79      0.85       243
                   Droit pénal       0.41      0.93      0.57       121

                      accuracy                           0.73       887
                     macro avg       0.76      0.71      0.71       887
                  weighted avg       0.81      0.73      0.74       887



In [52]:
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    multi_class='ovr',
    C=1.0
)

print("=== TRAINING LOGISTIC REGRESSION ===")
lr_model.fit(X_train_balanced, y_train_balanced)

# Make predictions
lr_predictions = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)

print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, lr_predictions, target_names=category_names))


=== TRAINING LOGISTIC REGRESSION ===




Logistic Regression Accuracy: 0.9019

Classification Report:
                                precision    recall  f1-score   support

           Droit de la famille       0.93      0.93      0.93       271
Droit de la protection sociale       0.57      0.65      0.61        31
           Droit des étrangers       0.98      0.92      0.95        51
              Droit du travail       0.63      0.79      0.70        28
               Droit financier       0.87      0.84      0.85       142
              Droit immobilier       0.95      0.94      0.94       243
                   Droit pénal       0.93      0.92      0.93       121

                      accuracy                           0.90       887
                     macro avg       0.84      0.85      0.84       887
                  weighted avg       0.91      0.90      0.90       887



In [54]:
import joblib

# Save the best model
joblib.dump(lr_model, 'log_reg_hhh.pkl')


# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vec_hhh.pkl')


# Save category names and feature info
model_info = {
    'category_names': category_names,
    'categorical_features': categorical_features,
    'model_name': 'Logistic Regression',
    'accuracy': lr_accuracy
}
joblib.dump(model_info, 'model_info_hhh.pkl')
print("Done")


Done
