In [5]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1


In [16]:
!pip install tqdm

  pid, fd = os.forkpty()




In [6]:
import pandas as pd
import transformers
from sentence_transformers import SentenceTransformer
import torch 
import torch.nn.functional as F
import numpy as np

In [7]:
from transformers import logging

# Suppress tokenizer warnings
logging.set_verbosity_error()

# Data collection and preprocessing

## Uploading the data set

In [11]:
df = pd.read_csv('../input/juridia-hackhaton-relevance-competition/train.csv')


In [5]:
df.head(5)

Unnamed: 0,question,article
0,Je suis travailleur salarié(e). Puis-je refuse...,Les dispositions du présent titre s'appliquent...
1,Je suis travailleur salarié(e). Puis-je refuse...,"Les travailleuses visées à l'article X.5-1, al..."
2,Je suis travailleur salarié(e). Puis-je refuse...,Lorsqu'une personne occupe des domestiques et ...
3,Je suis travailleur salarié(e). Puis-je refuse...,L'employeur effectue l'analyse des risques vis...
4,Je suis travailleur salarié(e). Puis-je refuse...,Les résultats de ladite analyse des risques et...


In [8]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
import random

def generate_pairs(data):
    pairs = []
    for i, row in data.iterrows():
        # Positive pair
        pairs.append({"question": row["question"], "article": row["article"], "label": 1})
        
        # Negative pair
        random_answer = data.loc[random.randint(0, len(data) - 1), "article"]
        if random_answer != row["article"]:  # Avoid accidental positives
            pairs.append({"question": row["question"], "article": random_answer, "label": 0})
    
    return pairs

# Generate pairs
pairs = generate_pairs(df)


In [8]:
pairs[0]

{'question': 'Je suis travailleur salarié(e). Puis-je refuser de faire des heures supplémentaires ou de travailler de nuit ?',
 'article': "Les dispositions du présent titre s'appliquent aux employeurs et aux travailleuses visés à l'article 1er de la loi sur le travail du 16 mars 1971.Elles s'appliquent notamment aux travailleuses visées à l'alinéa 1er, pendant la grossesse, après l'accouchement et pendant l'allaitement.",
 'label': 1}

# Model and Tokenizer Setup

In [13]:
from transformers import CamembertTokenizer

# Initialize tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

def preprocess_data(pairs, tokenizer, max_length=256):
    inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for pair in pairs:
        # Tokenize question and article
        tokenized = tokenizer(
            pair["question"], 
            pair["article"], 
            max_length=max_length, 
            truncation=True, 
            padding="max_length", 
            return_tensors="pt"
        )
        inputs["input_ids"].append(tokenized["input_ids"].squeeze(0))
        inputs["attention_mask"].append(tokenized["attention_mask"].squeeze(0))
        inputs["labels"].append(pair["label"])
    return inputs

# Preprocess the pairs
processed_data = preprocess_data(pairs, tokenizer)



In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

class RelevanceDataset(Dataset):
    def __init__(self, inputs):
        self.input_ids = inputs["input_ids"]
        self.attention_mask = inputs["attention_mask"]
        self.labels = inputs["labels"]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Create dataset and dataloader
dataset = RelevanceDataset(processed_data)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [15]:
import torch
import torch.nn as nn
from transformers import CamembertModel

class RelevanceModel(nn.Module):
    def __init__(self, pretrained_model_name="camembert-base"):
        super(RelevanceModel, self).__init__()
        # Embedding layer (Pretrained Language Model)
        self.embedding_model = CamembertModel.from_pretrained(pretrained_model_name)
        self.hidden_size = 768  # CamemBERT hidden size
        # Dense layer
        self.dense = nn.Linear(self.hidden_size, 256)
        self.dropout = nn.Dropout(0.2)
        # Normalization layer
        self.normalization = nn.LayerNorm(256)
        # Output layer (raw logits)
        self.output = nn.Linear(256, 1)

    def forward(self, input_ids, attention_mask):
        # Extract embeddings from the pretrained model
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
        
        # Pass through dense and normalization layers
        dense_output = self.dense(cls_embedding)
        dense_output = self.dropout(dense_output)
        normalized_output = self.normalization(dense_output)
        
        # Return raw logits (no sigmoid here)
        logits = self.output(normalized_output)
        return logits  # raw logits, to be processed by BCEWithLogitsLoss


In [29]:
model = RelevanceModel()

In [30]:
model = model.to(device)

In [31]:
# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Safe for mixed precision
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


In [14]:
from torch.nn import DataParallel
from transformers import CamembertTokenizer
from torch.cuda.amp import autocast, GradScaler

In [15]:
print(f"Using {torch.cuda.device_count()} GPUs")

Using 2 GPUs


In [16]:
print(next(model.parameters()).device)


cpu


# Model training 

In [32]:
import tqdm as tqdm
scaler = GradScaler()
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc=f"Training Epoch {epoch+1}"):
        # Move the batch tensors to the same device as the model
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # Mixed precision forward pass
        with autocast():
            logits = model(input_ids, attention_mask).squeeze()  # Ensure all tensors are on the same device
            loss = criterion(logits, labels)  # Use logits directly

        # Backward pass and optimizer step
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1} Loss: {total_loss / len(dataloader)}")


  scaler = GradScaler()
  with autocast():
Training Epoch 1: 100%|██████████| 723/723 [03:22<00:00,  3.57it/s]


Epoch 1 Loss: 0.43546830918408036


Training Epoch 2: 100%|██████████| 723/723 [03:28<00:00,  3.46it/s]


Epoch 2 Loss: 0.2482410333509638


Training Epoch 3: 100%|██████████| 723/723 [03:27<00:00,  3.48it/s]


Epoch 3 Loss: 0.19609222023452877


Training Epoch 4: 100%|██████████| 723/723 [03:27<00:00,  3.48it/s]


Epoch 4 Loss: 0.17088880258667724


Training Epoch 5: 100%|██████████| 723/723 [03:27<00:00,  3.49it/s]


Epoch 5 Loss: 0.1490711303734392


Training Epoch 6: 100%|██████████| 723/723 [03:27<00:00,  3.49it/s]


Epoch 6 Loss: 0.13932970282814902


Training Epoch 7: 100%|██████████| 723/723 [03:27<00:00,  3.49it/s]


Epoch 7 Loss: 0.13464038814340729


Training Epoch 8: 100%|██████████| 723/723 [03:26<00:00,  3.49it/s]


Epoch 8 Loss: 0.12157566955600985


Training Epoch 9: 100%|██████████| 723/723 [03:26<00:00,  3.49it/s]


Epoch 9 Loss: 0.12138241049617833


Training Epoch 10: 100%|██████████| 723/723 [03:26<00:00,  3.50it/s]

Epoch 10 Loss: 0.11567404688294734





In [37]:
# Save the model's state_dict
torch.save(model.state_dict(), "relevance_model.pth")


# testing the model output

In [47]:
import torch
from transformers import CamembertTokenizer
from torch.nn.functional import sigmoid

# Load the trained model
#model = RelevanceModel(pretrained_model_name="camembert-base")
# model.load_state_dict(torch.load("model_checkpoint.pth"))  # Load trained weights
model.eval()  # Set model to evaluation mode

# Load tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Define the question and answer
question = pairs[121]['question']
answer = pairs[121]['article']
label = pairs[121]['label']

# Preprocess the input
def preprocess_input(question, answer, tokenizer, max_length=256):
    tokenized = tokenizer(
        question,
        answer,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    return tokenized

# Tokenize input
inputs = preprocess_input(question, answer, tokenizer)

# Move inputs to the appropriate device (e.g., GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = {key: value.to(device) for key, value in inputs.items()}
model.to(device)

# Get the model's prediction
with torch.no_grad():
    logits = model(inputs["input_ids"], inputs["attention_mask"]).squeeze()

# Convert logits to probabilities
relevance_score = sigmoid(logits).item()  # Convert to a probability between 0.0 and 1.0

# Print the relevance score
print(f"Relevance Score: {relevance_score:.4f}")


Relevance Score: 0.9979


# Generate relevance for the test.csv to be submit it

In [4]:
df_test = pd.read_csv('../input/juridia-hackhaton-relevance-competition/test.csv')
df_test

Unnamed: 0,question,article,Id
0,Quels sont les critères communaux d'insalubrité ?,Le bourgmestre statue sur le rapport d'enquête...,0
1,A-t-on droit à l'allocation de naissance en ca...,§ 1er. Lorsqu'un enfant est décédé au moment d...,1
2,A-t-on droit à l'allocation de naissance en ca...,L'acte d'enfant sans vie mentionne :1° la date...,2
3,Quels frais peut-on ajouter lors d'un recouvre...,Dans les obligations qui se bornent au payemen...,3
4,Quels frais peut-on ajouter lors d'un recouvre...,"§ 1er. Le juge peut, d'office ou à la demande ...",4
...,...,...,...
1056,A qui dois-je payer ma dette ?,Le payement doit être fait au créancier ou à q...,1056
1057,Je suis marié(e). On prend un logement en loca...,Chaque époux perçoit seul ses revenus et les a...,1057
1058,Est-ce que je peux signer plusieurs baux de co...,Baux de courte duréePar dérogation à l'article...,1058
1059,Je suis victime de violences conjugales. En ta...,Dans les cas mentionnés aux articles 398 à 405...,1059


In [55]:
def predict_relevance(model, tokenizer, question, article, device):
    # Tokenize the input pair
    inputs = tokenizer(question, article, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Move inputs to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Get model predictions (logits)
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).squeeze()
        
    # Apply sigmoid to get probability (relevance score between 0 and 1)
    relevance_score = sigmoid(logits).item()
    return relevance_score


In [56]:
# List to store results
results = []

# Loop through the dataset and calculate relevance for each pair
for index, row in df_test.iterrows():
    question = row['question']
    article = row['article']
    id_ = row['Id']
    
    # Get the relevance score
    relevance_score = predict_relevance(model, tokenizer, question, article, device)
    
    # Append the result with the id and relevance score (rounded to 4 decimals)
    results.append({"Id": id_, "relevance_score": round(relevance_score, 4)})

# Create a DataFrame from the results
results_df = pd.DataFrame(results)


In [67]:
results_df.to_csv('submission.csv', index=False)

In [66]:
results_df

Unnamed: 0,Id,relevance_score
0,0,0.9978
1,1,0.9980
2,2,0.9972
3,3,0.0461
4,4,0.0003
...,...,...
1056,1056,0.9990
1057,1057,0.9323
1058,1058,0.9987
1059,1059,0.9980
