# Installing all libraries

In [1]:
# cell 1
!pip install --no-cache-dir transformers
!pip install --no-cache-dir evaluate
!pip install --no-cache-dir nltk
!pip install --no-cache-dir datasets
!pip install --no-cache-dir scikit-learn


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# Importing all libraries

In [2]:
# cell 2
import pandas as pd
from transformers import MarianTokenizer, MarianMTModel
import torch
import evaluate
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from datasets import Dataset
print("done importing")

done importing


# Cleaning and tokenisation function 

In [3]:
# Function to clean and tokenize sentences(cell 3)
def clean_and_tokenize(text):
    # Tokenize the text and remove unnecessary characters
    tokens = word_tokenize(text.lower())  # Tokenization
    return tokens
print("Done")


Done


# Data Preprocessing

In [4]:
# Load the dataset (replace 'your_dataset.csv' with the correct path)(cell 4)
df = pd.read_csv('/kaggle/input/legaldataset/cleaned_legal_dataset.csv')

# Split the dataset into train, validation, and test sets (80% train, 10% validation, 10% test)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Print the sizes of each split
print(f"Training data size: {len(train_df)}")
print(f"Validation data size: {len(val_df)}")
print(f"Test data size: {len(test_df)}")


Training data size: 275127
Validation data size: 34391
Test data size: 34391


# Appling cleaning and tokenisation(column wise)

In [5]:
# Apply cleaning and tokenization to the training, validation, and test datasets(cell 5)
train_df['cleaned_source_sentence'] = train_df['Source Sentence'].apply(clean_and_tokenize)
train_df['cleaned_target_sentence'] = train_df['Target Sentence'].apply(clean_and_tokenize)

val_df['cleaned_source_sentence'] = val_df['Source Sentence'].apply(clean_and_tokenize)
val_df['cleaned_target_sentence'] = val_df['Target Sentence'].apply(clean_and_tokenize)

test_df['cleaned_source_sentence'] = test_df['Source Sentence'].apply(clean_and_tokenize)
test_df['cleaned_target_sentence'] = test_df['Target Sentence'].apply(clean_and_tokenize)
print("cleaning and tokenization to the training, validation, and test datasets done")

cleaning and tokenization to the training, validation, and test datasets done


# converting to hugging face datasets

In [6]:
# Convert pandas DataFrames to Hugging Face Datasets(cell 6)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)
print("Convert pandas DataFrames to Hugging Face Datasets done")

Convert pandas DataFrames to Hugging Face Datasets done


In [7]:
print(train_dataset.column_names)


['Source Sentence', 'Target Sentence', 'cleaned_source_sentence', 'cleaned_target_sentence', '__index_level_0__']


# Defining tokeniser

In [8]:
from transformers import AutoTokenizer

# Define the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print("done")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

done


# Tokenisation of source and target sentence

In [9]:
def tokenize_function(examples):
    # Tokenizing the source sentences (input)
    model_inputs = tokenizer(examples["cleaned_source_sentence"], max_length=128, padding="max_length", truncation=True)
    
    # Tokenizing the target sentences (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["cleaned_target_sentence"], max_length=128, padding="max_length", truncation=True)
    
    # Adding labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
print("done!!!!")

done!!!!


In [10]:
from transformers import MarianTokenizer
from datasets import Dataset

# Load the Marian tokenizer for English to French
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

def tokenize_function(examples):
    # Tokenize the source sentences (e.g., English)
    model_inputs = tokenizer(
        examples["Source Sentence"], max_length=128, padding="max_length", truncation=True
    )
    
    # Tokenize the target sentences (e.g., French)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["Target Sentence"], max_length=128, padding="max_length", truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Apply tokenization in batches (adjust batch_size as needed)
train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=32)  # Adjust batch size to 32 for testing
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=32)
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=32)

# Remove unnecessary columns to save memory after tokenization
train_dataset = train_dataset.remove_columns(["Source Sentence", "Target Sentence"])
val_dataset = val_dataset.remove_columns(["Source Sentence", "Target Sentence"])
test_dataset = test_dataset.remove_columns(["Source Sentence", "Target Sentence"])

# Inspect the tokenized data (first 5 rows)
print(train_dataset[:5])


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



Map:   0%|          | 0/275127 [00:00<?, ? examples/s]



Map:   0%|          | 0/34391 [00:00<?, ? examples/s]

Map:   0%|          | 0/34391 [00:00<?, ? examples/s]

{'cleaned_source_sentence': [['decision', '93/467/eec', 'is', 'hereby', 'amended', 'as', 'follows', ':'], ['a', ')', 'la', 'valeur', 'de', 'la', 'production', 'commercialisée', 'est', 'inférieure', 'au', 'montant', 'utilisé', 'pour', 'le', 'calcul', 'de', "l'aide", 'visée', 'à', "l'article", '3', ',', 'ou'], ['no', 'state', 'or', 'regional', 'economic', 'integration', 'organization', 'may', 'deposit', 'an', 'instrument', 'of', 'ratification', ',', 'acceptance', ',', 'aproval', 'or', 'accession', 'to', 'this', 'amendment', 'unless', 'it', 'has', 'previously', ',', 'or', 'simultaneously', ',', 'deposited', 'such', 'an', 'instrument', 'to', 'the', 'amendment', 'adopted', 'at', 'the', 'second', 'meeting', 'of', 'the', 'parties', 'in', 'london', ',', '29', 'june', '1990', '.'], ['6', ')', 'à', "l'annexe", 'i', 'point', '2', ',', 'le', 'texte', 'suivant', 'est', 'inséré', 'avant', 'la', 'ligne', 'zea', 'mays', 'du', 'tableau', ':'], ['d', ')', 'montants', 'des', 'primes', 'et', 'des', 'paiem

# Loading the pre-trained models

In [11]:
# Function to load pre-trained models and tokenizers
def load_model_and_tokenizer(model_name):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

# Load the models for English-to-French and French-to-English
tokenizer_en_fr, model_en_fr = load_model_and_tokenizer("Helsinki-NLP/opus-mt-en-fr")
tokenizer_fr_en, model_fr_en = load_model_and_tokenizer("Helsinki-NLP/opus-mt-fr-en")
print("load pre-trained models and tokenizers done")

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

load pre-trained models and tokenizers done


# Installing sentence-transformer model

In [12]:
pip install sentence-transformers


  pid, fd = os.forkpty()


model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1
Note: you may need to restart the kernel to use updated packages.


# Calculating BLEU Score,Cosine similarity,Fine tuning with RL,Implementation of user feedback loop and Implementation of interactive user input 

In [13]:
import torch
from transformers import MarianMTModel, MarianTokenizer, AdamW
from evaluate import load
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os

# Load BLEU metric and Sentence Transformer model for semantic similarity
bleu = load("bleu")
similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load pre-trained models and tokenizers
def load_model_and_tokenizer(model_name):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

# Initialize models and tokenizers
tokenizer_en_fr, model_en_fr = load_model_and_tokenizer("Helsinki-NLP/opus-mt-en-fr")
tokenizer_fr_en, model_fr_en = load_model_and_tokenizer("Helsinki-NLP/opus-mt-fr-en")
feedback_data = []  # To accumulate user feedback

# Reward function combining BLEU score and cosine similarity
def calculate_reward(reference, hypothesis):
    # Calculate BLEU score with smoothing
    bleu_score = bleu.compute(predictions=[hypothesis], references=[[reference]], smooth=True)["bleu"]
    
    # Scale the BLEU score to always be between 0.5 and 1
    bleu_score = max(0.5, min(1.0, bleu_score))  # Ensures BLEU score is within [0.5, 1]

    # Calculate cosine similarity
    ref_embedding = similarity_model.encode([reference])
    hyp_embedding = similarity_model.encode([hypothesis])
    cosine_sim = cosine_similarity(ref_embedding, hyp_embedding)[0][0]

    # Normalize cosine similarity to be between 0 and 1
    cosine_sim_normalized = (cosine_sim + 1) / 2  # Converts cosine similarity from [-1, 1] to [0, 1]

    # Combine BLEU and cosine similarity scores
    final_score = 0.5 * bleu_score + 0.5 * cosine_sim_normalized

    # Ensure final score is never below 0.5
    final_score = max(0.5, final_score)

    return final_score

# Reinforcement Learning-based fine-tuning
def fine_tune_with_rl(model, tokenizer, feedback_data, num_epochs=3, learning_rate=5e-5):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    model.train()

    for epoch in range(num_epochs):
        total_reward = 0
        for feedback in feedback_data:
            input_ids = tokenizer(feedback["source"], return_tensors="pt", padding=True, truncation=True).input_ids
            reference = feedback["target"]

            # Generate translation
            outputs = model.generate(input_ids, max_length=128)
            generated_ids = outputs[0]
            hypothesis = tokenizer.decode(generated_ids, skip_special_tokens=True)

            # Calculate reward
            reward = calculate_reward(reference, hypothesis)
            total_reward += reward

            # Policy Gradient Update
            logits = model(input_ids=input_ids, decoder_input_ids=generated_ids[:-1].unsqueeze(0)).logits
            log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
            selected_log_probs = log_probs.gather(2, generated_ids[1:].unsqueeze(0).unsqueeze(2)).squeeze(2)
            loss = -torch.mean(selected_log_probs) * reward

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{num_epochs} completed. Total Reward: {total_reward / len(feedback_data):.4f}")

    # Save the fine-tuned model
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")
    print("Model fine-tuned using reinforcement learning and saved successfully.")

# Reload fine-tuned model
def load_fine_tuned_model():
    if os.path.exists("./fine_tuned_model"):
        tokenizer = MarianTokenizer.from_pretrained("./fine_tuned_model")
        model = MarianMTModel.from_pretrained("./fine_tuned_model")
        return tokenizer, model
    else:
        return tokenizer_en_fr, model_en_fr

# Translation function with feedback loop
def translate_and_evaluate(input_text, direction):
    # Predefined reference translations for BLEU score calculation
    reference_translations = {
        "The contract is governed by French law. In the event of a dispute, the parties undertake to submit their dispute to the exclusive jurisdiction of the French courts. Any modification of the contract must be made in writing and signed by both parties.":
        "Le contrat est régi par le droit français. En cas de litige, les parties s'engagent à soumettre leur différend à la compétence exclusive des juridictions françaises. Toute modification du contrat devra être faite par écrit et signée par les deux parties."
    }

    # Automatically set the reference translation
    reference_translation = reference_translations.get(input_text, "No reference available") 

    if direction == "en_fr":
        tokenizer = tokenizer_en_fr
        model = model_en_fr
    elif direction == "fr_en":
        tokenizer = tokenizer_fr_en
        model = model_fr_en
    else:
        print("Invalid translation direction.")
        return

    # Translate input text
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).input_ids
    output_ids = model.generate(input_ids, max_length=128)
    translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Calculate BLEU score and cosine similarity
    final_score = calculate_reward(reference_translation, translated_text)

    # Print output in a clean and organized way
    print("\n--- Translation ---")
    print("-" * 50)
    print(f"Input Text: \n{input_text}")
    print("-" * 50)
    print(f"Translated Text: \n{translated_text}")
    print("-" * 50)
    print(f"Final Score (BLEU + Cosine Similarity): {final_score:.4f}")
    print("-" * 50)

    # Gather user feedback
    feedback = input("\nIs the translation good? (yes/no): ").strip().lower()
    if feedback == "no":
        corrected_translation = input("Please provide the correct translation: ").strip()

        # Store feedback
        feedback_data.append({"source": input_text, "target": corrected_translation})
        print("Feedback recorded for reinforcement learning.")

        # Fine-tune model after collecting 3 feedback examples
        if len(feedback_data) >= 3:
            print("\nFine-tuning the model with collected feedback...")
            fine_tune_with_rl(model, tokenizer, feedback_data)
            feedback_data.clear()  # Clear feedback after fine-tuning
    else:
        print("Translation accepted.")

# Main function
def main():
    global tokenizer_en_fr, model_en_fr
    tokenizer_en_fr, model_en_fr = load_fine_tuned_model()

    print("\nWelcome to the Reinforcement Learning-Based Translation System!")
    print("You need to provide at least 3 feedbacks to fine-tune the model.\n")

    while True:
        input_text = input("\nEnter text for translation (or type 'exit' to quit): ").strip()
        if input_text.lower() == "exit":
            print("Exiting the translation system. Goodbye!")
            break

        direction = input("Enter translation direction (en_fr for English to French, fr_en for French to English): ").strip()

        translate_and_evaluate(input_text, direction)

# Run the system
if __name__ == "__main__":
    main()


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Welcome to the Reinforcement Learning-Based Translation System!
You need to provide at least 3 feedbacks to fine-tune the model.



model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]


Enter text for translation (or type 'exit' to quit):  Any contracting State which, under its domestic law, requires as a condition of respect for copyright formalities such as deposit, registration, notification, notarial certification, payment of fees or manufacture or publication in that contracting State shall consider these conditions fulfilled for all works protected in accordance with this Convention and first published outside its territory and whose author is not one of its nationals, if, from the time of first publication, all copies of the work published with the authority of the author or other copyright owner bear the symbol © accompanied by the name of the copyright owner and the year of first publication placed in the manner and in the place of reasonable notice of the copyright claim.
Enter translation direction (en_fr for English to French, fr_en for French to English):  en_fr


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


--- Translation ---
--------------------------------------------------
Input Text: 
Any contracting State which, under its domestic law, requires as a condition of respect for copyright formalities such as deposit, registration, notification, notarial certification, payment of fees or manufacture or publication in that contracting State shall consider these conditions fulfilled for all works protected in accordance with this Convention and first published outside its territory and whose author is not one of its nationals, if, from the time of first publication, all copies of the work published with the authority of the author or other copyright owner bear the symbol © accompanied by the name of the copyright owner and the year of first publication placed in the manner and in the place of reasonable notice of the copyright claim.
--------------------------------------------------
Translated Text: 
Tout État contractant qui, en vertu de son droit interne, exige comme condition de respec


Is the translation good? (yes/no):  yes


Translation accepted.



Enter text for translation (or type 'exit' to quit):  exit


Exiting the translation system. Goodbye!
