# Import packages

In [None]:
# Import packages
import pandas as pd
import torch
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

from transformers import CamembertForSequenceClassification, CamembertTokenizer, AdamW

from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import accuracy_score

from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

import os
os.system('pip install spacy')
os.system('python -m spacy download fr_core_news_sm')

import spacy
import string
nlp = spacy.load('fr_core_news_sm')

import gensim.downloader as api
import random

# Discovering the CamemBERT model

Facing the reality that classic ML models were limited in achieving a high accuracy (maximum accuracy so far = 0.46), we embarked on a quest for innovative solutions. In our pursuit, we encountered the CamemBERT model, a cutting-edge neural network architecture tailored for natural language understanding tasks.

How does it work? First, CamemBERT undergoes a pre-training phase where it familiarizes itself with the nuances of the French language by digesting vast amounts of text data. During this stage, it learns to comprehend relationships between words and sentences, leveraging a technique called self-attention to capture contextual dependencies effectively. Once pre-training is complete, CamemBERT can be fine-tuned for specific tasks, such as predicting the difficulty of French sentences.

## Train on 80% of training data, test on 20% of training data

In [None]:
# Load the training data
training = pd.read_csv("https://raw.githubusercontent.com/cvermno/ML-Project/main/Datasets/training_data.csv")

# Split your data into features (X) and target variable (y)
X = training['sentence']
y = training['difficulty']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 123)

# Encode labels: convert difficulty levels (A1, A2, etc.) into numerical labels.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define a custom dataset class: this tokenizes the input sentences using the CamemBERT tokenizer and prepares them for input to the model.
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),  # Assuming labels are already numerical
        }

# Define the model's parameters
MAX_LEN = 310
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_EPOCHS = 20

# Initialize the CamemBERT tokenizer and the CamemBERT model with the base pre-trained weights
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)

# Prepare datasets using the custom dataset class
train_dataset = CustomDataset(X_train.values, y_train_encoded, tokenizer, MAX_LEN)
test_dataset = CustomDataset(X_test.values, y_test_encoded, tokenizer, MAX_LEN)

# Create data loaders to efficiently feed batches of data to the model during training and evaluation
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch} of {NUM_EPOCHS}")
    model.train()
    total_loss = 0

    k = 1
    for batch in train_loader:
        print(f"\tBatch {k} of {len(train_loader)}")
        k += 1
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

# Test loop
model.eval()
test_preds = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_labels, test_preds)
print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {total_loss:.4f}, Test Acc: {test_acc:.4f}")

# Convert encoded labels back to original labels
y_test_decoded = label_encoder.inverse_transform(test_labels)

# Create a DataFrame with sentences and corresponding difficulty predictions
output_data = pd.DataFrame({"id": X_test.index, "difficulty": y_test_decoded})
output_data

Takeaways: The test accuracy (0.5229) notably indicates high performance, while the loss (6.1162) remains relatively low, suggesting effective model training. The model shows great potential!

## Train on 100% training data, test on the unlabelled test data

In [None]:
# Load the training and test data
training = pd.read_csv("https://raw.githubusercontent.com/cvermno/ML-Project/main/Datasets/training_data.csv")
test = pd.read_csv("https://raw.githubusercontent.com/cvermno/ML-Project/main/Datasets/unlabelled_test_data.csv")

# Split your train data into features (X) and target variable (y)
X = training['sentence']
y = training['difficulty']

# Split your test data into features (X) and target variable (y)
X_final = test['sentence']

# Encode labels: convert difficulty levels (A1, A2, etc.) into numerical labels.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Define a custom dataset class: this tokenizes the input sentences using the CamemBERT tokenizer and prepares them for input to the model.
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),  # Assuming labels are already numerical
        }

# Define the model's parameters
MAX_LEN = 310
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_EPOCHS = 20

# Initialize the CamemBERT tokenizer and the CamemBERT model with the base pre-trained weights
tokenizer_final = CamembertTokenizer.from_pretrained("camembert-base")
model_final = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)

# Prepare datasets using the custom dataset class
train_dataset = CustomDataset(X.values, y_encoded, tokenizer_final, MAX_LEN)
test_dataset = CustomDataset(X_final.values, [0]*len(X_final), tokenizer_final, MAX_LEN)

# Create data loaders to efficiently feed batches of data to the model during training and evaluation
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define optimizer and loss function
optimizer = AdamW(model_final.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_final.to(device)

for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch} of {NUM_EPOCHS}")
    model_final.train()
    total_loss = 0

    k = 1
    for batch in train_loader:
        print(f"\tBatch {k} of {len(train_loader)}")
        k += 1
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model_final(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

# Test loop
model_final.eval()
test_predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass
        outputs = model_final(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()

        # Get predicted labels
        predicted_labels = np.argmax(logits, axis=1)
        test_predictions.extend(predicted_labels)

# Convert predicted labels back to original labels
predicted_labels = label_encoder.inverse_transform(test_predictions)

# Save the predictions to a CSV file
test['predicted_difficulty'] = predicted_labels
test.drop(columns=['sentence'], inplace=True)
test.rename(columns={'predicted_difficulty': 'difficulty'}, inplace=True)
test.to_csv('final.csv', index=False)

# Download the CSV file
from google.colab import files
files.download("final.csv")

# Finetuning the model: adjusting the parameters

Finetuning the camemBERT model involves adjusting its parameters to minimize the disparity between predicted and actual difficulty levels. Let's take a closer look at the definition and significance of each parameter, understanding its role within the model:

1. **MAX_LEN**: This is the maximum number of words the model looks at in a sentence. Sequences longer than this length will be truncated, while shorter sequences will be padded to match this length. Given that the maximum number of words in our dataset sentences is 304, we opt to set the maximum length to 310. This choice aims to preserve all valuable information within sentences, while maintaining computational efficiency.
2. **BATCH_SIZE**: This is the number of sentences processed simultaneously by the model during each iteration of training or evaluation. Larger batch sizes typically result in faster training but may require more memory. Conversely, smaller batch sizes may lead to slower training but can sometimes yield better generalization. A common practice is to use a batch size that divides the total number of samples evenly. With 1200 sentences in our test set, a batch size of 32 would be suitable, providing approximately 37 batches for the entire dataset. We validated this choice empirically by experimenting with different BATCH_SIZE values.
3. **LEARNING_RATE**: This is the step size (gradient descent) taken during optimization to update the model's parameters. A higher learning rate allows for faster convergence but may lead to overshooting the optimal solution or instability in training. On the other hand, a lower learning rate may result in slower convergence but can yield more stable training and better performance. A commonly recommended starting point is to use a learning rate within the range of 1e-5 to 5e-5 for fine-tuning CamemBERT models.
4. **NUM_EPOCHS**: This is how many times the model looks at the entire dataset. Training for more epochs allows the model to learn from the data multiple times, potentially improving its performance. However, training for too many epochs can lead to overfitting, where the model memorizes the training data and performs poorly on unseen data. Starting with a relatively small number of epochs, such as 10, is a common practice in model training.

Below is a summary table presenting the accuracy levels achieved for various parameter configurations. This table was generated by importing the predictions dataframe for the model trained on 100% of the training data to Kaggle.

In [5]:
parameters_optimization = [
    ["MAX_LEN", "BATCH_SIZE", "LEARNING_RATE", "NUM_EPOCHS", "Accuracy"],
    [310, 16, "2e-5", 10, 0.581],
    [310, 16, "2e-5", 20, 0.555],
    [310, 16, "1e-5", 10, 0.557],
    [310, 16, "1e-5", 20, 0.540],
    [310, 32, "2e-5", 10, 0.583],
    [310, 32, "2e-5", 20, 0.521],
    [310, 32, "1e-5", 10, 0.551],
    [310, 32, "1e-5", 20, 0.518],
    [310, 16, "2e-5", 5, 0.571],
    [310, 32, "2e-5", 5, 0.554],
    [310, 16, "10e-5", 20, 0.155]
]

parameters_optimization_df = pd.DataFrame(parameters_optimization[1:], columns=parameters_optimization[0])
parameters_optimization_df

Unnamed: 0,MAX_LEN,BATCH_SIZE,LEARNING_RATE,NUM_EPOCHS,Accuracy
0,310,16,2e-05,10,0.581
1,310,16,2e-05,20,0.555
2,310,16,1e-05,10,0.557
3,310,16,1e-05,20,0.54
4,310,32,2e-05,10,0.587
5,310,32,2e-05,20,0.521
6,310,32,1e-05,10,0.551
7,310,32,1e-05,20,0.518
8,310,16,2e-05,5,0.571
9,310,32,2e-05,5,0.554


# Exploring further text processing techniques: lemmatization

Lemmatization is the process of reducing words to their base or root form. Here, it is applied to the text data before tokenization. This ensures that the tokenized text used for training and inference contains standardized representations of words, potentially improving model performance and interpretability. Let's investigate its efficacy.

In [None]:
# TRAIN ON 80% OF THE TRAINING DATA, TEST ON 20% OF THE TRAINING DATA

# Load the training data
training = pd.read_csv("https://raw.githubusercontent.com/cvermno/ML-Project/main/Datasets/training_data.csv")

# Create tokenizer function for preprocessing
def spacy_tokenizer(text):

    # Define stopwords, punctuation, and numbers
    stop_words = stopwords.words('french')
    punctuations = string.punctuation +'–' + '—'
    numbers = "0123456789"

    # Create spacy object
    mytokens = nlp(text)

    # Lemmatize each token and convert each token into lowercase
    mytokens = ([ word.lemma_.lower().strip() for word in mytokens ])

    # Remove stop words and punctuation
    mytokens = ([ word for word in mytokens
                 if word not in stop_words and word not in punctuations ])

    # Remove sufix like ".[1" in "experience.[1"
    mytokens_2 = []
    for word in mytokens:
        for char in word:
            if (char in punctuations) or (char in numbers):
                word = word.replace(char, "")
        if word != "":
            mytokens_2.append(word)

    # Return preprocessed list of tokens
    return mytokens_2

# Tokenize texts in training data
training['processed_sentence'] = training['sentence'].apply(spacy_tokenizer)
training['processed_sentence'] = training['processed_sentence'].apply(lambda x: ' '.join(x))

# Split your train data into features (X) and target variable (y)
X = training['processed_sentence']
y = training['difficulty']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 123)

# Encode labels: convert difficulty levels (A1, A2, etc.) into numerical labels.from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define a custom dataset class: this tokenizes the input sentences using the CamemBERT tokenizer and prepares them for input to the model.
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),  # Assuming labels are already numerical
        }

# Define the model's parameters
MAX_LEN = 310
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_EPOCHS = 20

# Initialize the CamemBERT tokenizer and the CamemBERT model with the base pre-trained weights
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)

# Prepare datasets using the custom dataset class
train_dataset = CustomDataset(X_train.values, y_train_encoded, tokenizer, MAX_LEN)
test_dataset = CustomDataset(X_test.values, y_test_encoded, tokenizer, MAX_LEN)

# Create data loaders to efficiently feed batches of data to the model during training and evaluation
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch} of {NUM_EPOCHS}")
    model.train()
    total_loss = 0

    k = 1
    for batch in train_loader:
        print(f"\tBatch {k} of {len(train_loader)}")
        k += 1
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

# Test loop
model.eval()
test_preds = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_labels, test_preds)
print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {total_loss:.4f}, Test Acc: {test_acc:.4f}")

# Convert encoded labels back to original labels
y_test_decoded = label_encoder.inverse_transform(test_labels)

# Create a DataFrame with sentences and corresponding difficulty predictions
output_data = pd.DataFrame({"id": X_test.index, "difficulty": y_test_decoded})
output_data

Takeaways: After running this model with various parameters, we observed that the accuracy level did not surpass that of the previous model (0.4729 < 0.5229). Additionally, the loss more than doubled (14.7011 > 6.1162), indicating a less effective performance compared to the previous configuration.

# Integrating data augmentation methods

Data augmentation by synonym replacement is a technique used to increase the diversity of training data. In this technique, words in the text are replaced with their synonyms while preserving the overall meaning of the text. The goal of this approach is to improve the robustness and generalization ability of machine learning models trained on that data.

We will explore two different approaches: using word embeddings or leveraging pre-existing libraries. Let's delve into each of these approaches sequentially.

## Enriching training data using word embedding








What exactly is word embedding? It's a technique to represent words as vectors in a high-dimensional space. These vectors capture semantic relationships between words, meaning that similar words are represented by similar vectors.

Within this data augmentation approach, words are replaced with similar ones based on their embeddings. Hence the model may learn more robust representations of language patterns and semantics.

In [None]:
# TRAIN ON 80% OF THE TRAINING DATA, TEST ON 20% OF THE TRAINING DATA

# Load the training data
training = pd.read_csv("https://raw.githubusercontent.com/cvermno/ML-Project/main/Datasets/training_data.csv")

# Split your data into features (X) and target variable (y)
X = training['sentence']
y = training['difficulty']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 123)

# Encode labels: convert difficulty levels (A1, A2, etc.) into numerical labels.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Load pre-trained French word embeddings
french_word_vectors = api.load("word2vec-ruscorpora-300")

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len, augment=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment

    def word_embedding_augmentation(self, text):
        augmented_text = []
        for word in text.split():
            if word in french_word_vectors:
                similar_words = french_word_vectors.most_similar(word, topn=5)
                augmented_text.append(similar_words[0][0])  # Choose the most similar word
            else:
                augmented_text.append(word)
        return ' '.join(augmented_text)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        if self.augment:
            text = self.word_embedding_augmentation(text)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),  # Assuming labels are already numerical
        }

# Define the model's parameters
MAX_LEN = 310
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_EPOCHS = 20

# Initialize the CamemBERT tokenizer and the CamemBERT model with the base pre-trained weights
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)

# Prepare datasets using the custom dataset class
train_dataset = CustomDataset(X_train.values, y_train_encoded, tokenizer, MAX_LEN, augment=True)
test_dataset = CustomDataset(X_test.values, y_test_encoded, tokenizer, MAX_LEN)

# Create data loaders to efficiently feed batches of data to the model during training and evaluation
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch} of {NUM_EPOCHS}")
    model.train()
    total_loss = 0

    k = 1
    for batch in train_loader:
        print(f"\tBatch {k} of {len(train_loader)}")
        k += 1
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

# Test loop
model.eval()
test_preds = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_labels, test_preds)
print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {total_loss:.4f}, Test Acc: {test_acc:.4f}")

# Convert encoded labels back to original labels
y_test_decoded = label_encoder.inverse_transform(test_labels)

# Create a DataFrame with sentences and corresponding difficulty predictions
output_data = pd.DataFrame({"id": X_test.index, "difficulty": y_test_decoded})
output_data

Takeaways: Utilizing word embeddings to replace words with synonyms results in a  higher accuracy (0.5802 > 0.5229). However, there's a slight increase in loss (7.8193 > 6.1162), although it remains relatively consistent.

## Enriching training data using synonyms from NLTK's WordNet

In this method, words within sentences are randomly substituted with their synonyms retrieved from NLTK's WordNet, thereby enhancing the diversity of the training data by generating new variations of the original sentences.

In [None]:
# TRAIN ON 80% OF THE TRAINING DATA, TEST ON 20% OF THE TRAINING DATA

# Load the training data
training = pd.read_csv("https://raw.githubusercontent.com/cvermno/ML-Project/main/Datasets/training_data.csv")

# Split your data into features (X) and target variable (y)
X = training['sentence']
y = training['difficulty']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 123)

# Encode labels: convert difficulty levels (A1, A2, etc.) into numerical labels.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Function to get synonyms of a word
def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return synonyms

def synonym_replacement(sentence, n):
    words = word_tokenize(sentence)
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stopwords.words('french')]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    sentence = ' '.join(new_words)
    return sentence

# Define a custom dataset class: this tokenizes the input sentences using the CamemBERT tokenizer and prepares them for input to the model.
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len, augment=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        if self.augment:
           text = synonym_replacement(text, 1)  # You can adjust the number of replacements


        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),  # Assuming labels are already numerical
        }

# Define the model's parameters
MAX_LEN = 310
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_EPOCHS = 20

# Initialize the CamemBERT tokenizer and the CamemBERT model with the base pre-trained weights
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)

# Prepare datasets using the custom dataset class
train_dataset = CustomDataset(X_train.values, y_train_encoded, tokenizer, MAX_LEN, augment=True)
test_dataset = CustomDataset(X_test.values, y_test_encoded, tokenizer, MAX_LEN)

# Create data loaders to efficiently feed batches of data to the model during training and evaluation
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch} of {NUM_EPOCHS}")
    model.train()
    total_loss = 0

    k = 1
    for batch in train_loader:
        print(f"\tBatch {k} of {len(train_loader)}")
        k += 1
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

# Test loop
model.eval()
test_preds = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_labels, test_preds)
print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {total_loss:.4f}, Test Acc: {test_acc:.4f}")

# Convert encoded labels back to original labels
y_test_decoded = label_encoder.inverse_transform(test_labels)

# Create a DataFrame with sentences and corresponding difficulty predictions
output_data = pd.DataFrame({"id": X_test.index, "difficulty": y_test_decoded})
output_data

Takeaways: In contrast to the previous model, there's a reduction in accuracy (0.5604 < 0.5802), alongside an increase in loss (8.7431 > 7.8193). Consequently, this model fails to demonstrate any improvement in performance.