# Funathon 2023 - Sujet 5

Analyse textuelle des commentaires clients de services de commande de repas en ligne

In [1]:
!pip install -r ../requirements.txt



In [2]:
# Download the review dataset once and keep a local parquet copy.
from pathlib import Path

import pandas as pd

DATA_URL = "https://minio.lab.sspcloud.fr/projet-funathon/2023/sujet5/diffusion/reviews_takeaway.parquet"
LOCAL_DATA_PATH = Path("reviews_takeaway.parquet")

if LOCAL_DATA_PATH.exists():
    # Reuse the local copy so that re-running the notebook does not hit the network again.
    df = pd.read_parquet(LOCAL_DATA_PATH)
else:
    df = pd.read_parquet(DATA_URL)
    # Local copy of the data, read back by the following cells.
    df.to_parquet(LOCAL_DATA_PATH)

# BERT

In [3]:
# import libraries

import pandas as pd

from sklearn.model_selection import train_test_split

from tqdm import tqdm

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader

In [4]:
# Reload the reviews from the local parquet copy.
local_reviews_file = "reviews_takeaway.parquet"
df = pd.read_parquet(local_reviews_file)

In [5]:
# Container used to convert the data to the InputExample format
class InputExample:
    """One example to be tokenized.

    Attributes:
        guid: identifier of the example (stored as given).
        text_a: first text segment.
        text_b: second text segment (stored as given, may be None).
        label: target value attached to the example.
    """

    def __init__(self, guid, text_a, text_b, label):
        # Plain attribute storage; no validation or conversion is performed.
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

def convert_data_to_examples(df, input_column, target_column):
    """Turn each row of ``df`` into an ``InputExample``.

    Args:
        df: DataFrame holding the data.
        input_column: name of the column used as ``text_a``.
        target_column: name of the column used as ``label``.

    Returns:
        A list with one ``InputExample`` per row, in row order. ``guid`` and
        ``text_b`` are always ``None``.
    """
    # Iterate over the two columns directly instead of DataFrame.iterrows():
    # iterrows() builds a Series per row (slow) and may upcast values to a
    # common dtype, which can turn integer labels into floats.
    return [
        InputExample(guid=None, text_a=text, text_b=None, label=label)
        for text, label in zip(df[input_column], df[target_column])
    ]

class CustomDataset(Dataset):
    """Dataset that tokenizes examples on access.

    Args:
        examples: sequence of objects exposing ``text_a``, ``text_b`` and ``label``.
        tokenizer: object exposing ``encode_plus``; its result must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_length: length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, examples, tokenizer, max_length):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.examples)

    def __getitem__(self, index):
        """Tokenize the example at ``index`` and return its tensors as a dict."""
        sample = self.examples[index]
        encoded = self.tokenizer.encode_plus(
            sample.text_a,
            sample.text_b,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True
        )
        # Convert each tokenizer output to a tensor, then attach the label last.
        item = {
            field: torch.tensor(encoded[field])
            for field in ("input_ids", "attention_mask", "token_type_ids")
        }
        item["label"] = torch.tensor(sample.label)
        return item

In [None]:
# Split
# Build the model input text from the title and the comment. Missing values are
# replaced by empty strings: concatenating a string with a missing value yields
# a missing value, which could not be tokenized later on.
df['full_raw'] = (
    'TITRE: ' + df['title'].fillna('') + ' |AND| ' + 'COMMENT: ' + df['comment'].fillna('')
)
df_text = df[['note', 'full_raw']].copy()
# Shift the ratings down by one so they can serve as class ids
# (presumably 1-5 mapped to 0-4, matching num_labels=5 used for the model; confirm).
df_text['note'] = df_text['note'] - 1

# 80 % train, then the remaining 20 % split evenly into validation and test.
train_df, holdout_df = train_test_split(df_text, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

# Convert to the InputExample format
train_examples = convert_data_to_examples(train_df, 'full_raw', 'note')
val_examples = convert_data_to_examples(val_df, 'full_raw', 'note')
test_examples = convert_data_to_examples(test_df, 'full_raw', 'note')

# The datasets need the tokenizer, so it is created here: this keeps the notebook
# runnable from top to bottom on a fresh kernel.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

MAX_LENGTH = 128  # token length every example is padded/truncated to
BATCH_SIZE = 32

# Convert to the PyTorch dataset format (only the training data is shuffled)
train_dataset = CustomDataset(train_examples, tokenizer, max_length=MAX_LENGTH)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = CustomDataset(val_examples, tokenizer, max_length=MAX_LENGTH)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = CustomDataset(test_examples, tokenizer, max_length=MAX_LENGTH)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# Train
# Number of training epochs: with 10 epochs the model overfits from the 4th epoch on.
NUM_EPOCHS = 3

In [None]:
# BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Load pre-trained BERT model with a 5-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=5)

# Fine-tune the whole network: every parameter is kept trainable (nothing is frozen).
for param in model.parameters():
    param.requires_grad = True

# Define optimizer, loss function and device
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8)
# NOTE: loss_fn is not used by the loop below, which reads the loss computed by
# the model itself when `labels` is passed.
loss_fn = torch.nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# To run without a progress bar, iterate over range(NUM_EPOCHS) directly.
for epoch in tqdm(range(NUM_EPOCHS), desc="Training"):
    # ---- training pass ----
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        loss = outputs.loss
        logits = outputs.logits

        # Predicted class = index of the largest logit.
        _, predicted = torch.max(logits, dim=1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Loss is averaged over batches, accuracy over samples.
    train_loss = total_loss / len(train_dataloader)
    train_accuracy = total_correct / total_samples

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0
    val_correct = 0
    val_samples = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

            loss = outputs.loss
            logits = outputs.logits

            _, predicted = torch.max(logits, dim=1)
            val_correct += (predicted == labels).sum().item()
            val_samples += labels.size(0)

            val_loss += loss.item()

    val_loss /= len(val_dataloader)
    val_accuracy = val_correct / val_samples

    # NUM_EPOCHS is the constant that drives the loop (the former `num_epochs`
    # was never defined and raised a NameError here).
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}:")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
    print()

In [None]:
# Evaluate on the held-out test set
model.eval()
test_loss, test_correct, test_samples = 0, 0, 0

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in test_dataloader:
        labels = batch["label"].to(device)

        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            token_type_ids=batch["token_type_ids"].to(device),
            labels=labels,
        )

        # Predicted class = index of the largest logit.
        predicted = torch.max(outputs.logits, dim=1).indices
        test_correct += (predicted == labels).sum().item()
        test_samples += labels.size(0)

        test_loss += outputs.loss.item()

# Loss is averaged over batches, accuracy over samples.
test_loss = test_loss / len(test_dataloader)
test_accuracy = test_correct / test_samples

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Hand-written sample review, assembled from a title part, the ' |AND| ' separator
# and a comment part.
text = (
    'TITRE: JE SUIS PAS CONTENT !!! '
    + ' |AND| '
    + 'COMMENT: VOTRE PRODUIT EST NUL'
)

In [None]:
        # NOTE(review): stray fragment. These lines repeat the loop body of
        # convert_data_to_examples; they are indented without any enclosing block
        # and use names (row, input_column, target_column, examples) that only
        # exist inside that function, so this cell cannot run on its own.
        # Presumably a leftover paste; confirm and delete the cell.
        guid = None
        text_a = row[input_column]
        text_b = None
        label = row[target_column]
        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))