In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import os
import re
import requests
from dotenv import load_dotenv



In [15]:
# Location of the cleaned general-reviews CSV, relative to the notebook's working directory.
DATA_PATH = "../data/cleaned/cleaned_reviews_general.csv"

# Fail fast when the file is missing. Printing and continuing would leave `df`
# undefined on a fresh kernel (or stale from an earlier run), so every later
# cell would either crash with a confusing error or silently use old data.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Error: File not found at {DATA_PATH}")

# 'utf-8-sig' decodes UTF-8 and skips a leading byte-order mark if one is present.
df = pd.read_csv(DATA_PATH, encoding='utf-8-sig')
print(f"Successfully loaded {len(df):,} reviews")

Successfully loaded 542 reviews


In [16]:
print("MISSING VALUES")

# Tally missing entries per column. NaN always counts as missing; in the
# 'rating' column a value of 0 is counted as missing as well.
missing_counts = {
    name: df[name].isna().sum() + ((df[name] == 0).sum() if name == 'rating' else 0)
    for name in df.columns
}

# Counts indexed by column name, and the share of rows they represent.
missing = pd.Series(missing_counts)
missing_percent = (missing / len(df)) * 100

# One row per column: its name, missing count and missing percentage.
missing_df = pd.DataFrame(
    dict(
        Column=missing.index,
        Missing=missing.values,
        Percent=missing_percent.values,
    )
)

print(missing_df.to_string(index=False))

MISSING VALUES
           Column  Missing  Percent
           rating        0      0.0
        sentiment        0      0.0
    comment_clean        0      0.0
sentiment_encoded        0      0.0


## Drop empty comments

In [17]:
# Keep only rows whose cleaned comment is present and not blank:
# first drop NaN comments, then drop comments that are empty once stripped.
df = (
    df
    .dropna(subset=['comment_clean'])
    .loc[lambda frame: frame['comment_clean'].str.strip() != '']
)

In [18]:
print("MISSING VALUES")

# Re-run the missing-value tally on the current `df`.
# NaN counts as missing everywhere; for 'rating', a 0 is counted as missing too.
missing_counts = {}
for column in df.columns:
    values = df[column]
    n_missing = values.isna().sum()
    if column == 'rating':
        n_missing = n_missing + (values == 0).sum()
    missing_counts[column] = n_missing

# Absolute counts per column and the percentage of rows they represent.
missing = pd.Series(missing_counts)
missing_percent = (missing / len(df)) * 100

# Tabular summary: column name, missing count, missing percentage.
missing_df = pd.DataFrame(
    {
        'Column': missing.index,
        'Missing': missing.values,
        'Percent': missing_percent.values,
    }
)

print(missing_df.to_string(index=False))

MISSING VALUES
           Column  Missing  Percent
           rating        0      0.0
        sentiment        0      0.0
    comment_clean        0      0.0
sentiment_encoded        0      0.0


In [19]:
# Build the fine-tuning frame: keep the cleaned comment and its encoded
# sentiment, rename them to 'text' / 'label', and drop rows with any NaN.
dfrob = (
    df[['comment_clean', 'sentiment_encoded']]
    .rename(columns={'comment_clean': 'text', 'sentiment_encoded': 'label'})
    .dropna()
)


In [20]:
# Write the text/label frame to disk without the index column.
dfrob.to_csv(
    path_or_buf="../data/cleaned/finetuning_reviews_general.csv",
    index=False,
)

## NOTE : pour seulement 542 commentaires, il est souvent plus efficace d’utiliser DistilRoBERTa ou de fine-tuner uniquement la tête de classification, afin d’éviter le surapprentissage et de réduire le temps d’entraînement.

In [21]:
import os
import pandas as pd
# Reload the exported text/label dataset from disk. Note that this rebinds
# both `DATA_PATH` and `df` for the rest of the notebook.
DATA_PATH = "../data/cleaned/finetuning_reviews_general.csv"

# Fail fast when the file is missing. Printing and continuing would let the
# label statistics below run against whatever `df` is already in the kernel,
# which is a different frame from the one this cell is meant to load.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Error: File not found at {DATA_PATH}")

# 'utf-8-sig' decodes UTF-8 and skips a leading byte-order mark if one is present.
df = pd.read_csv(DATA_PATH, encoding='utf-8-sig')
print(f"Successfully loaded {len(df):,} reviews")

print("Distribution des labels :")
print(df['label'].value_counts())        # absolute count per class
print(df['label'].value_counts(normalize=True) * 100)  # share of each class, in percent


Successfully loaded 542 reviews
Distribution des labels :
label
2    375
0    118
1     49
Name: count, dtype: int64
label
2    69.188192
0    21.771218
1     9.040590
Name: proportion, dtype: float64


## Split data into train and test

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os


# Make sure the output folder for the split files exists.
os.makedirs("../data/cleaned/finetuning-splits", exist_ok=True)

# Inputs are the 'text' column, targets are the 'label' column.
X, y = df["text"], df["label"]

# 80/20 split with a fixed seed, stratified on the label column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.20,
    random_state=42,
)

# Reassemble each split into a two-column DataFrame.
train_df = pd.DataFrame(dict(text=X_train, label=y_train))
test_df = pd.DataFrame(dict(text=X_test, label=y_test))

# Write both splits to CSV (no index column, UTF-8 with BOM).
train_path = "../data/cleaned/finetuning-splits/train_set.csv"
test_path = "../data/cleaned/finetuning-splits/test_set.csv"
for split_frame, split_path in ((train_df, train_path), (test_df, test_path)):
    split_frame.to_csv(split_path, index=False, encoding='utf-8-sig')

# Report the split sizes.
print(
    "SPLITTING TERMINÉ",
    f"Train set : {train_df.shape}",
    f"Test set  : {test_df.shape}\n",
    sep="\n",
)

# Report the class counts in each split.
print("Répartition des classes – Train :")
print(train_df["label"].value_counts())
print("\nRépartition des classes – Test :")
print(test_df["label"].value_counts())


SPLITTING TERMINÉ
Train set : (433, 2)
Test set  : (109, 2)

Répartition des classes – Train :
label
2    300
0     94
1     39
Name: count, dtype: int64

Répartition des classes – Test :
label
2    75
0    24
1    10
Name: count, dtype: int64
