<a href="https://colab.research.google.com/github/ernaGit14/Fine-tune-model-for-Sentiment-analysis-3-class-english-text/blob/main/Fine_Tune_model_for_english_text_3_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets scikit-learn accelerate evaluate

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# 1) Load the labelled comments (semicolon-separated CSV)
df = pd.read_csv("data_comment with label.csv", sep=';')

# 2) Map string labels to integer class ids, and build the inverse lookup
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}

df["label_id"] = df["label"].map(label2id)

# Series.map leaves NaN for every value that is not a key of label2id
# (including empty cells). Fail fast with a clear message instead of handing
# NaN / float labels to the stratified split and the model.
unmapped_labels = df.loc[df["label_id"].isna(), "label"].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected values in 'label' column: {unmapped_labels}. "
        f"Expected one of: {list(label2id)}"
    )
df["label_id"] = df["label_id"].astype(int)

# 3) Split train / test (80/20, stratified on the class id, fixed seed)
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label_id"], random_state=42)

# 4) Convert to HuggingFace Dataset
# preserve_index=False: after the shuffle the pandas index is no longer a
# default range; without this flag it would be stored as an extra
# "__index_level_0__" column in each Dataset.
train_ds = Dataset.from_pandas(train_df[["text", "label_id"]], preserve_index=False)
test_ds = Dataset.from_pandas(test_df[["text", "label_id"]], preserve_index=False)

datasets = DatasetDict({
    "train": train_ds,
    "test": test_ds
})

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hub checkpoint used as the starting point for fine-tuning.
model_name = "j-hartmann/sentiment-roberta-large-english-3-classes"

# Sequence-classification model configured with this notebook's label mapping.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=3,
)

# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
def tokenize_fn(batch):
    """Tokenize the "text" field of a batch, truncating to at most 128 tokens.

    No padding is applied here on purpose: the Trainer is given a
    DataCollatorWithPadding, which pads each batch only up to its own longest
    sequence. Padding every example to max_length at this stage would make
    that dynamic padding a no-op and spend compute on pad tokens.

    Args:
        batch: mapping with a "text" entry holding the strings to tokenize.

    Returns:
        The tokenizer output for the batch (un-padded, truncated sequences).
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=128
    )

# Tokenize both splits in batches, then rename the target column:
# Trainer expects the label column to be called "labels".
tokenized_datasets = (
    datasets
    .map(tokenize_fn, batched=True)
    .rename_column("label_id", "labels")
)

# Expose only the model inputs and the labels, as torch tensors.
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

In [None]:
import evaluate
import numpy as np

# Metric objects reused on every evaluation pass.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair.

    Returns:
        dict with the keys "accuracy" and "f1_macro".
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(predictions=predicted_ids, references=labels, average="macro")

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1_macro": f1_result["f1"],
    }

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Collator that pads each batch with the tokenizer when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./sentiment-roberta-finetuned",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # best macro-F1 when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # Logging: every 50 steps, no external experiment tracker (e.g. wandb).
    logging_steps=50,
    report_to="none",
)

# The "test" split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
import os
# NOTE(review): presumably meant to switch off Weights & Biases logging — confirm.
# This runs after the Trainer has already been created, and the
# TrainingArguments above already pass report_to="none".
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# Run fine-tuning with the Trainer configured earlier in this notebook.
trainer.train()


In [None]:
# Evaluate on the eval_dataset passed to the Trainer (the "test" split);
# the reported metrics come from compute_metrics.
trainer.evaluate()

In [None]:
# Write the model and the tokenizer to the same directory, so the inference
# pipeline later in this notebook can load both from that one path.
trainer.save_model("./sentiment-roberta-finetuned")
tokenizer.save_pretrained("./sentiment-roberta-finetuned")


Now we use the new model produced by the fine-tuning step above.


In [None]:
from transformers import pipeline

# Load the fine-tuned model and its tokenizer back from disk as an inference
# pipeline.
clf = pipeline( #clf is model classifier
    "text-classification",
    model="./sentiment-roberta-finetuned",
    tokenizer="./sentiment-roberta-finetuned",
    # Return the score of every class for each input. `top_k=None` is the
    # replacement for the deprecated `return_all_scores=True`; look scores up
    # by their "label" key rather than by position.
    top_k=None
)

# Quick smoke test on a single sentence.
print(clf("The product is amazing, I really love it!"))


Read the new data from `data_comment.csv` and load it into a DataFrame.

In [None]:
# Name of the CSV column that holds the text to classify.
text_column = "comment"

# Comments to score, read from a semicolon-separated CSV.
df_data = pd.read_csv("data_comment.csv", sep=';')

In [None]:
# Stop early if the configured text column is missing from the loaded CSV.
if text_column not in df_data.columns:
    # Report the columns of df_data (the frame being checked), not those of
    # the training frame `df`.
    raise ValueError(f"Comment coloum '{text_column}' is not found in your CSV file. List of data coloum : {list(df_data.columns)}")

# Missing comments become empty strings so that every row reaches the
# classifier as a str.
texts = df_data[text_column].fillna("").astype(str).tolist()

In [None]:
# ---- Score every comment with the fine-tuned classifier ----
results = clf(texts, batch_size=32, truncation=True)

pred_labels, max_scores = [], []
neg_scores, neu_scores, pos_scores = [], [], []

for class_scores in results:
    # class_scores holds one {'label': ..., 'score': ...} dict per class.
    scores_by_label = {entry["label"]: entry["score"] for entry in class_scores}

    # The predicted class is the entry with the highest score.
    top_entry = max(class_scores, key=lambda entry: entry["score"])
    pred_labels.append(top_entry["label"])     # 'negative' / 'neutral' / 'positive'
    max_scores.append(top_entry["score"])

    # Per-class scores; None when a class is absent from the output.
    neg_scores.append(scores_by_label.get("negative"))
    neu_scores.append(scores_by_label.get("neutral"))
    pos_scores.append(scores_by_label.get("positive"))

# ---- Attach the predictions to the dataframe (column order preserved) ----
new_columns = {
    "sentiment_label": pred_labels,          # negative / neutral / positive
    "sentiment_confidence": max_scores,
    "score_negative": neg_scores,
    "score_neutral": neu_scores,
    "score_positive": pos_scores,
}
for column_name, column_values in new_columns.items():
    df_data[column_name] = column_values

# ---- Write the enriched dataframe to a new CSV file (without the index) ----
OUTPUT_CSV_model2 = "reviews_with_sentiment_model2.csv"
df_data.to_csv(OUTPUT_CSV_model2, index=False)
print(f"Done! Saved with sentiment to: {OUTPUT_CSV_model2}")

# ==== PIE CHART ====
# matplotlib was never imported in this notebook, so `plt` raised a NameError
# on a fresh kernel; import it here, next to its only use.
import matplotlib.pyplot as plt

# Number of comments per predicted sentiment label.
counts = df_data["sentiment_label"].value_counts()

fig, ax = plt.subplots()
ax.pie(
    counts,
    labels=[lbl.upper() for lbl in counts.index],  # POSITIVE / NEGATIVE / NEUTRAL
    autopct="%1.1f%%"
)
ax.set_title("Sentiment Distribution")
plt.show()