In [None]:
! pip install transformers

In [6]:
from transformers import pipeline

distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    return_all_scores=True
)

In [10]:
# English text
english_text = "I love this movie and I would watch it again and again!"
english_result = distilled_student_sentiment_classifier(english_text)

# Malay text
malay_text = "Saya suka filem ini dan saya akan menontonnya lagi dan lagi!"
malay_result = distilled_student_sentiment_classifier(malay_text)

# Japanese text
japanese_text = "私はこの映画が大好きで、何度も見ます！"
japanese_result = distilled_student_sentiment_classifier(japanese_text)

# Print the results
print("English sentiment analysis result:")
print(english_result)

print("\nMalay sentiment analysis result:")
print(malay_result)

print("\nJapanese sentiment analysis result:")
print(japanese_result)

English sentiment analysis result:
[[{'label': 'positive', 'score': 0.9754056930541992}, {'label': 'neutral', 'score': 0.01555436011403799}, {'label': 'negative', 'score': 0.009039935655891895}]]

Malay sentiment analysis result:
[[{'label': 'positive', 'score': 0.9760094285011292}, {'label': 'neutral', 'score': 0.018045149743556976}, {'label': 'negative', 'score': 0.0059454599395394325}]]

Japanese sentiment analysis result:
[[{'label': 'positive', 'score': 0.9342429041862488}, {'label': 'neutral', 'score': 0.040193185210227966}, {'label': 'negative', 'score': 0.0255639236420393}]]


In [3]:
# Step 1: Install necessary libraries
!pip install transformers

# Step 2: Import required libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import torch

# Step 3: Create the manual dataset
texts = [
    "I love this movie and I would watch it again and again!",
    "This film was terrible, I couldn't even finish it.",
    "Absolutely fantastic experience, highly recommend!",
    "Not worth the time, very boring and slow.",
    "The plot was interesting and the acting was great.",
    "Horrible movie, will never watch it again.",
    "A masterpiece, beautifully executed.",
    "Pretty average, nothing special.",
    "Terrible plot and bad acting, do not recommend.",
    "Enjoyed every moment of it, a must-watch!"
]
labels = [1, 0, 1, 0, 1, 0, 1, 0, 0, 1]

# Convert the texts and labels into a format suitable for fine-tuning
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Step 4: Initialize tokenizer and create dataset
tokenizer = AutoTokenizer.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student")
dataset = SimpleDataset(texts, labels, tokenizer)

# Split dataset into train and test
train_size = int(0.8 * len(dataset))
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])

# Step 5: Fine-Tune the Model
model = AutoModelForSequenceClassification.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student", num_labels=3)
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Step 6: Use the Fine-Tuned Model for Sentiment Analysis
fine_tuned_sentiment_classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, return_all_scores=True)

# Analyze sentiments in different languages
texts_to_analyze = [
    ("English", "I love this movie and I would watch it again and again!"),
    ("Malay", "Saya suka filem ini dan saya akan menontonnya lagi dan lagi!"),
    ("Japanese", "私はこの映画が大好きで、何度も見ます！")
]

for language, text in texts_to_analyze:
    result = fine_tuned_sentiment_classifier(text)
    print(f"\n{language} sentiment analysis result:")
    print(result)






Epoch,Training Loss,Validation Loss
1,No log,2.156434
2,No log,1.795476
3,No log,1.690278





English sentiment analysis result:
[[{'label': 'positive', 'score': 0.7180215120315552}, {'label': 'neutral', 'score': 0.17993322014808655}, {'label': 'negative', 'score': 0.10204528272151947}]]

Malay sentiment analysis result:
[[{'label': 'positive', 'score': 0.8495468497276306}, {'label': 'neutral', 'score': 0.11804182827472687}, {'label': 'negative', 'score': 0.03241141512989998}]]

Japanese sentiment analysis result:
[[{'label': 'positive', 'score': 0.7794512510299683}, {'label': 'neutral', 'score': 0.14067824184894562}, {'label': 'negative', 'score': 0.07987045496702194}]]
