In [None]:
# Upload the lexicon JSON file to the Colab runtime.
# NOTE(review): the loading cell further down opens "kn-lexicons.json", so the
# uploaded file presumably has to carry that name — confirm.

from google.colab import files
uploaded = files.upload()  # Select the lexicon JSON file from your local system

In [None]:
# List the current working directory to see which files are present.
import os
print(os.listdir())

['.config', 'preprocessed_lexicon.csv', 'drive', 'sample_data']


In [None]:
# Confirm that the dataset was uploaded to Colab by loading it and peeking at it.

import json

# Parse the lexicon JSON file into `lexicon`.
with open("kn-lexicons.json", mode="r", encoding="utf-8") as lexicon_file:
    lexicon = json.loads(lexicon_file.read())

# Show the first ten (word, score) pairs.
first_entries = list(lexicon.items())[:10]
print(first_entries)

[('ತ್ಯಜಿಸಿ', -1), ('ಕೈಬಿಡಲಾಗಿದೆ', -1), ('ಕೈಬಿಡುವವನು', -1), ('ಪರಿತ್ಯಕ್ತರು', -1), ('ತ್ಯಜಿಸುವುದು', -1), ('ಪರಿತ್ಯಾಗ', -1), ('ತ್ಯಜಿಸುತ್ತದೆ', -1), ('ಅಪಹರಿಸಲಾಗಿದೆ', -1), ('ಅಪಹರಣ', -1), ('ಅಪಹರಣಗಳು', -1)]


In [None]:
# Preprocess the lexicon: convert it to a pandas DataFrame, remove neutral entries
# and duplicate rows, then save the cleaned dataset as CSV.

import pandas as pd

# One row per lexicon entry: the word and its sentiment score.
lexicon_rows = list(lexicon.items())
df = pd.DataFrame(lexicon_rows, columns=["Word", "Sentiment"])

# Keep only rows whose sentiment is not 0 (neutral), then drop duplicate rows.
is_not_neutral = df["Sentiment"] != 0
df = df.loc[is_not_neutral].drop_duplicates()

# Write the cleaned table to disk without the index column.
df.to_csv("preprocessed_lexicon.csv", index=False)

# Preview the cleaned dataset
print(df)

               Word  Sentiment
0           ತ್ಯಜಿಸಿ         -1
1       ಕೈಬಿಡಲಾಗಿದೆ         -1
2        ಕೈಬಿಡುವವನು         -1
3       ಪರಿತ್ಯಕ್ತರು         -1
4       ತ್ಯಜಿಸುವುದು         -1
...             ...        ...
6969      ದೃಢನಿಶ್ಚಯ          1
6970         ದೃಢವಾದ          1
6971  ಡ್ರಾಪ್-ಔಟ್ಗಳು         -1
6972   ಡ್ರಾಪ್ಔಟ್ಗಳು         -1
6973  ಮನರಂಜಿಸುತ್ತದೆ          1

[6973 rows x 2 columns]


In [None]:
# Count how many entries carry each sentiment label.
df['Sentiment'].value_counts()

# Recorded result:
# -1 (Negative Sentiment): 4233 entries
#  1 (Positive Sentiment): 2740 entries
# The dataset is therefore imbalanced (roughly 61% negative, 39% positive).

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
-1,4233
1,2740


In [None]:
# Download the preprocessed CSV from the Colab runtime.

from google.colab import files
files.download("preprocessed_lexicon.csv")  # Download the preprocessed file

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Install libraries for NLP and deep learning:
# transformers: Pre-trained models like XLM-RoBERTa, IndicBERT
# datasets: Efficient dataset handling for NLP
# torch: Core library for deep learning with PyTorch
# NOTE(review): no versions are pinned, so a fresh run installs whatever is current;
# consider pinning the versions this notebook was validated with.
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# Load the pre-trained tokenizers and sequence-classification models.
#
# "AutoTokenizer" selects the tokenizer that matches each checkpoint.
# "AutoModelForSequenceClassification" loads each model with a classification head
# for 2 labels (positive and negative).
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# XLM-RoBERTa
xlm_checkpoint = "xlm-roberta-base"
xlm_tokenizer = AutoTokenizer.from_pretrained(xlm_checkpoint)
xlm_model = AutoModelForSequenceClassification.from_pretrained(xlm_checkpoint, num_labels=2)

# IndicBERT
indic_checkpoint = "ai4bharat/indic-bert"
indic_tokenizer = AutoTokenizer.from_pretrained(indic_checkpoint)
indic_model = AutoModelForSequenceClassification.from_pretrained(indic_checkpoint, num_labels=2)

#took 30s to execute

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Imports for fine-tuning the model.
#
# AdamW is taken from torch.optim: `transformers.AdamW` is deprecated and is not
# importable from recent transformers releases, so importing it from there breaks
# on an unpinned install.
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch

In [None]:
# Read the preprocessed CSV saved earlier back into DataFrame `df`
# (it was written with the columns "Word" and "Sentiment").
df = pd.read_csv("preprocessed_lexicon.csv")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the lexicon for testing; the fixed random_state makes the split
# reproducible.
words = df["Word"]            # feature: text data
sentiments = df["Sentiment"]  # target: sentiment labels
train_texts, test_texts, train_labels, test_labels = train_test_split(
    words, sentiments, test_size=0.2, random_state=42
)

In [None]:
# Tokenize both splits for XLM-RoBERTa with the same settings.
xlm_tokenize_options = {"truncation": True, "padding": True, "max_length": 128}
xlm_train_encodings = xlm_tokenizer(list(train_texts), **xlm_tokenize_options)
xlm_test_encodings = xlm_tokenizer(list(test_texts), **xlm_tokenize_options)

In [None]:
# Import the PyTorch library.

# Define a custom dataset class:
# "SentimentDataset" inherits from PyTorch's Dataset class to handle input data.
# - `__init__`: Initializes the dataset with encoded inputs (tokenized text) and labels (sentiments).
# - `__len__`: Returns the number of samples in the dataset.
# - `__getitem__`: Retrieves a single data sample (input and label) as PyTorch tensors.

# Create datasets for XLM-RoBERTa:
# - xlm_train_dataset: Prepares the training data (tokenized inputs and labels) for model training.
# - xlm_test_dataset: Prepares the testing data for evaluation.
# This structure ensures compatibility with PyTorch's DataLoader for efficient data batching and iteration.
import torch

# Define a custom dataset class
class SentimentDataset(torch.utils.data.Dataset):
    """Pair tokenizer encodings with sentiment labels for use with a DataLoader.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence that
            can be indexed by sample position.
        labels: Sequence of labels that can be indexed by sample position; a list
            or a 1-D tensor both work.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``, both as tensors.

        ``torch.as_tensor`` is used rather than ``torch.tensor`` so that values which
        are already tensors are passed through instead of being copy-constructed;
        copy-constructing makes PyTorch emit a UserWarning for every item fetched.
        """
        features = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        return features, torch.as_tensor(self.labels[idx])

# Create datasets for XLM-RoBERTa
# NOTE: at this point the labels are still the raw lexicon values (-1 / 1); the
# datasets are re-created further down once -1 has been remapped to 0.
xlm_train_dataset = SentimentDataset(xlm_train_encodings, list(train_labels))
xlm_test_dataset = SentimentDataset(xlm_test_encodings, list(test_labels))

In [None]:
from torch.utils.data import DataLoader

# DataLoaders for XLM-RoBERTa
# Wrap each dataset so that it is served in batches for training and testing.
# `batch_size=16`: each batch holds 16 samples.
# `shuffle=True` randomizes the order of the training samples; no `shuffle`
# argument is passed for the test loader.
xlm_train_loader = DataLoader(xlm_train_dataset, batch_size=16, shuffle=True)
xlm_test_loader = DataLoader(xlm_test_dataset, batch_size=16)

In [None]:
# Example: look at the first batch produced by the XLM-RoBERTa training DataLoader.
first_batch = next(iter(xlm_train_loader), None)
if first_batch is not None:
    print(first_batch)

[{'input_ids': tensor([[     0,   3663,  99019,  14600,  20978,   3461, 206335,      2,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1],
        [     0,   8348,   1508,  14406,      2,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1],
        [     0, 204366,  32370,      2,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1],
        [     0,  96962,   4811,  37285,  94506,  68230,      2,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1],
        [     0,  17026, 123244,   8985,  14895,      2,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1],
        [     0,      6,  19041,   3943, 146635,      2,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1],
       

In [None]:
# Convert the split labels to torch.long tensors and replace -1 with 0, so that the
# two classes are encoded as 0 and 1.
#
# The conversion accepts either the pandas Series produced by the train/test split or
# an already-converted tensor, which keeps this cell safe to re-run: calling `.values`
# on a tensor (as a second execution otherwise would) does not yield an array.
def _as_label_tensor(labels):
    """Return `labels` as a torch.long tensor with every -1 replaced by 0."""
    if isinstance(labels, torch.Tensor):
        as_long = labels.to(torch.long)
    else:
        as_long = torch.tensor(labels.values, dtype=torch.long)
    return torch.where(as_long == -1, torch.zeros_like(as_long), as_long)


train_labels = _as_label_tensor(train_labels)
test_labels = _as_label_tensor(test_labels)

# Check the labels
print("Updated train labels:", torch.unique(train_labels))
print("Updated test labels:", torch.unique(test_labels))

Updated train labels: tensor([0, 1])
Updated test labels: tensor([0, 1])


In [None]:
# Verify the replacement: list the distinct label values left in each split.
print("Train labels unique values:", train_labels.unique())
print("Test labels unique values:", test_labels.unique())

Train labels unique values: tensor([0, 1])
Test labels unique values: tensor([0, 1])


In [None]:
# Verify the label distribution: distinct values together with their counts per split.
print("Train label distribution:", torch.unique(train_labels, return_counts=True))
print("Test label distribution:", torch.unique(test_labels, return_counts=True))

Train label distribution: (tensor([0, 1]), tensor([3387, 2191]))
Test label distribution: (tensor([0, 1]), tensor([846, 549]))


In [None]:
# Print the label tensors of both splits.
print("Sample train labels:", train_labels)
print("Sample test labels:", test_labels)

Sample train labels: tensor([0, 1, 1,  ..., 0, 0, 0])
Sample test labels: tensor([0, 1, 1,  ..., 1, 1, 0])


In [None]:
# Double-check the labels before training.

print("Sample train labels:", train_labels[:10])  # First ten training labels
print("Sample test labels:", test_labels[:10])  # First ten test labels

# Check whether any -1 values remain: only 0 and 1 should be listed.
print("Unique train labels:", torch.unique(train_labels))
print("Unique test labels:", torch.unique(test_labels))

Sample train labels: tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 1])
Sample test labels: tensor([0, 1, 1, 0, 0, 0, 0, 1, 1, 0])
Unique train labels: tensor([0, 1])
Unique test labels: tensor([0, 1])


In [None]:
# Re-create datasets with updated labels
# (the datasets built earlier used the raw -1 / 1 labels).
xlm_train_dataset = SentimentDataset(xlm_train_encodings, train_labels)
xlm_test_dataset = SentimentDataset(xlm_test_encodings, test_labels)

# Re-create DataLoaders so that they serve the new datasets
xlm_train_loader = DataLoader(xlm_train_dataset, batch_size=16, shuffle=True)
xlm_test_loader = DataLoader(xlm_test_dataset, batch_size=16)

In [None]:
# Train the model
# Sets up the optimizer and training loop for fine-tuning the XLM-RoBERTa model on the sentiment dataset.

import torch

# Set up the optimizer
# torch.optim.AdamW is used in place of `transformers.AdamW`, which is deprecated and
# not importable from recent transformers releases. `eps` and `weight_decay` are passed
# explicitly with the defaults of the transformers implementation, because torch's own
# defaults differ (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(xlm_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Move the model to GPU (if available)
# The batches are moved to the same device inside the loop.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlm_model.to(device)

# Training loop
# Loops through the dataset for a fixed number of epochs to train the model.
xlm_model.train()  # Puts the model in training mode.

for epoch in range(20):  # Training for 20 epochs (can be adjusted based on requirement).
    total_loss = 0
    for batch in xlm_train_loader:  # Iterates through batches of training data.
        # Each batch is (features dict, labels); move both to the model's device.
        inputs = {key: val.to(device) for key, val in batch[0].items()}
        labels = batch[1].to(device)

        optimizer.zero_grad()  # Resets the gradients before each batch.

        # Forward pass
        # Passing `labels` makes the model return the loss alongside its outputs.
        outputs = xlm_model(**inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        # Computes the gradient of the loss w.r.t model weights.
        loss.backward()

        # Update weights
        # Adjusts the model weights based on the computed gradients.
        optimizer.step()

    # Print the mean batch loss after each epoch to monitor training progress.
    print(f"Epoch {epoch+1} completed. Loss: {total_loss/len(xlm_train_loader)}")

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, torch.tensor(self.labels[idx])


Epoch 1 completed. Loss: 0.6760229007220883
Epoch 2 completed. Loss: 0.6564601645257889
Epoch 3 completed. Loss: 0.6089460269940276
Epoch 4 completed. Loss: 0.5386704659479054
Epoch 5 completed. Loss: 0.48566757987939185
Epoch 6 completed. Loss: 0.4444302222064368
Epoch 7 completed. Loss: 0.4023923985766818
Epoch 8 completed. Loss: 0.36564779638133965
Epoch 9 completed. Loss: 0.3334772350496071
Epoch 10 completed. Loss: 0.3019749523703553
Epoch 11 completed. Loss: 0.26888898440922226
Epoch 12 completed. Loss: 0.23883455460704328
Epoch 13 completed. Loss: 0.21580784444438353
Epoch 14 completed. Loss: 0.2018416621335342
Epoch 15 completed. Loss: 0.17673835208917033
Epoch 16 completed. Loss: 0.15777171771028706
Epoch 17 completed. Loss: 0.4250879443349333
Epoch 18 completed. Loss: 0.3907622910027304
Epoch 19 completed. Loss: 0.14390806653229973
Epoch 20 completed. Loss: 0.1193664667001768


In [None]:
# Evaluate the fine-tuned model on the test split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import numpy as np

# Switch the model to evaluation mode
xlm_model.eval()

# Accumulators for predicted and reference labels across all batches
predictions = []
true_labels = []

# Gradients are not needed while scoring
with torch.no_grad():
    for features, targets in xlm_test_loader:  # Each batch is (features dict, labels)
        # Put the batch on the same device as the model (GPU or CPU)
        inputs = {name: tensor.to(device) for name, tensor in features.items()}
        labels = targets.to(device)

        # Predicted class = index of the largest logit
        predicted_class = xlm_model(**inputs).logits.argmax(dim=-1)

        # Collect this batch's predictions and reference labels
        predictions.extend(predicted_class.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Turn the accumulated lists into numpy arrays
predictions = np.array(predictions)
true_labels = np.array(true_labels)

# Compute the evaluation metrics
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

# Report the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, torch.tensor(self.labels[idx])


Accuracy: 0.8072
Precision: 0.7373
Recall: 0.7923
F1-Score: 0.7638


In [None]:
# Accuracy (0.8072): The model correctly predicted 80.72% of the test data. This is a good overall measure of how well the model is doing.

# Precision (0.7373): Among all the predictions the model made as positive, 73.73% were actually positive. This tells us how reliable the model is when it predicts a positive sentiment.

# Recall (0.7923): Out of all the actual positive cases, the model correctly identified 79.23%. This means the model is capturing most of the positive cases, but there is room for improvement.

# F1-Score (0.7638): This is a balanced score between precision and recall. It indicates that the model is performing decently, considering both false positives and false negatives.

In [None]:
# Save the trained model and tokenizer for XLM-RoBERTa
# Both are written to the same directory, which the inference cells below load from.
xlm_model.save_pretrained("./xlm_roberta_sentiment_model")
xlm_tokenizer.save_pretrained("./xlm_roberta_sentiment_model")

('./xlm_roberta_sentiment_model/tokenizer_config.json',
 './xlm_roberta_sentiment_model/special_tokens_map.json',
 './xlm_roberta_sentiment_model/sentencepiece.bpe.model',
 './xlm_roberta_sentiment_model/added_tokens.json',
 './xlm_roberta_sentiment_model/tokenizer.json')

In [None]:
# Zip the saved model directory and download the archive from the Colab runtime.
from google.colab import files
!zip -r xlm_roberta_sentiment_model.zip ./xlm_roberta_sentiment_model
files.download("xlm_roberta_sentiment_model.zip")

  adding: xlm_roberta_sentiment_model/ (stored 0%)
  adding: xlm_roberta_sentiment_model/sentencepiece.bpe.model (deflated 49%)
  adding: xlm_roberta_sentiment_model/config.json (deflated 51%)
  adding: xlm_roberta_sentiment_model/model.safetensors (deflated 32%)
  adding: xlm_roberta_sentiment_model/tokenizer.json (deflated 76%)
  adding: xlm_roberta_sentiment_model/tokenizer_config.json (deflated 77%)
  adding: xlm_roberta_sentiment_model/special_tokens_map.json (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# List the distinct label values in each split once more.
print("Unique labels in train dataset:", torch.unique(train_labels))
print("Unique labels in test dataset:", torch.unique(test_labels))

Unique labels in train dataset: tensor([0, 1])
Unique labels in test dataset: tensor([0, 1])


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the saved model and tokenizer
# (from the directory written by the save_pretrained calls above).
model_path = "./xlm_roberta_sentiment_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move the model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set the model to evaluation mode

# Function to predict sentiment
def predict_sentiment(text):
    """Classify `text` with the reloaded model and return its sentiment as a string.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        text: The text to classify.

    Returns:
        "Positive" when the predicted class index is 1, otherwise "Negative".
    """
    # Tokenize the text and move every tensor to the model's device
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradients; the predicted class is the index of the largest logit
    with torch.no_grad():
        class_id = model(**encoded).logits.argmax(dim=-1).item()

    # Map the class index to a sentiment name
    if class_id == 1:
        return "Positive"
    return "Negative"

# Test the function with some custom inputs: single words as well as full sentences.
# NOTE: the model was fine-tuned on individual lexicon entries (the "Word" column),
# so the multi-word sentences below are a different kind of input from its training data.
test_sentences = [
    "ನಾನು ಈ ಉತ್ಪನ್ನವನ್ನು ತುಂಬಾ ಇಷ್ಟಪಟ್ಟೆ",  # "I really liked this product."
    "ಈ ಸೇವೆ ತುಂಬಾ ಕಳಪೆ.",  # "This service is very bad."
    "ಅದ್ಭುತವಾದ ಅನುಭವ!",  # "Amazing experience!"
    "ಇದು ಸರಿ ಇಲ್ಲ.",  # "This is not okay."
    "ಪ್ರಸನ್ನ", #Prasanna
    "ಇಂದು ಶಾಲೆಗೆ ರಜೆ ಘೋಷಿಸಲಾಗಿದೆ ಮತ್ತು ನಾನು ಸಂತೋಷಗೊಂಡಿದ್ದೇನೆ. ಆದರೆ ಇದ್ದಕ್ಕಿದ್ದಂತೆ ಮಳೆ ಪ್ರಾರಂಭವಾಯಿತು ಮತ್ತು ಅದು ನನ್ನ ಆಟದ ಮನಸ್ಥಿತಿಯನ್ನು ಹಾಳುಮಾಡಿತು.", #Today is a holiday for school and I am happy. But suddenly it started raining and it ruined my mood for playing.
    "ಹಣೆಬರಹ", #Destiny
    "ಇದು ಭಯಾನಕ ಚಲನಚಿತ್ರವಾಗಿತ್ತು. ನನ್ನ ಸಮಯವನ್ನು ವ್ಯರ್ಥ ಮಾಡಿದ್ದಕ್ಕಾಗಿ ತುಂಬಾ ಧನ್ಯವಾದಗಳು.", #This was a terrible movie. Thank you very much for wasting my time.
    "ನಾನು ವಾಸಿಸುವ ಮನೆಯ ಬಗ್ಗೆ ತುಂಬಾ ದ್ವಂದ್ವಾರ್ಥವನ್ನು ಅನುಭವಿಸಿದೆ" #I felt very ambivalent about the house I lived in.
]

# Predict sentiment for each sentence and print it next to the input
for sentence in test_sentences:
    sentiment = predict_sentiment(sentence)
    print(f"Sentence: {sentence} -> Sentiment: {sentiment}")

Sentence: ನಾನು ಈ ಉತ್ಪನ್ನವನ್ನು ತುಂಬಾ ಇಷ್ಟಪಟ್ಟೆ -> Sentiment: Positive
Sentence: ಈ ಸೇವೆ ತುಂಬಾ ಕಳಪೆ. -> Sentiment: Negative
Sentence: ಅದ್ಭುತವಾದ ಅನುಭವ! -> Sentiment: Positive
Sentence: ಇದು ಸರಿ ಇಲ್ಲ. -> Sentiment: Negative
Sentence: ಪ್ರಸನ್ನ -> Sentiment: Positive
Sentence: ಇಂದು ಶಾಲೆಗೆ ರಜೆ ಘೋಷಿಸಲಾಗಿದೆ ಮತ್ತು ನಾನು ಸಂತೋಷಗೊಂಡಿದ್ದೇನೆ. ಆದರೆ ಇದ್ದಕ್ಕಿದ್ದಂತೆ ಮಳೆ ಪ್ರಾರಂಭವಾಯಿತು ಮತ್ತು ಅದು ನನ್ನ ಆಟದ ಮನಸ್ಥಿತಿಯನ್ನು ಹಾಳುಮಾಡಿತು. -> Sentiment: Negative
Sentence: ಹಣೆಬರಹ -> Sentiment: Negative
Sentence: ಇದು ಭಯಾನಕ ಚಲನಚಿತ್ರವಾಗಿತ್ತು. ನನ್ನ ಸಮಯವನ್ನು ವ್ಯರ್ಥ ಮಾಡಿದ್ದಕ್ಕಾಗಿ ತುಂಬಾ ಧನ್ಯವಾದಗಳು. -> Sentiment: Negative
Sentence: ನಾನು ವಾಸಿಸುವ ಮನೆಯ ಬಗ್ಗೆ ತುಂಬಾ ದ್ವಂದ್ವಾರ್ಥವನ್ನು ಅನುಭವಿಸಿದೆ -> Sentiment: Positive


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the saved model and tokenizer
# (repeats the loading steps of the previous inference cell, so this cell can be run
# without it once the model directory exists).
model_path = "./xlm_roberta_sentiment_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move the model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set the model to evaluation mode

# Function to predict sentiment
def predict_sentiment(text):
    """Classify `text` with the reloaded model and return its sentiment as a string.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        text: The text to classify.

    Returns:
        "Positive" when the predicted class index is 1, otherwise "Negative".
    """
    # Tokenize the text and move every tensor to the model's device
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradients; the predicted class is the index of the largest logit
    with torch.no_grad():
        class_id = model(**encoded).logits.argmax(dim=-1).item()

    # Map the class index to a sentiment name
    if class_id == 1:
        return "Positive"
    return "Negative"

# Interactive loop: read sentences from the user and print the predicted sentiment
# until the user types 'exit'.
while True:
    # Surrounding whitespace is stripped so that input such as "exit " still quits.
    user_input = input("Enter a sentence for sentiment analysis (or type 'exit' to quit): ").strip()

    if user_input.lower() == 'exit':
        print("Exiting the program.")
        break

    # Nothing to classify: prompt again instead of predicting on an empty string.
    if not user_input:
        continue

    # Predict sentiment for the input sentence
    sentiment = predict_sentiment(user_input)

    # Print the sentiment result
    print(f"Sentence: {user_input} -> Sentiment: {sentiment}")

Sentence: ಒಳ್ಳೆಯದು -> Sentiment: Positive
Sentence: ಮೂಡಲಕಟ್ಟೆ -> Sentiment: Negative


In [None]:
# for testing purpose
# ನಾನು ಇಂದು ತುಂಬಾ ಸಂತೋಷವಾಗಿದ್ದೇನೆ.
# ಈ ಚಿತ್ರ ಬಹಳವೇ ಅದ್ಭುತವಾಗಿದೆ.
# ಅವನು ನನಗೆ ಬಹುಮಾನ ನೀಡಿದಾಗ ನನಗೆ ತುಂಬಾ ಖುಷಿಯಾಯಿತು.
# ಈ ಭೋಜನ ಇಷ್ಟವಾಗಿದೆ.
# ನಾನು ನನ್ನ ಸ್ನೇಹಿತರೊಂದಿಗೆ ಉತ್ತಮ ಸಮಯ ಕಳೆಯುತ್ತಿದ್ದೇನೆ.
# ಆ ಪುಸ್ತಕವು ನನಗೆ ಬಹಳ ಇಷ್ಟವಾಗಿದೆ.
# ಇವತ್ತು ಹವಾಮಾನ ಬಹುಶಃ ಚೆನ್ನಾಗಿದ್ದು ನನಗೆ ಸಂತೋಷವಾಗಿದೆ.
# ಅವಳ ಮಾತುಗಳು ನನಗೆ ಪ್ರೇರಣೆಯಾದವು.
# ನಾನು ನನ್ನ ಪ್ರಪಂಚದಲ್ಲಿ ತುಂಬಾ ನಿರಂತರವಾಗಿ ಯಶಸ್ವಿಯಾಗಿದ್ದೇನೆ.
# ಅವನು ನನಗೆ ಸಿಹಿ ಹಿಗ್ಗಿಸಿದನು.

# ನಾನು ಇಂದು ತುಂಬಾ ದುಃಖಿತನಾಗಿದ್ದೇನೆ.
# ಅವನು ನನಗೆ ಕೊಟ್ಟ ಸಲಹೆ ಸರಿಯಾಗಿಲ್ಲ.
# ಇವತ್ತು ಹವಾಮಾನ ತುಂಬಾ ದುರಸ್ಥಿಯಾಗಿದೆ.
# ಈ ಚಿತ್ರ ನನಗೆ ಇಷ್ಟವಾಗುತ್ತಿಲ್ಲ.
# ನಾನು ನಿನ್ನೆ ಕೆಲಸದಲ್ಲಿ ಕಷ್ಟಪಟ್ಟಿದ್ದೇನೆ.
# ಅವಳಿಂದ ನಿರೀಕ್ಷಿಸಿದ ಸಹಾಯ ನನಗೆ ಲಭ್ಯವಾಗುತ್ತಿಲ್ಲ.
# ನಾನು ಎಲ್ಲೆಡೆಯಿಂದ ನೊಂದಿದ್ದೇನೆ.
# ಆ ವ್ಯಕ್ತಿ ನನಗೆ ತುಂಬಾ ಕೋಪವನ್ನು ತಂದನು.
# ಅವಳ ಅಭಿಪ್ರಾಯ ನನಗೆ ನೋವುಂಟು ಮಾಡಿತು.
# ನಾನು ಈ ದಿನ ಬೇರೆಯವರಿಂದ ಅಸಮಾಧಾನಗೊಂಡಿದ್ದೇನೆ.

SyntaxError: invalid syntax (<ipython-input-1-72d6d0dee807>, line 1)