In [None]:
%pip install transformers
%pip install torch
%pip install PyTorch
%pip install import_ipynb

In [3]:
import import_ipynb
import cleanData as datab

Empty DataFrame
Columns: [user_id, text]
Index: []


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [5]:
# Checkpoint name handed to `from_pretrained` for both the tokenizer and the model.
model_name = "tabularisai/robust-sentiment-analysis"
# Tokenizer and sequence-classification model for that checkpoint; both are
# read as globals by `predict_sentiment`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [6]:
# Class index -> human-readable label for the classifier's five output classes.
# Defined once at module level instead of being rebuilt on every prediction.
SENTIMENT_LABELS = {
    0: "Very Negative",
    1: "Negative",
    2: "Neutral",
    3: "Positive",
    4: "Very Positive"
}


def predict_sentiment(text):
    """Classify one piece of text into a five-level sentiment label.

    Relies on the notebook-level ``tokenizer`` and ``model`` globals.

    Parameters
    ----------
    text : str
        The text to classify. It is truncated to at most 512 tokens. Pass a
        single text per call: the prediction is extracted with ``.item()``,
        which only works on a one-element tensor.

    Returns
    -------
    str
        One of the values of ``SENTIMENT_LABELS``.

    Raises
    ------
    KeyError
        If the model predicts a class index that is not in ``SENTIMENT_LABELS``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Keep the input tensors on the same device as the model so the function
    # also works when the model has been moved to a GPU (no-op on CPU).
    inputs = inputs.to(model.device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
    return SENTIMENT_LABELS[predicted_class]


In [10]:
# Label every review text with the classifier (one model call per row), then
# store the labels next to the text and preview the first few rows.
predicted_labels = datab.dataset_df["text"].apply(predict_sentiment)
datab.dataset_df["sentiment"] = predicted_labels
preview = datab.dataset_df[["text", "sentiment"]].head()
print(preview)
 

                                                text      sentiment
0  Great staff always helps and always nice. Alwa...  Very Positive
1  After my ROTD  yesterday of a different Sweet ...        Neutral
2  Our family returned for breakfast again this w...       Positive
3  If I could give it a zero, I would. I order a ...  Very Negative
4  Id you haven't been to the Smoothie King cente...       Positive


In [18]:
# Write the frame to disk as JSON Lines (one record object per line).
datab.dataset_df.to_json("hugging_face_sentiment_analysis.json", orient="records", lines=True)


In [19]:
# Labels listed from most negative to most positive; each label's 1-based
# position in this list is its numeric score (1..5).
sentiment_score_map = {
    label: score
    for score, label in enumerate(
        ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"],
        start=1,
    )
}

# Translate the text labels into their numeric scores in a new column.
datab.dataset_df["sentiment_score"] = datab.dataset_df["sentiment"].map(sentiment_score_map)


In [21]:
# Write the frame to disk as JSON Lines (one record object per line).
datab.dataset_df.to_json("full_sentiment_data_with_scores.json", orient="records", lines=True)


In [35]:
from sklearn.metrics import accuracy_score

# Share of reviews whose predicted 1-5 score equals the star rating exactly.
accuracy = accuracy_score(
    y_true=datab.dataset_df["stars"].astype(int),
    y_pred=datab.dataset_df["sentiment_score"].astype(int),
)
print("Exact Match Accuracy:", accuracy)


Exact Match Accuracy: 0.47183465879457454


In [37]:
from sklearn.metrics import mean_absolute_error

# Average absolute gap between the star rating and the predicted score.
mae = mean_absolute_error(
    y_true=datab.dataset_df["stars"],
    y_pred=datab.dataset_df["sentiment_score"],
)
print(f"Mean Absolute Error: {mae:.4f}")


Mean Absolute Error: 0.6702


In [38]:
def convert_to_3_scale(score):
    """Collapse a 1-5 score onto a 3-point scale.

    1 and 2 become 1, 3 becomes 2, 4 and 5 become 3. Any other value
    (including non-integral numbers and NaN) yields ``None``.
    """
    if score == 3:
        return 2
    if score in (1, 2):
        return 1
    if score in (4, 5):
        return 3
    # Anything outside 1..5 has no 3-point equivalent.
    return None

# Derive a 3-point version of both the true rating and the predicted score,
# then save the enriched frame as JSON Lines.
for source_col, target_col in (
    ("stars", "stars_3_scale"),
    ("sentiment_score", "sentiment_score_3_scale"),
):
    datab.dataset_df[target_col] = datab.dataset_df[source_col].apply(convert_to_3_scale)
datab.dataset_df.to_json("updated_sentiment_data_3_scale.json", orient="records", lines=True)



In [39]:
from sklearn.metrics import accuracy_score

# Share of reviews whose 3-point prediction equals the 3-point star rating.
accuracy = accuracy_score(
    y_true=datab.dataset_df["stars_3_scale"].astype(int),
    y_pred=datab.dataset_df["sentiment_score_3_scale"].astype(int),
)
print("Exact Match Accuracy:", accuracy)


Exact Match Accuracy: 0.7091632379660278


In [40]:
from sklearn.metrics import mean_absolute_error

# Average absolute gap between rating and prediction on the 3-point scale.
mae = mean_absolute_error(
    y_true=datab.dataset_df["stars_3_scale"],
    y_pred=datab.dataset_df["sentiment_score_3_scale"],
)
print(f"Mean Absolute Error: {mae:.4f}")


Mean Absolute Error: 0.3119


In [36]:
# Per-review distance between the star rating and the predicted score,
# regardless of direction.
datab.dataset_df["abs_diff"] = (
    datab.dataset_df["stars"].sub(datab.dataset_df["sentiment_score"]).abs()
)

# Show ten rows with the rating, the prediction and their distance, then
# save the full frame as JSON Lines.
print(datab.dataset_df.loc[:, ["text","stars", "sentiment_score", "abs_diff"]].head(10))
datab.dataset_df.to_json("full_sentiment_data_with_scores_and_difference.json", orient="records", lines=True)



                                                text  stars  sentiment_score  \
0  Great staff always helps and always nice. Alwa...    5.0                5   
1  After my ROTD  yesterday of a different Sweet ...    4.0                3   
2  Our family returned for breakfast again this w...    5.0                4   
3  If I could give it a zero, I would. I order a ...    1.0                1   
4  Id you haven't been to the Smoothie King cente...    4.0                4   
5  Don't know what it is but If my tummy's feelin...    4.0                3   
6  Service and management terrible... After messi...    1.0                1   
7  Been to this location twice and will never go ...    1.0                3   
8  This is one of the busiest Chick fil A's I've ...    5.0                5   
9  Had the brisket sandwich with two sides and a ...    5.0                4   

   abs_diff  
0       0.0  
1       1.0  
2       1.0  
3       0.0  
4       0.0  
5       1.0  
6       0.0  
7      

In [34]:
# Keep only the reviews whose prediction is off by more than one point.
filtered_df = datab.dataset_df.loc[datab.dataset_df["abs_diff"].gt(1)]

# Write those mismatches out as JSON Lines (one record per line).
filtered_df.to_json("filtered_sentiment_mismatches.json", orient="records", lines=True)

