In [26]:
# imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path

In [27]:
# constants
# Project root: the working directory when it contains a DATA folder,
# otherwise its parent directory.
_cwd = Path.cwd()
BASE_DIR = _cwd if (_cwd / "DATA").exists() else _cwd.parent

# Every input/output file of this notebook lives under <BASE_DIR>/DATA
_data_dir = BASE_DIR / "DATA"
INPUT_CSV = _data_dir / "cleaned" / "airline_cleaned.csv"
OUTPUT_CSV = _data_dir / "with_sentiments" / "airline_cleaned_with_sentiment.csv"

# Name of the CSV column holding the text to score, and how many texts go
# through the model per forward pass
TEXT_COLUMN = "content"
BATCH_SIZE = 32

# Hugging Face model identifier of the sentiment classifier
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Create the output folder up front (no error if it already exists)
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)

In [28]:
from transformers import logging
# Only show error-level messages from the transformers library
logging.set_verbosity_error()

# Read the cleaned reviews into a DataFrame
df = pd.read_csv(INPUT_CSV)

# Run on the GPU when torch can see one, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fetch the pretrained tokenizer and classifier for MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Move the classifier to the chosen device and switch it to evaluation mode
model = model.to(device)
model.eval()

# Build the list of input strings: missing values become "" and every
# remaining value is cast to str so the tokenizer only receives strings
text_series = df[TEXT_COLUMN].fillna("")
texts = text_series.astype(str).tolist()

In [35]:
# Score every text with the classifier. The score of a text is
# P(positive) - P(negative), so it lies in [-1, 1].

# One score per entry of `texts`, in the same order
sentiment_scores = []

# Resolve the class indices from the model config instead of hard-coding
# them, so a checkpoint with a different label order is still scored
# correctly. If the config does not name its labels "negative"/"positive",
# fall back to 0 (negative) and 2 (positive).
label_to_index = {
    str(label).lower(): int(index)
    for label, index in (getattr(model.config, "label2id", None) or {}).items()
}
negative_index = label_to_index.get("negative", 0)
positive_index = label_to_index.get("positive", 2)

# Perform sentiment analysis in batches; no_grad() disables gradient
# tracking, which is not needed for inference
with torch.no_grad():
    for i in range(0, len(texts), BATCH_SIZE):

        # Get the current batch of texts
        batch = texts[i:i + BATCH_SIZE]

        # Tokenize the batch: pad to the longest text in the batch and cut
        # anything longer than 512 tokens
        tokens = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # Get the model's output logits and convert to probabilities
        logits = model(**tokens).logits
        probs = torch.nn.functional.softmax(logits, dim=-1).cpu()

        # Sentiment score = positive probability minus negative probability
        scores = probs[:, positive_index] - probs[:, negative_index]

        # Append the scores to the list
        sentiment_scores.extend(scores.tolist())

# Guard against stale notebook state: `texts` must have been built from the
# current `df`, otherwise the scores would not line up with the rows
if len(sentiment_scores) != len(df):
    raise ValueError(
        f"Got {len(sentiment_scores)} sentiment scores for {len(df)} rows; "
        "re-run the cell that builds `texts` from `df`."
    )

# Add the sentiment scores to the df
df["sentiment_score"] = sentiment_scores

# Save the updated df to a new CSV file
df.to_csv(OUTPUT_CSV, index=False)

print("Data w/ sentiment scores was successfully saved!")
print(df[["sentiment_score"]].head())


Data w/ sentiment scores was successfully saved!
   sentiment_score
0         0.326550
1         0.947232
2         0.946018
3         0.922577
4        -0.591914


In [36]:
# Flag reviews whose text sentiment disagrees with the numeric rating
SENT_COL = "sentiment_score"
RATING_COL = "overall_rating"
MISMATCH_COL = "mismatch"

# Thresholds used by the mismatch rule below
# NOTE(review): the rating cut-offs look like they assume a 1-10 scale; confirm
POSITIVE_SENTIMENT_MIN = 0.05
NEGATIVE_SENTIMENT_MAX = -0.05
LOW_RATING_MAX = 4
HIGH_RATING_MIN = 8

# Force both columns to numeric; values that cannot be parsed become NaN
for numeric_col in (SENT_COL, RATING_COL):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

sentiment = df[SENT_COL]
rating = df[RATING_COL]

# Positive text with a low rating, or negative text with a high rating.
# Comparisons against NaN are False, so rows with a missing value are never flagged.
pos_mismatch = sentiment.gt(POSITIVE_SENTIMENT_MIN) & rating.le(LOW_RATING_MAX)
neg_mismatch = sentiment.lt(NEGATIVE_SENTIMENT_MAX) & rating.ge(HIGH_RATING_MIN)

# Store the indicator as 0/1
is_mismatch = pos_mismatch | neg_mismatch
df[MISMATCH_COL] = is_mismatch.astype(int)

# Write the frame with the new column to its own folder, creating it if needed
OUTPUT_PATH = BASE_DIR / "DATA" / "with_mismatch" / "airline_with_mismatch.csv"
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

# Summary: number of flagged rows and their share of all rows, in percent
mismatch_total = df[MISMATCH_COL].sum()
mismatch_rate_pct = round(df[MISMATCH_COL].mean() * 100, 2)

print("Data w/ mismatch indicator was successfully saved!")
print("Total mismatches:", mismatch_total)
print("Mismatch rate:", mismatch_rate_pct, "%")


Data w/ mismatch indicator was successfully saved!
Total mismatches: 2399
Mismatch rate: 6.51 %
