In [1]:
import re
import textwrap

import pandas as pd
import spacy
import torch
from scipy.special import softmax
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### Load Models

In [2]:
# Load zero-shot classification model for topic labeling.
# `classifier` is later called with a sentence and the list of candidate topic labels.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Load spaCy model for sentence segmentation
nlp = spacy.load("en_core_web_md")
# NOTE(review): en_core_web_md presumably already ships a component that sets
# sentence boundaries; confirm that the extra sentencizer actually changes the
# segmentation produced by `doc.sents` before relying on it.
nlp.add_pipe("sentencizer")  # Add sentence boundary detector

# Load sentiment analysis model (3-class: positive, neutral, negative).
# `tokenizer` and `model` are module-level objects used by the sentiment-scoring function.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Device set to use cuda:0


### Load and Clean Data


In [3]:
# Read the review dataset and keep only usable review bodies:
#   1. drop rows whose ReviewBody is missing,
#   2. coerce the body to a whitespace-stripped string,
#   3. keep only bodies longer than 10 characters.
df = (
    pd.read_csv("AirlineReviews.csv")
    .dropna(subset=["ReviewBody"])
    .assign(ReviewBody=lambda frame: frame["ReviewBody"].astype(str).str.strip())
    .loc[lambda frame: frame["ReviewBody"].str.len() > 10]
)

### Define Topic Labels and Fallback Rules


In [4]:
# Candidate aspect categories handed to the zero-shot topic classifier.
# The order below is the order in which they are passed to the model.
labels = [
    "Flight Delay",
    "Seat Comfort",
    "In-flight Food and Drinks",
    "In-flight Entertainment",
    "Wi-Fi and Connectivity",
    "Baggage",
    "Boarding process at the gate",
    "Check-in",
    "Cabin Service",
    "Staff & Service Attitude",
    "Customer Service Response",
    "Compensation & Refunds",
    "Value for Money",
    "Overall Airline Experience",
    "Lounge Experience",
    "Loyalty Program or Elite Status",
    "Cultural Benchmark",
]

# Ordered keyword rules used when the zero-shot classifier is not confident.
# The first pattern that matches wins, so more specific topics are listed first.
# Patterns are compiled once at definition time and are written in lowercase
# because they are applied to lowercased text. Contractions accept both the
# typographic (’) and the plain (') apostrophe.
_FALLBACK_RULES = [
    (re.compile(r"\b(avios|loyalty|frequent flyer|status|tier points|elite member|executive club|member)\b"), "Loyalty Program or Elite Status"),
    (re.compile(r"\b(refund|voucher|compensation|reimbursement|money back|covered cost|receipt|compensate)\b"), "Compensation & Refunds"),
    (re.compile(r"\b(never again|last time|worst airline|national disgrace|not reliable|gone downhill|joke airline|abysmal|conclusion|overall|in summary)\b"), "Overall Airline Experience"),
    (re.compile(r"\b(boarding|gate|queue|line|group|shuttle|overbooked|chaotic|boarding pass|bumped|boarding delay|reallocated seats|gate change)\b"), "Boarding process at the gate"),
    (re.compile(r"\b(rude|polite|friendly|unhelpful|helpful|aggressive|shouted|respectful|ignored|not helpful|walked away|no one helped|they dont care|they don't care|indifferent staff|dismissive|condescending|sarcastic|snarky|cold|robotic|wouldn’t listen|wouldn't listen|wouldnt listen)\b"), "Staff & Service Attitude"),
    (re.compile(r"\b(lost luggage|baggage missing|bag was not|delayed bag|baggage lost|airtag|luggage still in|baggage didn[’']t arrive|reclaim)\b"), "Baggage"),
    (re.compile(r"\b(customer service|call center|couldn[’']t reach|no response|hung up|phone support|agent said|long hold|chatbot|no email reply|no one answers|excuses|apologize|complaint)\b"), "Customer Service Response"),
    (re.compile(r"\b(delayed|flight cancelled|connection missed|schedule change|late flight|gate closed|rebooked|replacement flight|technical issue|no crew)\b"), "Flight Delay"),
    (re.compile(r"\b(meal|food|beverage|catering|drink|snack|inedible|ran out|no food|lentil pie|beef cheeks|breakfast|tray service)\b"), "In-flight Food and Drinks"),
    # "tv" must be lowercase: the input is lowercased before matching.
    (re.compile(r"\b(screen|entertainment|media system|movie didn[’']t work|broken screen|tv|headphones)\b"), "In-flight Entertainment"),
    (re.compile(r"\b(legroom|recline|tight|seat comfort|uncomfortable|tray seat|middle seat|blocked middle|seat didn[’']t work|seat controls|cramped|no space|reclining|business class seat|seating)\b"), "Seat Comfort"),
    (re.compile(r"\b(wifi|wi-fi|internet|no signal|connection drop|onboard wifi|connectivity)\b"), "Wi-Fi and Connectivity"),
    (re.compile(r"\b(value for money|not worth it|expensive|waste|rip off|costly|ticket price|basic essentials|low-cost)\b"), "Value for Money"),
    (re.compile(r"\b(lounge|lounges|business lounge|airport lounge|vip lounge|lounge access|lounge experience|crowded lounge|no lounge access)\b"), "Lounge Experience"),
    (re.compile(r"\b(cabin crew|crew service|service poor|crew attitude|no explanation|service was ok|minimal service)\b"), "Cabin Service"),
    # The label must be spelled exactly like the "Check-in" entry of the topic list.
    (re.compile(r"\b(check-in|security|airport process|terminal|check in counter|kiosk|boarding card|passport control|line long|not checked in)\b"), "Check-in"),
]


def fallback_label(text):
    """Return a keyword-based topic label for ``text``, or ``None``.

    The text is lowercased and tested against the ordered fallback rules;
    the label of the first rule whose pattern matches is returned.

    Parameters
    ----------
    text : str
        Sentence to label.

    Returns
    -------
    str or None
        The matching topic label, or ``None`` when no rule matches.
    """
    s = text.lower()
    for pattern, label in _FALLBACK_RULES:
        if pattern.search(s):
            return label
    return None

### Sentiment Scoring with RoBERTa

In [5]:
def get_sentiment_roberta(text):
    """Classify the sentiment of ``text`` with the loaded RoBERTa model.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    tuple
        ``(label, polarity)`` where ``label`` is 'Negative', 'Neutral' or
        'Positive' and ``polarity`` is the winning class probability signed
        by the label: positive for 'Positive', negative for 'Negative' and
        ``0.0`` for 'Neutral'.
    """
    # Pass the limit explicitly: with truncation=True but no max_length, a
    # tokenizer whose config carries no model maximum does not truncate at
    # all, and an over-long input then fails inside the model.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output.logits[0].numpy()
    probs = softmax(scores)

    # Class names by output index, as assumed by this notebook. Named so that
    # it does not shadow the module-level topic `labels` list.
    sentiment_names = ['Negative', 'Neutral', 'Positive']
    label = sentiment_names[probs.argmax()]
    score = probs.max()

    # Convert label into polarity score
    if label == 'Positive':
        polarity = score
    elif label == 'Negative':
        polarity = -score
    else:
        polarity = 0.0

    return label, polarity


### Sentence Segmentation and Text Cleaning Functions

In [6]:
# Split a review into sentences using spaCy
def get_sentences(text):
    """Run the spaCy pipeline on ``text`` and return each sentence's stripped text as a list."""
    parsed = nlp(text)
    sentence_texts = []
    for span in parsed.sents:
        sentence_texts.append(span.text.strip())
    return sentence_texts

# Clean text by lowercasing and removing unwanted characters
def clean_text(text):
    """Lowercase ``text`` (coerced to ``str``) and drop disallowed characters.

    Only letters, digits, whitespace and the punctuation ``. , ! ? ' " ; - ( )``
    are kept; every other character is removed.
    """
    lowered = str(text).lower()
    return re.sub(r'[^a-zA-Z0-9\s\.,!?\'\";\-\(\)]', '', lowered)


### Process Each Review

In [9]:
# Target review indices to process (can be adjusted).
# These are positions in the filtered DataFrame (used with .iloc), not index labels.
target_indices = [79, 138, 732] #list(range(0, 11))
output_rows = []

for idx in target_indices:
    review = df["ReviewBody"].iloc[idx]
    print(f"========== Review #{idx+1} Summary ==========")
    print("**Original Review**")
    print(textwrap.fill(review, width=120))

    # ========= Sentence Segmentation =========
    sentences = get_sentences(review)
    # One tuple per kept sentence: (sentence, topic label, topic score, polarity)
    sentence_info = []

    for sent in sentences:
        if len(sent.strip()) < 5:
            continue  # Skip very short sentences

        # ========= Sentiment Analysis =========
        # Both models receive the raw sentence; only the polarity is used below.
        sentiment_label, polarity = get_sentiment_roberta(sent)

        # ========= Topic Classification =========
        classification = classifier(sent, labels)
        best_label = classification['labels'][0]
        best_score = classification['scores'][0]

        # ========= Apply Fallback Rule if Confidence < 0.3 =========
        if best_score < 0.3:
            fb_label = fallback_label(sent)
            if fb_label:
                best_label = fb_label
                # The +0.5 marks that a fallback rule was applied; it also lifts
                # the sentence in the score ranking used for the summary.
                best_score += 0.5

        sentence_info.append((sent, best_label, best_score, polarity))

    # ========= Select Most Emotional Sentence =========
    # Strengths are compared after rounding to one decimal. On a tie the later
    # sentence wins only if its topic is an overall/value judgement.
    most_emotional = None
    max_emotion_score = 0

    for item in sentence_info:
        emotion_strength = abs(item[3])

        if round(emotion_strength, 1) > round(max_emotion_score, 1):
            max_emotion_score = emotion_strength
            most_emotional = item
        elif round(emotion_strength, 1) == round(max_emotion_score, 1):
            if item[1] in ("Overall Airline Experience", "Value for Money"):
                most_emotional = item

    # ========= Select Topic Summary Sentences (Excluding Emotional One) =========
    sorted_by_score = sorted(sentence_info, key=lambda x: x[2], reverse=True)
    selected_summary = {}
    emotion_sent_text = most_emotional[0] if most_emotional else None

    # First pass: best-scoring sentence per topic, confident topics only.
    for sent, topic_label, score, polarity in sorted_by_score:
        if sent == emotion_sent_text:
            continue  # Skip emotional sentence
        if topic_label not in selected_summary and score >= 0.25:
            selected_summary[topic_label] = sent
        if len(selected_summary) >= 3:
            break

    # ========= Ensure At Least 3 Summary Topics =========
    # Second pass: fill remaining slots regardless of score.
    if len(selected_summary) < 3:
        for sent, topic_label, score, polarity in sorted_by_score:
            if sent == emotion_sent_text or topic_label in selected_summary:
                continue
            selected_summary[topic_label] = sent
            if len(selected_summary) >= 3:
                break

    # ========= Output Summary =========
    print("\n===== Auto-Generated Review Summary =====")
    for topic, sent in selected_summary.items():
        print(f"[{topic}]")
        print(f"- {sent}")

    # most_emotional stays None when the review has no usable sentence or every
    # sentence is neutral; indexing it unconditionally would raise TypeError.
    if most_emotional is not None:
        print(f"\nMost Emotional Sentence: {most_emotional[0]} | Sentiment Score: {most_emotional[3]:.2f}\n")
    else:
        print("\nMost Emotional Sentence: (none detected)\n")

    # ========= Save Results to Output List =========
    # Up to three (topic, sentence) pairs; missing slots are stored as None.
    summary_items = list(selected_summary.items())
    row = {"ReviewID": idx, "OriginalReview": review}
    for slot in range(3):
        if slot < len(summary_items):
            slot_topic, slot_sentence = summary_items[slot]
        else:
            slot_topic, slot_sentence = None, None
        row[f"Summary{slot + 1}"] = slot_sentence
        row[f"Topic{slot + 1}"] = slot_topic
    row["MostEmotionalSentence"] = most_emotional[0] if most_emotional else None
    row["EmotionTopic"] = most_emotional[1] if most_emotional else None
    row["EmotionScore"] = most_emotional[3] if most_emotional else None
    output_rows.append(row)

**Original Review**
The staff are very rude and not trained properly. No exceptions are made for children and elderly people. The price of
the ticket is very expensive given the distance and the service is extremely extremely poor

===== Auto-Generated Review Summary =====
[Staff & Service Attitude]
- The staff are very rude and not trained properly.

Most Emotional Sentence: The price of the ticket is very expensive given the distance and the service is extremely extremely poor | Sentiment Score: -0.98

**Original Review**
An excellent flight in Club World on British Airways. The welcome aboard was warm and that continued throughout the
flight. The crew were attentive, friendly and very professional. On board food for dinner and breakfast was good and
there was a well chosen selection of wines. In flight entertainment offered a great selection of films and audio. The
seat/flat bed was very comfortable - British Airways have done an excellent job in the design and comfort of the suites

### Export Results to CSV


In [8]:
# Turn the collected per-review records into a table, then write it to disk
# without the DataFrame index column.
df_output = pd.DataFrame(data=output_rows)
df_output.to_csv(path_or_buf="review_summary_output.csv", index=False)
print("✅ CSV file saved as 'review_summary_output.csv'")

✅ CSV file saved as 'review_summary_output.csv'
