In [None]:
!pip install opendatasets

In [5]:
import opendatasets as od
od.download("https://www.kaggle.com/datasets/denizbilginn/google-maps-restaurant-reviews")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Dataset URL: https://www.kaggle.com/datasets/denizbilginn/google-maps-restaurant-reviews
Downloading google-maps-restaurant-reviews.zip to ./google-maps-restaurant-reviews


100%|██████████| 657M/657M [00:00<00:00, 3.14GB/s]





## Data Ingestion and Data Processing

In [7]:
# Install dependencies
!pip install pandas scikit-learn nltk

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
nltk.download('punkt')

# --- 1. Load CSVs ---
reviews_df = pd.read_csv("google-maps-restaurant-reviews/reviews.csv")
meta_df = pd.read_csv("google-maps-restaurant-reviews/sepetcioglu_restaurant.csv")

print("=== Reviews CSV (first 5 rows) ===")
print(reviews_df.head())

=== Reviews CSV (first 5 rows) ===
                     business_name    author_name  \
0  Haci'nin Yeri - Yigit Lokantasi    Gulsum Akar   
1  Haci'nin Yeri - Yigit Lokantasi  Oguzhan Cetin   
2  Haci'nin Yeri - Yigit Lokantasi     Yasin Kuyu   
3  Haci'nin Yeri - Yigit Lokantasi     Orhan Kapu   
4  Haci'nin Yeri - Yigit Lokantasi     Ozgur Sati   

                                                text  \
0  We went to Marmaris with my wife for a holiday...   
1  During my holiday in Marmaris we ate here to f...   
2  Prices are very affordable. The menu in the ph...   
3  Turkey's cheapest artisan restaurant and its f...   
4  I don't know what you will look for in terms o...   

                                               photo  rating  \
0         dataset/taste/hacinin_yeri_gulsum_akar.png       5   
1        dataset/menu/hacinin_yeri_oguzhan_cetin.png       4   
2  dataset/outdoor_atmosphere/hacinin_yeri_yasin_...       3   
3  dataset/indoor_atmosphere/hacinin_yeri_orhan_k... 

[nltk_data] Downloading package punkt to /Users/juninho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
print("\n=== Restaurant Metadata CSV (first 5 rows) ===")
print(meta_df.head())


=== Restaurant Metadata CSV (first 5 rows) ===
                           photo  rating    rating_category
0  sepetcioglu_restaurant/09.png       4              taste
1  sepetcioglu_restaurant/01.png       5  indoor_atmosphere
2  sepetcioglu_restaurant/25.png       2               menu
3  sepetcioglu_restaurant/10.png       5              taste
4  sepetcioglu_restaurant/02.png       3  indoor_atmosphere


In [9]:
def clean_text_for_transformers(text):
    """Clean text but preserve useful signals for transformers."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "URL", text)   # replace links with token
    text = re.sub(r"[^a-zA-Z\s]", " ", text)       # keep letters + spaces
    text = re.sub(r"\s+", " ", text).strip()       # normalize whitespace
    return text

# Example usage:
reviews_df['cleaned_review'] = reviews_df['text'].apply(clean_text_for_transformers)
print(reviews_df[['text', 'cleaned_review']].head(10))


                                                text  \
0  We went to Marmaris with my wife for a holiday...   
1  During my holiday in Marmaris we ate here to f...   
2  Prices are very affordable. The menu in the ph...   
3  Turkey's cheapest artisan restaurant and its f...   
4  I don't know what you will look for in terms o...   
5                                    Generally good.   
6  What you see is 125 TL in total. It's a pretty...   
7  Delicious food at rock bottom prices. Friendly...   
8  Every time I go, I still experience the amazem...   
9          The most f/p of all businesses I've seen.   

                                      cleaned_review  
0  we went to marmaris with my wife for a holiday...  
1  during my holiday in marmaris we ate here to f...  
2  prices are very affordable the menu in the pho...  
3  turkey s cheapest artisan restaurant and its f...  
4  i don t know what you will look for in terms o...  
5                                     generally good 

## Pseudo Labeling Using LLM 

LLMs in consideration - Might use all to stress test

  google/flan-t5-large or flan-t5-xl
  
  mistralai/Mistral-7B-Instruct
  
  HuggingFaceH4/zephyr-7b-alpha

In [11]:
!pip install transformers accelerate sentencepiece -q

## Prompt Pipeline

In [12]:
import pandas as pd
from transformers import pipeline

# Sample a smaller set for stress testing
sample_df = reviews_df.sample(100, random_state=42).reset_index(drop=True)

# 2. Candidate label set
candidate_labels = ["trustworthy", "advertisement", "rant", "irrelevant"]

# 3. Define prompts
PROMPTS = {
    "direct": """Classify the review into one of exactly these categories:
[trustworthy, advertisement, rant, irrelevant].
Return ONLY the category word and nothing else.
Review: {review}""",

    "few_shot": """Examples:
Review: "Best pizza in town, will come again!" → trustworthy
Review: "Visit my website www.bestfoodpromo.com for deals!" → advertisement
Review: "I hate this place, never been there but looks bad." → rant
Review: "qwerty lorem ipsum nothing" → irrelevant

Now classify the following review.
Return ONLY one category word: trustworthy, advertisement, rant, irrelevant.
Review: {review}""",

    "cot": """Think step by step to decide if the review is trustworthy, an advertisement, a rant, or irrelevant.
At the end, output ONLY one word: trustworthy, advertisement, rant, or irrelevant.
Review: {review}"""
}

# 4. Define models (swap in different Hugging Face checkpoints here)
MODELS = {
    "bart": pipeline("zero-shot-classification", model="facebook/bart-large-mnli"),
    "t5_large": pipeline("text2text-generation", model="google/flan-t5-large"),
    "t5_base": pipeline("text2text-generation", model="google/flan-t5-base"),
    # You can add flan-t5-large if Colab has GPU
}

# 5. Run function
def classify_review(review, model_name, prompt_type):
    prompt = PROMPTS[prompt_type].format(review=review)
    model = MODELS[model_name]

    if model_name == "bart":
        result = model(review, candidate_labels=candidate_labels)
        return result["labels"][0]  # top predicted label

    else:  # T5 family
        result = model(prompt, max_length=32, clean_up_tokenization_spaces=True)
        return result[0]["generated_text"]

# 6. Stress test loop
results = []
for idx, row in sample_df.iterrows():
    review = row["text"]
    for model_name in MODELS.keys():
        for prompt_type in PROMPTS.keys():
            label = classify_review(review, model_name, prompt_type)
            results.append({
                "review": review,
                "model": model_name,
                "prompt": prompt_type,
                "label": label
            })

results_df = pd.DataFrame(results)
results_df.to_csv("stress_test_results.csv", index=False)

print(results_df.head())


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use mps:0


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use mps:0


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use mps:0
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_class

                                              review     model    prompt  \
0  I went to there with my girl friend; I liked f...      bart    direct   
1  I went to there with my girl friend; I liked f...      bart  few_shot   
2  I went to there with my girl friend; I liked f...      bart       cot   
3  I went to there with my girl friend; I liked f...  t5_large    direct   
4  I went to there with my girl friend; I liked f...  t5_large  few_shot   

                    label  
0                    rant  
1                    rant  
2                    rant  
3             trustworthy  
4  not enough information  


In [13]:
import pandas as pd
from transformers import pipeline
from collections import Counter

# ------------------------------------------------------
# 1️⃣ Sample a smaller set for stress testing
# ------------------------------------------------------
sample_df = reviews_df.sample(100, random_state=42).reset_index(drop=True)

# ------------------------------------------------------
# 2️⃣ Candidate labels
# ------------------------------------------------------
candidate_labels = ["trustworthy", "advertisement", "rant", "irrelevant"]

# ------------------------------------------------------
# 3️⃣ T5 prompt templates (cleaner & parameterized)
# ------------------------------------------------------
PROMPTS = {
    "direct": """Classify the review into one of exactly these categories:
[trustworthy, advertisement, rant, irrelevant].
Return only the single category word.

Review: {review}""",

    "few_shot": """Here are some examples:
"Best pizza in town, will come again!" -> trustworthy
"Visit www.bestfoodpromo.com for deals!" -> advertisement
"I hate this place, never been there but looks bad." -> rant
"qwerty lorem ipsum" -> irrelevant

Now classify this review into one category word:
{review}""",

    "cot": """Think step by step and decide whether the review is trustworthy, an advertisement, a rant, or irrelevant.
At the end, output ONLY one word.

Review: {review}"""
}

# ------------------------------------------------------
# 4️⃣ Define models
# ------------------------------------------------------
MODELS = {
    "bart": pipeline("zero-shot-classification", model="facebook/bart-large-mnli"),
    "t5_base": pipeline("text2text-generation", model="google/flan-t5-base"),
    "t5_large": pipeline("text2text-generation", model="google/flan-t5-large"),
}

# ------------------------------------------------------
# 5️⃣ Generator parameters for T5
# ------------------------------------------------------
T5_GEN_PARAMS = {
    "max_length": 5,           # single-word output
    "temperature": 0.0,        # deterministic
    "num_return_sequences": 3, # generate multiple outputs
    "do_sample": False,
    "repetition_penalty": 1.2,
    "no_repeat_ngram_size": 2
}

# ------------------------------------------------------
# 6️⃣ Thresholds for BART
# ------------------------------------------------------
THRESHOLDS = {
    "trustworthy": 0.5,
    "advertisement": 0.65,
    "rant": 0.55,
    "irrelevant": 0.5
}

# ------------------------------------------------------
# 7️⃣ Function to classify a review
# ------------------------------------------------------
def classify_review(review, model_name, prompt_type):
    prompt = PROMPTS[prompt_type].format(review=review)
    model = MODELS[model_name]

    if model_name == "bart":
        result = model(review, candidate_labels=candidate_labels, multi_label=False)
        top_label = result["labels"][0]
        confidence = result["scores"][0]
        # Apply threshold
        threshold = THRESHOLDS.get(top_label.lower(), 0.5)
        final_label = top_label if confidence >= threshold else "uncertain"
        return final_label

    else:  # T5 models
        outputs = model(prompt, **T5_GEN_PARAMS)
        # Clean outputs and normalize
        predictions = [out["generated_text"].strip().lower() for out in outputs]
        normalize_map = {
            "[trustworthy]": "trustworthy",
            "[inappropriate]": "irrelevant",
            "untrustworthy": "rant"
        }
        predictions = [normalize_map.get(p, p) for p in predictions]
        # Majority vote
        most_common = max(set(predictions), key=predictions.count)
        return most_common

# ------------------------------------------------------
# 8️⃣ Stress test loop
# ------------------------------------------------------
results = []
for idx, row in sample_df.iterrows():
    review = row["text"]
    for model_name in MODELS.keys():
        for prompt_type in PROMPTS.keys():
            label = classify_review(review, model_name, prompt_type)
            results.append({
                "review": review,
                "model": model_name,
                "prompt": prompt_type,
                "label": label
            })

results_df = pd.DataFrame(results)
results_df.to_csv("stress_test_results.csv", index=False)
print(results_df.head())

# ------------------------------------------------------
# 9️⃣ Optional: compute agreement ratio
# ------------------------------------------------------
def compute_agreement(df):
    agreements = []
    for review, group in df.groupby("review"):
        labels = [str(l).lower().strip() for l in group["label"]]
        most_common_label, count = Counter(labels).most_common(1)[0]
        agreement_ratio = count / len(labels)
        agreements.append({
            "review": review,
            "most_common_label": most_common_label,
            "agreement_ratio": agreement_ratio,
            "all_labels": labels
        })
    return pd.DataFrame(agreements)

agreement_df = compute_agreement(results_df)
agreement_df.to_csv("agreement_summary.csv", index=False)
print(agreement_df.head())


Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Both `max_new_tokens` (=256) and `max_length`(=5) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Both `max_new_tokens` (=256) and `max_length`(=5) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Both `max_new_tokens` (=256) a

                                              review    model    prompt  \
0  I went to there with my girl friend; I liked f...     bart    direct   
1  I went to there with my girl friend; I liked f...     bart  few_shot   
2  I went to there with my girl friend; I liked f...     bart       cot   
3  I went to there with my girl friend; I liked f...  t5_base    direct   
4  I went to there with my girl friend; I liked f...  t5_base  few_shot   

           label  
0      uncertain  
1      uncertain  
2      uncertain  
3  advertisement  
4            bad  
                                              review most_common_label  \
0  A decent place. You can eat as a family. You c...       trustworthy   
1                                 A delicious doner.       trustworthy   
2  A good kebab place. The meat is very good; tas...       trustworthy   
3  Appetizers were good. The size of the lahmacun...       trustworthy   
4  As we were passing on the road; it was pleasin...       trustw

In [35]:
# !pip install datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

ValueError: pyarrow.lib.IpcReadOptions size changed, may indicate binary incompatibility. Expected 112 from C header, got 104 from PyObject

In [None]:
model_name = "distilbert-base-uncased"  # good balance of speed/accuracy
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

df = pd.read_csv("agreement_summary.csv")
dataset = dataset.from_pandas(df)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

tokenized_dataset = df.map(tokenize)

#train
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # ideally split into train/test
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
import os
import random
from pathlib import Path
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline,
)

# -----------------------------
# Configuration (edit as needed)
# -----------------------------
DATA_PATH = "reviews_labeled.csv"  # CSV with columns 'text' and 'label'
MODEL_NAME = "roberta-base"
OUTPUT_DIR = "./roberta-review-checkpoint"
NUM_EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
MAX_LENGTH = 256
RANDOM_SEED = 42

# If your labels are strings, list them here in desired order. If labels are already integers 0..N-1,
# keep LABEL_LIST = None and the script will infer the number of labels automatically.
# Example for your hackathon classes:
# Relevant, Advertisement, Irrelevant, Rant, Spam
LABEL_LIST = ["Relevant", "Advertisement", "Irrelevant", "Rant", "Spam"]

# -----------------------------
# Helpers
# -----------------------------

def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def load_dataframe(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    if "text" not in df.columns or "label" not in df.columns:
        raise ValueError("CSV must contain 'text' and 'label' columns")
    return df[["text", "label"]].dropna().reset_index(drop=True)


def prepare_dataset(df: pd.DataFrame, label2id: dict):
    # If labels are strings, map them to ints. If ints already, try to coerce.
    if df["label"].dtype == object or isinstance(df.loc[0, "label"], str):
        df = df.copy()
        df["label"] = df["label"].map(label2id)
    else:
        # ensure integer dtype
        df["label"] = df["label"].astype(int)
    return Dataset.from_pandas(df)


# -----------------------------
# Main training routine
# -----------------------------

def main():
    set_seed(RANDOM_SEED)

    # --- Load CSV ---
    if not Path(DATA_PATH).exists():
        raise FileNotFoundError(f"Data file {DATA_PATH} not found. Put your CSV at this path or edit DATA_PATH")

    df = load_dataframe(DATA_PATH)

    # --- Label mapping ---
    global LABEL_LIST
    if LABEL_LIST is None:
        # infer labels (assume 0..N-1 if ints present)
        unique_labels = sorted(df["label"].unique())
        label_list = [str(x) for x in unique_labels]
        label2id = {lab: i for i, lab in enumerate(label_list)}
        id2label = {i: lab for lab, i in label2id.items()}
    else:
        label_list = LABEL_LIST
        label2id = {lab: i for i, lab in enumerate(label_list)}
        id2label = {i: lab for lab, i in label2id.items()}

    num_labels = len(label_list)
    print(f"Labels ({num_labels}): {label_list}")

    # --- Train/validation split ---
    train_df, val_df = train_test_split(df, test_size=0.15, random_state=RANDOM_SEED, stratify=df["label"] if df["label"].nunique() > 1 else None)
    print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

    # --- Prepare Hugging Face datasets ---
    train_ds = prepare_dataset(train_df, label2id)
    val_ds = prepare_dataset(val_df, label2id)

    # --- Tokenizer and model ---
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id)

    def tokenize_fn(batch):
        return tokenizer(batch["text"], truncation=True, padding=False, max_length=MAX_LENGTH)

    train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=["text", "__index_level_0__"] if "__index_level_0__" in train_ds.column_names else ["text"])  # remove the raw text column
    val_ds = val_ds.map(tokenize_fn, batched=True, remove_columns=["text", "__index_level_0__"] if "__index_level_0__" in val_ds.column_names else ["text"])

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # --- Metrics ---
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = np.argmax(pred.predictions, axis=1)
        acc = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
        return {"accuracy": acc, "precision_macro": precision, "recall_macro": recall, "f1_macro": f1}

    # --- Training arguments ---
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        push_to_hub=False,
        logging_steps=50,
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # --- Train ---
    trainer.train()

    # --- Save ---
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Model and tokenizer saved to {OUTPUT_DIR}")

    # --- Quick local inference demo ---
    classifier = pipeline("text-classification", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR, return_all_scores=True)

    demo_texts = [
        "Amazing food — will come back!",
        "Call us now to get 70% off on shoes! Visit our site",
        "I don't know much about this place but I heard bad things",
    ]

    print("\nDemo predictions:\n")
    for t in demo_texts:
        out = classifier(t, truncation=True, max_length=MAX_LENGTH)
        # out is a list of dicts with scores per label
        # convert to label:score mapping for clearer printing
        label_scores = {d['label']: d['score'] for d in out[0]}
        print("TEXT:\n", t)
        print("PRED:", label_scores)
        print("-")


if __name__ == "__main__":
    main()

SyntaxError: invalid syntax (740767688.py, line 185)