In [22]:
# !pip install torch transformers datasets scikit-learn pandas
# !pip install evaluate
!pip install accelerate





In [23]:
# Core Python
import os
import random
import numpy as np
import pandas as pd

# Hugging Face Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline
)

# Hugging Face Datasets
from datasets import Dataset

# Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# PyTorch
import torch


In [24]:
import opendatasets as od
od.download("https://www.kaggle.com/datasets/denizbilginn/google-maps-restaurant-reviews")

ModuleNotFoundError: No module named 'opendatasets'

## Data Ingestion and Data Processing

In [None]:
# Install dependencies
!pip install pandas scikit-learn nltk

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
nltk.download('punkt')

# --- 1. Load CSVs ---
reviews_df = pd.read_csv("google-maps-restaurant-reviews/reviews.csv")
meta_df = pd.read_csv("google-maps-restaurant-reviews/sepetcioglu_restaurant.csv")

print("=== Reviews CSV (first 5 rows) ===")
print(reviews_df.head())

=== Reviews CSV (first 5 rows) ===
                     business_name    author_name  \
0  Haci'nin Yeri - Yigit Lokantasi    Gulsum Akar   
1  Haci'nin Yeri - Yigit Lokantasi  Oguzhan Cetin   
2  Haci'nin Yeri - Yigit Lokantasi     Yasin Kuyu   
3  Haci'nin Yeri - Yigit Lokantasi     Orhan Kapu   
4  Haci'nin Yeri - Yigit Lokantasi     Ozgur Sati   

                                                text  \
0  We went to Marmaris with my wife for a holiday...   
1  During my holiday in Marmaris we ate here to f...   
2  Prices are very affordable. The menu in the ph...   
3  Turkey's cheapest artisan restaurant and its f...   
4  I don't know what you will look for in terms o...   

                                               photo  rating  \
0         dataset/taste/hacinin_yeri_gulsum_akar.png       5   
1        dataset/menu/hacinin_yeri_oguzhan_cetin.png       4   
2  dataset/outdoor_atmosphere/hacinin_yeri_yasin_...       3   
3  dataset/indoor_atmosphere/hacinin_yeri_orhan_k... 

[nltk_data] Downloading package punkt to /Users/juninho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
print("\n=== Restaurant Metadata CSV (first 5 rows) ===")
print(meta_df.head())


=== Restaurant Metadata CSV (first 5 rows) ===
                           photo  rating    rating_category
0  sepetcioglu_restaurant/09.png       4              taste
1  sepetcioglu_restaurant/01.png       5  indoor_atmosphere
2  sepetcioglu_restaurant/25.png       2               menu
3  sepetcioglu_restaurant/10.png       5              taste
4  sepetcioglu_restaurant/02.png       3  indoor_atmosphere


In [None]:
def clean_text_for_transformers(text):
    """Clean text but preserve useful signals for transformers."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "URL", text)   # replace links with token
    text = re.sub(r"[^a-zA-Z\s]", " ", text)       # keep letters + spaces
    text = re.sub(r"\s+", " ", text).strip()       # normalize whitespace
    return text

# Example usage:
reviews_df['cleaned_review'] = reviews_df['text'].apply(clean_text_for_transformers)
print(reviews_df[['text', 'cleaned_review']].head(10))


                                                text  \
0  We went to Marmaris with my wife for a holiday...   
1  During my holiday in Marmaris we ate here to f...   
2  Prices are very affordable. The menu in the ph...   
3  Turkey's cheapest artisan restaurant and its f...   
4  I don't know what you will look for in terms o...   
5                                    Generally good.   
6  What you see is 125 TL in total. It's a pretty...   
7  Delicious food at rock bottom prices. Friendly...   
8  Every time I go, I still experience the amazem...   
9          The most f/p of all businesses I've seen.   

                                      cleaned_review  
0  we went to marmaris with my wife for a holiday...  
1  during my holiday in marmaris we ate here to f...  
2  prices are very affordable the menu in the pho...  
3  turkey s cheapest artisan restaurant and its f...  
4  i don t know what you will look for in terms o...  
5                                     generally good 

## Pseudo Labeling Using LLM 

LLMs in consideration - Might use all to stress test

  google/flan-t5-large or flan-t5-xl
  
  mistralai/Mistral-7B-Instruct
  
  HuggingFaceH4/zephyr-7b-alpha

## Prompt Pipeline

In [None]:
import pandas as pd
from transformers import pipeline

# Sample a smaller set for stress testing
sample_df = reviews_df.sample(100, random_state=42).reset_index(drop=True)

# 2. Candidate label set
candidate_labels = ["trustworthy", "advertisement", "rant", "irrelevant"]

# 3. Define prompts
PROMPTS = {
    "direct": """Classify the review into one of exactly these categories:
[trustworthy, advertisement, rant, irrelevant].
Return ONLY the category word and nothing else.
Review: {review}""",

    "few_shot": """Examples:
Review: "Best pizza in town, will come again!" → trustworthy
Review: "Visit my website www.bestfoodpromo.com for deals!" → advertisement
Review: "I hate this place, never been there but looks bad." → rant
Review: "qwerty lorem ipsum nothing" → irrelevant

Now classify the following review.
Return ONLY one category word: trustworthy, advertisement, rant, irrelevant.
Review: {review}""",

    "cot": """Think step by step to decide if the review is trustworthy, an advertisement, a rant, or irrelevant.
At the end, output ONLY one word: trustworthy, advertisement, rant, or irrelevant.
Review: {review}"""
}

# 4. Define models (swap in different Hugging Face checkpoints here)
MODELS = {
    "bart": pipeline("zero-shot-classification", model="facebook/bart-large-mnli"),
    "t5_large": pipeline("text2text-generation", model="google/flan-t5-large"),
    "t5_base": pipeline("text2text-generation", model="google/flan-t5-base"),
    # You can add flan-t5-large if Colab has GPU
}

# 5. Run function
def classify_review(review, model_name, prompt_type):
    prompt = PROMPTS[prompt_type].format(review=review)
    model = MODELS[model_name]

    if model_name == "bart":
        result = model(review, candidate_labels=candidate_labels)
        return result["labels"][0]  # top predicted label

    else:  # T5 family
        result = model(prompt, max_length=32, clean_up_tokenization_spaces=True)
        return result[0]["generated_text"]

# 6. Stress test loop
results = []
for idx, row in sample_df.iterrows():
    review = row["text"]
    for model_name in MODELS.keys():
        for prompt_type in PROMPTS.keys():
            label = classify_review(review, model_name, prompt_type)
            results.append({
                "review": review,
                "model": model_name,
                "prompt": prompt_type,
                "label": label
            })

results_df = pd.DataFrame(results)
results_df.to_csv("stress_test_results.csv", index=False)

print(results_df.head())


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use mps:0


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use mps:0


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use mps:0
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_class

                                              review     model    prompt  \
0  I went to there with my girl friend; I liked f...      bart    direct   
1  I went to there with my girl friend; I liked f...      bart  few_shot   
2  I went to there with my girl friend; I liked f...      bart       cot   
3  I went to there with my girl friend; I liked f...  t5_large    direct   
4  I went to there with my girl friend; I liked f...  t5_large  few_shot   

                    label  
0                    rant  
1                    rant  
2                    rant  
3             trustworthy  
4  not enough information  


In [None]:
import pandas as pd
from transformers import pipeline
from collections import Counter

# ------------------------------------------------------
# 1️⃣ Sample a smaller set for stress testing
# ------------------------------------------------------
sample_df = reviews_df.sample(100, random_state=42).reset_index(drop=True)

# ------------------------------------------------------
# 2️⃣ Candidate labels
# ------------------------------------------------------
candidate_labels = ["trustworthy", "advertisement", "rant", "irrelevant"]

# ------------------------------------------------------
# 3️⃣ T5 prompt templates (cleaner & parameterized)
# ------------------------------------------------------
PROMPTS = {
    "direct": """Classify the review into one of exactly these categories:
[trustworthy, advertisement, rant, irrelevant].
Return only the single category word.

Review: {review}""",

    "few_shot": """Here are some examples:
"Best pizza in town, will come again!" -> trustworthy
"Visit www.bestfoodpromo.com for deals!" -> advertisement
"I hate this place, never been there but looks bad." -> rant
"qwerty lorem ipsum" -> irrelevant

Now classify this review into one category word:
{review}""",

    "cot": """Think step by step and decide whether the review is trustworthy, an advertisement, a rant, or irrelevant.
At the end, output ONLY one word.

Review: {review}"""
}

# ------------------------------------------------------
# 4️⃣ Define models
# ------------------------------------------------------
MODELS = {
    "bart": pipeline("zero-shot-classification", model="facebook/bart-large-mnli"),
    "t5_base": pipeline("text2text-generation", model="google/flan-t5-base"),
    "t5_large": pipeline("text2text-generation", model="google/flan-t5-large"),
}

# ------------------------------------------------------
# 5️⃣ Generator parameters for T5
# ------------------------------------------------------
T5_GEN_PARAMS = {
    "max_length": 5,           # single-word output
    "temperature": 0.0,        # deterministic
    "num_return_sequences": 3, # generate multiple outputs
    "do_sample": False,
    "repetition_penalty": 1.2,
    "no_repeat_ngram_size": 2
}

# ------------------------------------------------------
# 6️⃣ Thresholds for BART
# ------------------------------------------------------
THRESHOLDS = {
    "trustworthy": 0.5,
    "advertisement": 0.65,
    "rant": 0.55,
    "irrelevant": 0.5
}

# ------------------------------------------------------
# 7️⃣ Function to classify a review
# ------------------------------------------------------
def classify_review(review, model_name, prompt_type):
    prompt = PROMPTS[prompt_type].format(review=review)
    model = MODELS[model_name]

    if model_name == "bart":
        result = model(review, candidate_labels=candidate_labels, multi_label=False)
        top_label = result["labels"][0]
        confidence = result["scores"][0]
        # Apply threshold
        threshold = THRESHOLDS.get(top_label.lower(), 0.5)
        final_label = top_label if confidence >= threshold else "uncertain"
        return final_label

    else:  # T5 models
        outputs = model(prompt, **T5_GEN_PARAMS)
        # Clean outputs and normalize
        predictions = [out["generated_text"].strip().lower() for out in outputs]
        normalize_map = {
            "[trustworthy]": "trustworthy",
            "[inappropriate]": "irrelevant",
            "untrustworthy": "rant"
        }
        predictions = [normalize_map.get(p, p) for p in predictions]
        # Majority vote
        most_common = max(set(predictions), key=predictions.count)
        return most_common

# ------------------------------------------------------
# 8️⃣ Stress test loop
# ------------------------------------------------------
results = []
for idx, row in sample_df.iterrows():
    review = row["text"]
    for model_name in MODELS.keys():
        for prompt_type in PROMPTS.keys():
            label = classify_review(review, model_name, prompt_type)
            results.append({
                "review": review,
                "model": model_name,
                "prompt": prompt_type,
                "label": label
            })

results_df = pd.DataFrame(results)
results_df.to_csv("stress_test_results.csv", index=False)
print(results_df.head())

# ------------------------------------------------------
# 9️⃣ Optional: compute agreement ratio
# ------------------------------------------------------
def compute_agreement(df):
    agreements = []
    for review, group in df.groupby("review"):
        labels = [str(l).lower().strip() for l in group["label"]]
        most_common_label, count = Counter(labels).most_common(1)[0]
        agreement_ratio = count / len(labels)
        agreements.append({
            "review": review,
            "most_common_label": most_common_label,
            "agreement_ratio": agreement_ratio,
            "all_labels": labels
        })
    return pd.DataFrame(agreements)

agreement_df = compute_agreement(results_df)
agreement_df.to_csv("agreement_summary.csv", index=False)
print(agreement_df.head())


Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Both `max_new_tokens` (=256) and `max_length`(=5) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Both `max_new_tokens` (=256) and `max_length`(=5) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Both `max_new_tokens` (=256) a

                                              review    model    prompt  \
0  I went to there with my girl friend; I liked f...     bart    direct   
1  I went to there with my girl friend; I liked f...     bart  few_shot   
2  I went to there with my girl friend; I liked f...     bart       cot   
3  I went to there with my girl friend; I liked f...  t5_base    direct   
4  I went to there with my girl friend; I liked f...  t5_base  few_shot   

           label  
0      uncertain  
1      uncertain  
2      uncertain  
3  advertisement  
4            bad  
                                              review most_common_label  \
0  A decent place. You can eat as a family. You c...       trustworthy   
1                                 A delicious doner.       trustworthy   
2  A good kebab place. The meat is very good; tas...       trustworthy   
3  Appetizers were good. The size of the lahmacun...       trustworthy   
4  As we were passing on the road; it was pleasin...       trustw

In [None]:
model_name = "distilbert-base-uncased"  # good balance of speed/accuracy
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

df = pd.read_csv("agreement_summary.csv")
dataset = Dataset.from_pandas(df)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize, batched='True')

#train
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # ideally split into train/test
    tokenizer=tokenizer,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 100/100 [00:00<00:00, 6039.92 examples/s]


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# Install Hugging Face with PyTorch support
!pip install transformers[torch]

# OR install accelerate directly
!pip install --upgrade accelerate>=0.26.0




In [None]:
import transformers, accelerate, torch
print("Transformers:", transformers.__version__)
print("Accelerate:", accelerate.__version__)
print("Torch:", torch.__version__)


Transformers: 4.55.4
Accelerate: 1.10.1
Torch: 2.8.0+cpu


In [27]:
# -----------------------------
# Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer, FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

# -----------------------------
# 1. Load dataset
# -----------------------------
df = pd.read_csv("agreement_summary.csv")

# Convert 'all_labels' column from string to actual list
import ast
df['all_labels'] = df['all_labels'].apply(ast.literal_eval)

# Filter out classes with <2 samples to avoid stratify errors
value_counts = df['most_common_label'].value_counts()
valid_labels = value_counts[value_counts > 1].index
df = df[df['most_common_label'].isin(valid_labels)].reset_index(drop=True)

# -----------------------------
# 2. Features and target
# -----------------------------
X = df[['review', 'agreement_ratio', 'all_labels']]
y = df['most_common_label']

# -----------------------------
# 3. Preprocessing / Feature Engineering
# -----------------------------
# TF-IDF for review text
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# One-hot encoding for 'all_labels' (multi-label)
mlb = MultiLabelBinarizer()
all_labels_features = mlb.fit_transform(df['all_labels'])

# Helper to extract columns in pipeline
def get_column(col_name):
    return FunctionTransformer(lambda x: np.array(x[col_name]).reshape(-1,1), validate=False)

# -----------------------------
# 4. Build pipeline
# -----------------------------
combined_features = FeatureUnion([
    ('tfidf', Pipeline([
        ('selector', FunctionTransformer(lambda x: x['review'], validate=False)),
        ('tfidf', tfidf_vectorizer)
    ])),
    ('agreement_ratio', Pipeline([
        ('selector', get_column('agreement_ratio'))
    ])),
    ('all_labels', Pipeline([
        ('selector', FunctionTransformer(lambda x: mlb.transform(x['all_labels']), validate=False))
    ]))
])

clf_pipeline = Pipeline([
    ('features', combined_features),
    ('rf', RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42))
])

# -----------------------------
# 5. Train/test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 6. Train classifier
# -----------------------------
clf_pipeline.fit(X_train, y_train)

# -----------------------------
# 7. Evaluate
# -----------------------------
y_pred = clf_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# -----------------------------
# 8. Predict new reviews
# -----------------------------
new_reviews = pd.DataFrame({
    'review': [
        "The service was amazing and the staff were friendly",
        "Buy this product now! Limited offer!"
    ],
    'agreement_ratio': [1.0, 0.2],
    'all_labels': [['trustworthy'], ['uncertain']]
})

preds = clf_pipeline.predict(new_reviews)
print("\nPredictions on new reviews:", preds)


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

        rant       1.00      1.00      1.00         4
 trustworthy       1.00      1.00      1.00        12
   uncertain       1.00      1.00      1.00         4

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20


Predictions on new reviews: ['trustworthy' 'uncertain']


In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import evaluate
import numpy as np

In [3]:
import pandas as pd

# Load CSV
df = pd.read_csv("pseudo_labels.csv")

# Check first few rows
print(df.head())

                                              review most_common_label  \
0  10 mussels + half a kokorec is 240TL. Also; th...         uncertain   
1  160-year-old Vefa Bozacisi has not lost its fl...         uncertain   
2  2 appetizers 1 salad the smell of blood and th...         uncertain   
3  2 roasted meat on rice + a ayran and a compote...         uncertain   
4  250 TL account was received for 1 portion of r...         uncertain   

   agreement_ratio                                         all_labels  
0         0.333333  ['uncertain', 'uncertain', 'uncertain', 'rant'...  
1         0.333333  ['uncertain', 'uncertain', 'uncertain', 'trust...  
2         0.333333  ['uncertain', 'uncertain', 'uncertain', 'rant'...  
3         0.333333  ['uncertain', 'uncertain', 'uncertain', 'trust...  
4         0.333333  ['uncertain', 'uncertain', 'uncertain', 'irrel...  


In [4]:
from datasets import Dataset

# Convert pandas dataframe to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [5]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["review_text"], padding=True, truncation=True)

tokenized_ds = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/1088 [00:00<?, ? examples/s]

KeyError: 'review_text'