In [3]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from torch.utils.data import WeightedRandomSampler
from torch.utils.data import RandomSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration

from transformers import pipeline


2025-08-29 22:00:53.653325: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [4]:
model_feature_fewshot = ['author_name' ,'number_authors_comments', 
                  'rating', 'text',  
                   'photo_attached', 
                  'responses','number_of_responses','location_name', 'location_type', 'MISC' ]

photo_attached_categories = [
    "No",
    "Yes", 
    "Unknown"
]

In [5]:


def preprocess_scraped_data_for_fewshot(reviews, metadata):
    input_format = ['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id','label']
    shop_metadata_input_format = ['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url']

    if not (set(input_format).issubset(set(reviews.columns))):
        raise ValueError("Reviews dataframe does not have the correct columns")
    if not (set(shop_metadata_input_format).issubset(set(metadata.columns))):
        raise ValueError("Meta dataframe does not have the correct columns")
    
    
    metadata['shop_name'] = metadata['name']
    metadata.drop(columns=['name'], inplace=True)
    metadata = metadata.drop_duplicates(subset=['gmap_id'])

    df_in = pd.merge(reviews, metadata, on='gmap_id', how='left')


    df_out = pd.DataFrame(columns=model_feature_fewshot) 

    df_out['author_name'] = df_in['name'].fillna("").astype(str)
    df_out['number_authors_comments'] = -1
    df_out['rating'] = df_in['rating'].fillna(-1).astype(int)
    df_out['text'] = df_in['text'].fillna("").astype(str)
    df_out['location_type'] = df_in['category'].fillna("Unknown").astype(str)
    df_out['location_name'] = df_in['shop_name'].fillna("Unknown").astype(str)
    df_out['MISC'] = df_in['MISC'].fillna("No MISC available").astype(str)
    #df_out['time'] = df_in['time'].fillna(-1).astype(int)
    df_out['photo_attached'] = df_in['pics'].apply(lambda x: "No" if pd.isna(x) else "Yes").astype(str)
    df_out['number_of_responses'] = df_in['resp'].apply(lambda x: 0 if pd.isna(x) else 1 ) # TODO: currently only one rsponse if multiple, correct in future 
    df_out['responses'] = df_in['resp'].fillna("").astype(str)
    return df_out   




In [6]:
meta = pd.read_json('data/reviews_2021/meta-other.json', lines=True)
reviews = pd.read_csv('review_other_head_labeled.csv').head(300)
features = preprocess_scraped_data_for_fewshot(metadata=meta, reviews=reviews)
# Split into train (80%) and test (20%)
x_train, x_test, y_train, y_test = train_test_split(features , reviews['label'], test_size=0.2, random_state=42, stratify=reviews['label'] ) # stratify keeps class balance))

In [7]:
print(str(x_test.iloc[1]))

author_name                                                Teijah Armbruster
number_authors_comments                                                   -1
rating                                                                     5
text                       Leslie had just finished a newborn photography...
photo_attached                                                            No
responses                                                                   
number_of_responses                                                        0
location_name                                    Leslie Carbajal Photography
location_type                      ['Photographer', 'Service establishment']
MISC                       {'From the business': ['Identifies as Black-ow...
Name: 108, dtype: object


In [12]:
#tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
#model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")


def evaluate_fewshot(examples_feature,examples_labels, data, format):
    pipe = pipeline("text-generation", model="google/flan-t5-small")
    #model_features = ['author_name' ,'number_authors_comments', 'rating', 'text','general_location_type', 'specific_location_type', 'time', 'photo_attached', 'responses','number_of_responses']
    # model_feature_fewshot = ['author_name' ,'number_authors_comments', 
    #              'rating', 'text',  
    #               'time', 'photo_attached', 
    #              'responses','number_of_responses','location_name', 'location_type', 'MISC' ]
    prompt = f"""Instruction:

    Reviews are given to you as input in the following format:
    with following information: 
    {format}

    

    Classify each review into one of the following labels based on the policies:

    

    Classify as ADVERTISEMENT 
        if the review fullfills for example one of the following:
        - contains links to other websites than the shop_website 
        - contains phone numbers
        - review mentions other businesses



    IRRELEVANT if if the review 
        - Talks about unrelated topics (not about the location). Take the location_type and the location_name into accoutn wehn evaluating if the location fits to the topic of the review.


    RANTS if if the review is a negative review where
        - the text indicates that the reviewer has never been to the location



    GOOD if the review violates none of the policies above. The following properties indicate additionalythat the review is a GOOD review:
        - there is a photo attached
        - there are responses meaningful responses to the review
        - the review refers to specific aspects of the location (e.g., service, ambiance, product quality). Consider the general_location_type of the location to determine relevant aspects.
        - negative reviews that do not violate the policies above are also classified as GOOD reviews.




    In addition here are some Examples:

    """
    print(examples_feature.shape[0])
    print(examples_labels.shape[0])

    for i in range(0,examples_feature.shape[0] ):
        prompt += "input:\n" + str(examples_feature.iloc[i]) +"\noutput: " + examples_labels.iloc[i] +"\n\n"

    prompt += "predict for following data: \ninput:\n" + str(data) + "\noutput: " 


    
    #input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    #outputs = model.generate(input_ids)
    result = pipe(prompt, max_new_tokens=20)
    return result

print(evaluate_fewshot(x_train.head(10), y_train.head(10), str(x_test.iloc[1]), model_feature_fewshot))

config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)a5b18a05535c9e14c7a355904270e15b0945ea86:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNe

10
10
[{'generated_text': "Instruction:\n\n    Reviews are given to you as input in the following format:\n    with following information: \n    ['author_name', 'number_authors_comments', 'rating', 'text', 'photo_attached', 'responses', 'number_of_responses', 'location_name', 'location_type', 'MISC']\n\n\n\n    Classify each review into one of the following labels based on the policies:\n\n\n\n    Classify as ADVERTISEMENT \n        if the review fullfills for example one of the following:\n        - contains links to other websites than the shop_website \n        - contains phone numbers\n        - review mentions other businesses\n\n\n\n    IRRELEVANT if if the review \n        - Talks about unrelated topics (not about the location). Take the location_type and the location_name into accoutn wehn evaluating if the location fits to the topic of the review.\n\n\n    RANTS if if the review is a negative review where\n        - the text indicates that the reviewer has never been to the lo