In [12]:
#from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from torch.utils.data import WeightedRandomSampler
from torch.utils.data import RandomSampler
#from transformers import T5Tokenizer, T5ForConditionalGeneration

from transformers import pipeline
import openai
from openai import OpenAI
import os




In [13]:
model_feature_fewshot = ['author_name' ,'number_authors_comments', 
                  'rating', 'text',  
                   'photo_attached', 
                  'responses','number_of_responses','location_name', 'location_type', 'MISC' ]

photo_attached_categories = [
    "No",
    "Yes", 
    "Unknown"
]

In [14]:


def preprocess_scraped_data_for_fewshot(reviews, metadata):
    input_format = ['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id','label']
    shop_metadata_input_format = ['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url']

    if not (set(input_format).issubset(set(reviews.columns))):
        raise ValueError("Reviews dataframe does not have the correct columns")
    if not (set(shop_metadata_input_format).issubset(set(metadata.columns))):
        raise ValueError("Meta dataframe does not have the correct columns")
    
    
    metadata['shop_name'] = metadata['name']
    metadata.drop(columns=['name'], inplace=True)
    metadata = metadata.drop_duplicates(subset=['gmap_id'])

    df_in = pd.merge(reviews, metadata, on='gmap_id', how='left')


    df_out = pd.DataFrame(columns=model_feature_fewshot) 

    df_out['author_name'] = df_in['name'].fillna("").astype(str)
    df_out['number_authors_comments'] = -1
    df_out['rating'] = df_in['rating'].fillna(-1).astype(int)
    df_out['text'] = df_in['text'].fillna("No text available").astype(str)
    df_out['location_type'] = df_in['category'].fillna("Unknown").astype(str)
    df_out['location_name'] = df_in['shop_name'].fillna("Unknown").astype(str)
    df_out['MISC'] = df_in['MISC'].fillna("No MISC available").astype(str)
    #df_out['time'] = df_in['time'].fillna(-1).astype(int)
    df_out['photo_attached'] = df_in['pics'].apply(lambda x: "No" if pd.isna(x) else "Yes").astype(str)
    df_out['number_of_responses'] = df_in['resp'].apply(lambda x: 0 if pd.isna(x) else 1 ) # TODO: currently only one rsponse if multiple, correct in future 
    df_out['responses'] = df_in['resp'].fillna("No response available").astype(str)
    return df_out   




In [15]:
#meta = pd.read_json('data_clemens/reviews_2021/meta-other.json', lines=True)
examples = pd.read_csv('data_clemens/input_featurs_csv_example_based_on_reviews_2021_label.csv')
examples = examples.dropna(subset=['label'])
#features = preprocess_scraped_data_for_fewshot(metadata=meta, reviews=reviews)
#features.sample(n=20, random_state=42).to_csv("input_featurs_csv_example_based_on_reviews_2021.csv",index=False)
#print(examples)
features = examples.drop(columns=['label']).iloc[1:]
labels =examples['label'].iloc[1:]
test_features = examples.drop(columns=['label']).iloc[0]
test_label = examples['label'].iloc[0]
# Split into train (80%) and test (20%)
#x_train, x_test, y_train, y_test = train_test_split(examples , examples['label'], test_size=0.2, random_state=42, stratify=reviews['label'] ) # stratify keeps class balance))

In [None]:
#tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
#model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")


def generate_prompt(examples_feature,examples_labels, data, format):
    #pipe = pipeline("text-generation", model="google/flan-t5-small")
    #model_features = ['author_name' ,'number_authors_comments', 'rating', 'text','general_location_type', 'specific_location_type', 'time', 'photo_attached', 'responses','number_of_responses']
    # model_feature_fewshot = ['author_name' ,'number_authors_comments', 
    #              'rating', 'text',  
    #               'time', 'photo_attached', 
    #              'responses','number_of_responses','location_name', 'location_type', 'MISC' ]
    prompt = f"""Instruction:

    Reviews are given to you as input in the following format:
    with following information: 
    {format}

    

    Classify each review into one of the following labels based on the policies:

    

    Classify as ADVERTISEMENT 
        if the review fullfills for example one of the following:
        - contains links to other websites than the shop_website 
        - contains phone numbers
        - review mentions other businesses



    IRRELEVANT if if the review 
        - Talks about unrelated topics (not about the location). Take the location_type and the location_name into accoutn wehn evaluating if the location fits to the topic of the review.


    RANTS if if the review is a negative review where
        - the text indicates that the reviewer has never been to the location



    GOOD if the review violates none of the policies above. The following properties indicate additionalythat the review is a GOOD review:
        - there is a photo attached
        - there are responses meaningful responses to the review
        - the review refers to specific aspects of the location (e.g., service, ambiance, product quality). Consider the general_location_type of the location to determine relevant aspects.
        - negative reviews that do not violate the policies above are also classified as GOOD reviews.




    In addition here are some Examples:

    """
    print(examples_feature.shape[0])
    print(examples_labels.shape[0])

    for i in range(0,examples_feature.shape[0] ):
        prompt += "input:\n" + str(examples_feature.iloc[i]) +"\noutput: " + examples_labels.iloc[i] +"\n\n"

    prompt += "predict for following data: \ninput:\n" + str(data) + "\noutput: " 


 
    return prompt


def classify_with_openai(prompt: str, model="gpt-4o-mini"):
   

    client = OpenAI(
    api_key="" # paste api key in here 
    )

    response = client.responses.create(
    model="gpt-4o-mini",
    input=prompt,
    store=True,
    )

    
    return response.output_text

print(classify_with_openai(generate_prompt(features,labels , test_features, model_feature_fewshot)))



8
8
