In [None]:
import pandas as pd
from transformers import GPT2TokenizerFast

import openai
# Load the OpenAI API key from a local text file.
# The context manager closes the file handle, and strip() removes the trailing
# newline most editors append, which would otherwise become part of the key.
with open('./openai_aip_key.txt') as key_file:
    openai.api_key = key_file.read().strip()

# Prepare training data

In [None]:
# Lookup table; it is joined to the reviews on its 'Code' column further down.
df_mapper = pd.read_csv(filepath_or_buffer='category.csv')

In [None]:
# Load the raw reviews and reduce them to (text, subcategory) pairs:
#   - keep only the comment and its attribute column, under shorter names
#   - drop rows with a missing value in either column
#   - drop rows whose subcategory contains a comma (presumably several codes
#     in one cell - confirm against the data)
#   - cast the remaining subcategory codes to integers
df = (
    pd.read_csv('wolt_reviews.csv')
    .loc[:, ['Review comment', 'Comment attribut']]
    .rename(columns={'Review comment': 'text', 'Comment attribut': 'subcategory'})
    .dropna()
    .loc[lambda frame: ~frame['subcategory'].str.contains(',')]
    .astype({'subcategory': 'int'})
)

In [None]:
# Attach the mapper's columns to every review through its subcategory code.
# The left join keeps reviews whose code has no entry in the mapper.
df = df.merge(df_mapper, how='left', left_on='subcategory', right_on='Code')

# Lower-case every column name, then discard the join key brought in from the
# mapper (for matched rows it mirrors 'subcategory').
df = df.rename(columns=str.lower).drop(columns='code')

In [None]:
# Preview the first rows of the merged review table.
df.head()

In [None]:
# (rows, columns) of the merged review table.
df.shape

In [None]:
# Reproducible 80/20 split: the training rows are drawn at random without
# replacement (fixed seed); every row that was not drawn forms the test set.
train_df = df.sample(frac=0.8, random_state=123, replace=False)
test_df = df.loc[~df.index.isin(train_df.index)]

# Choose the classification level (label column) here

In [None]:
# Target column for the classifier; enable exactly one of the options below.
# label = 'sentiment'
# label = 'category'
# label = 'category_2'
label = 'curb_attribute'

# Distinct values of the target column seen in the training split.
labels = [*train_df[label].unique()]

# Write the training examples as JSON Lines, one record per row with the
# fields 'text' and 'label'.
fd = f'train_1_label_{label}.jl'
(
    train_df[['text', label]]
    .rename(columns={label: 'label'})
    .to_json(fd, orient='records', lines=True)
)

# Upload the training data to the endpoint and classify the test set

In [None]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Normalise the label spelling: trimmed, first letter upper-case, rest lower-case.
labels = [raw.strip().lower().capitalize() for raw in labels]

# Token ids of every label, encoded with a leading space.
labels_tokens = {name: tokenizer.encode(f" {name}") for name in labels}
print(labels_tokens)

In [None]:
ls -ltrh  # IPython automagic: long listing of the working directory, most recently modified last

In [None]:
# Upload the training file written earlier (path in `fd`).
# The `with` block closes the local file handle once the upload call returns
# instead of leaving it open for the lifetime of the kernel.
with open(fd) as training_file:
    upload = openai.File.create(file=training_file, purpose="classifications")

# Keep the id of the uploaded file; the classification requests pass it as `file`.
file_id = upload.id
print(file_id)

In [None]:
# Log of uploaded file ids, keyed by label level (presumably copied from
# earlier uploads; nothing in this notebook reads the mapping).
# NOTE(review): the key 'curb_category' matches none of the label options used
# elsewhere in this notebook ('category_2', 'curb_attribute') - confirm.
file_ids = dict(
    sentiment='file-3z9sWq4ZTjifhEH2xmIKmv1g',
    category='file-NJuLWvHLfxOTJyd1LJoeWzWQ',
    curb_category='file-phtuGFsjXQBxR15PfxIlo9sT',
)

In [None]:
# Example query for a one-off manual check:
# query = 'I would have preferred a little less dressing'

# Classify every review of the test split and collect, per review text, the raw
# API response together with the true label.
# NOTE: entries are keyed by the review text, so test rows with identical text
# end up in a single entry (the last one wins).
result_dict = {}
for query, ground_true in zip(test_df['text'], test_df[label]):
    try:
        result = openai.Classification.create(
            file=file_id,
            query=query,
            search_model="ada",
            model="curie",
            max_examples=200,
            labels=labels,
            # one more log-probability slot than there are labels
            logprobs=len(labels) + 1,
            expand=["completion"],
        )
    except Exception as err:
        # Best effort: a failing request only skips this review. Catching
        # `Exception` instead of using a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate, so the loop can still be stopped, and the
        # reason for the skip is reported rather than hidden.
        print(f'Skip: {query} ({err!r})')
    else:
        result_dict[query] = [result, ground_true]

In [None]:
# Flatten the collected responses into one row per review text: the dict key
# becomes 'text', list item 0 'pred_dict' (raw response), item 1 'ground_true'.
result_df = (
    pd.DataFrame.from_dict(result_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'text', 0: 'pred_dict', 1: 'ground_true'})
)

# The predicted label is the 'label' field of each stored response.
result_df['pred'] = [response['label'] for response in result_df['pred_dict']]

# Lower-case both sides so labels are compared case-insensitively.
result_df['ground_true'] = result_df['ground_true'].str.lower()
result_df['pred'] = result_df['pred'].str.lower()

# Persist the predictions; the evaluation section reloads this file.
result_df.to_csv(f'results_{label}.csv')

labels = [name.lower() for name in labels]

# Preview last: a notebook cell renders only its final expression, so the
# preview was silently discarded when it sat in the middle of the cell.
result_df.head()

# Evaluate the predictions

In [None]:
# Label level to evaluate; must name an existing results_<label>.csv file.
# label = 'sentiment'
# label = 'category'
# label = 'category_2'
label = 'curb_attribute'

# Reload the stored predictions and lower-case both label columns so they are
# compared case-insensitively.
result_df = pd.read_csv(f'./results_{label}.csv').assign(
    ground_true=lambda frame: frame['ground_true'].str.lower(),
    pred=lambda frame: frame['pred'].str.lower(),
)

In [None]:
# Number of reviews (non-null texts) per true label, most frequent first.
(
    result_df
    .groupby('ground_true', sort=True)['text']
    .count()
    .rename('counts')
    .reset_index()
    .sort_values('counts', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Number of reviews (non-null texts) per predicted label, most frequent first.
(
    result_df
    .groupby('pred', sort=True)['text']
    .count()
    .rename('counts')
    .reset_index()
    .sort_values('counts', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Reload the mapper and reduce it to an attribute -> category lookup for the
# evaluation merges on 'Curb_Attribute'.
df_mapper = pd.read_csv('category.csv')

# Lower-case the attribute so it matches the lower-cased labels in result_df.
df_mapper['Curb_Attribute'] = df_mapper['Curb_Attribute'].str.lower()

# The file is keyed by 'Code' elsewhere in this notebook, so several rows may
# carry the same attribute (presumably - confirm against the data). Without
# de-duplication every repeated attribute would multiply the matching result
# rows in those merges and distort the metrics computed afterwards.
df_mapper = (
    df_mapper[['Curb_Attribute', 'Category']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Preview the attribute -> category lookup.
df_mapper.head()

In [None]:
# Look up the category twice: first for the true label, then for the predicted
# label. Each pass drops the join key again; because both passes contribute a
# 'Category' column, the second merge suffixes them as 'Category_x' (true
# label) and 'Category_y' (predicted label).
result_df = (
    result_df
    .merge(df_mapper, how='left', left_on='ground_true', right_on='Curb_Attribute')
    .drop(columns='Curb_Attribute')
    .merge(df_mapper, how='left', left_on='pred', right_on='Curb_Attribute')
    .drop(columns='Curb_Attribute')
)

In [None]:
# Discard every row that has a missing value in any column.
result_df = result_df.dropna(axis=0, how='any')

In [None]:
# Snapshot of the cleaned evaluation table.
result_df.to_csv(path_or_buf='tmp.csv')

In [None]:
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve, classification_report

# Category-level report: 'Category_x' is the column taken as ground truth,
# 'Category_y' the one taken as prediction.
report = classification_report(
    y_true=result_df['Category_x'],
    y_pred=result_df['Category_y'],
)
print(report)

In [None]:
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve, classification_report

# Report at the level of the classified label itself.
report = classification_report(
    y_true=result_df['ground_true'],
    y_pred=result_df['pred'],
)
print(report)

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    top_k_accuracy_score,
    balanced_accuracy_score,
)

# Plain and balanced accuracy of the predicted labels against the true labels.
acc = accuracy_score(y_true=result_df['ground_true'], y_pred=result_df['pred'])
balanced_acc = balanced_accuracy_score(
    y_true=result_df['ground_true'],
    y_pred=result_df['pred'],
)
# Disabled. NOTE(review): top_k_accuracy_score appears to expect per-class
# scores rather than hard predictions - confirm before re-enabling.
# top_3_acc = top_k_accuracy_score(result_df.ground_true, result_df.pred, k=3)


In [None]:
import matplotlib.pyplot as plt

# Every label that occurs as ground truth or as prediction. The union is sorted
# because set iteration order for strings changes between kernel sessions
# (hash randomisation), which would shuffle the matrix axes from run to run.
labels = sorted(set(result_df.ground_true.unique()) | set(result_df.pred.unique()))

# normalize='all' is passed through to sklearn; the same label list fixes the
# row/column order of the matrix and the tick labels of the plot.
cm = confusion_matrix(result_df.ground_true, result_df.pred, labels=labels, normalize='all')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot()

In [None]:
import numpy as np
from collections import defaultdict

# Estimate label probabilities from the first completion token only, so the
# labels need distinct first tokens. Tokens are compared case-insensitively.
# `result` is whatever response the classification loop stored last.
first_token_to_label = {
    tokenizer.decode([token_ids[0]]).strip().lower(): name
    for name, token_ids in labels_tokens.items()
}

# Log-probabilities reported for the first position of the first choice.
top_logprobs = result["completion"]["choices"][0]["logprobs"]["top_logprobs"][0]

# Sum the probability of tokens that only differ in case or surrounding whitespace.
token_probs = defaultdict(float)
for token, logp in top_logprobs.items():
    normalised = token.strip().lower()
    token_probs[normalised] = token_probs[normalised] + np.exp(logp)

# Keep only the tokens that start a known label.
label_probs = {
    first_token_to_label[token]: prob
    for token, prob in token_probs.items()
    if token in first_token_to_label
}

# Whatever probability mass is left goes to the special "Unknown" label.
known_mass = sum(label_probs.values())
if known_mass < 1.0:
    label_probs["Unknown"] = 1.0 - known_mass

print(label_probs)