In [1]:
import os
# Move the working directory up one level; presumably so the relative
# 'data/...' paths used further down resolve from the parent folder.
# NOTE: the move is relative to the current working directory, so re-running
# this cell climbs one more level each time.
os.chdir(os.pardir)
import sys
import pandas as pd
import numpy as np

In [42]:
# Notebook display settings: show up to 80 rows and never truncate cell text,
# so full review sentences stay readable in the rendered tables.
display_options = {
    'display.max_rows': 80,
    'display.max_colwidth': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Load data

In [54]:
# Path (relative to the current working directory) of the CSV that holds the
# review texts together with the per-model ratings.
labeled_data_file = 'data/processed/labeled_output.csv'

In [55]:
# Load the labeled data; later cells read its `text`, `rating_claude` and
# `rating_gpt4` columns.
df = pd.read_csv(labeled_data_file)

In [56]:
# The free models weren't adequate because of their API call limits, so only
# the ratings of these two models are compared below.
model_list = ['claude', 'gpt4']

In [57]:
# Pair up the per-model ratings of every row as a tuple (one entry per model in
# `model_list`) and show the most frequent rating combinations.
rating_columns = [f'rating_{m}' for m in model_list]
df['label_combo'] = df[rating_columns].agg(tuple, axis=1)
label_combo_counts = df['label_combo'].value_counts()
label_combo_counts.head(10)

label_combo
(1.0, 1.0)      425
(-1.0, -1.0)    138
(0.0, 0.0)       43
(0.0, -1.0)      19
(0.0, 1.0)       13
(1.0, 0.0)        8
(-1.0, 0.0)       3
(1.0, -1.0)       2
Name: count, dtype: int64

In [58]:
# Number of distinct ratings per row across the models in `model_list`:
# 1 means every model gave the same rating, more than 1 means they disagree.
# `dropna=False` counts a missing rating as its own value, so a row where only
# one model returned a rating is flagged as a disagreement instead of being
# silently treated as agreement (the default drops NaN before counting).
rating_columns = [f'rating_{m}' for m in model_list]
df["agreement_score"] = df[rating_columns].nunique(axis=1, dropna=False)

In [59]:
# How many rows fall under each agreement score (number of distinct ratings
# the models gave for the row).
df["agreement_score"].value_counts()

agreement_score
1    606
2     45
Name: count, dtype: int64

In [60]:
# Seed the final label with Claude's rating wherever the models agree
# (agreement_score == 1); every other row stays NaN until it is labeled by hand.
models_agree = df['agreement_score'] == 1
df['target'] = df['rating_claude'].where(models_agree)

# Manually check consistency in the cases with no agreement

In [61]:
# Row labels of every review for which the models did not all give the same
# rating. `> 1` (rather than `== 2`) selects the same rows for two models and
# stays correct if more models are added to `model_list`, where a row can have
# three or more distinct ratings.
unagreement_index = df[df['agreement_score'] > 1].index

In [62]:
# Cut the disagreement rows into consecutive chunks of at most `batch_size`
# rows so they can be reviewed a screenful at a time; the last chunk may be
# shorter.
batch_size = 10
batches = []
for start in range(0, len(unagreement_index), batch_size):
    batches.append(unagreement_index[start:start + batch_size])

In [63]:
# Show batch 0 of the disagreements: the sentence next to both model ratings,
# for manual review.
review_columns = ["text", "rating_claude", "rating_gpt4"]
df.loc[batches[0], review_columns]

Unnamed: 0,text,rating_claude,rating_gpt4
25,the only thing i moderately enjoyed was their grilled chicken special with edamame puree .,0.0,-1.0
39,the last time i walked by it looked pretty empty . hmmm .,0.0,-1.0
49,"in the evening , this place attracted a well dressed , with it , ny crowd .",0.0,1.0
61,if you ' ve ever been along the river in weehawken you have an idea of the top of view the chart house has to offer .,0.0,1.0
72,"yes , they use fancy ingredients , but even fancy ingredients do n ' t make for good pizza unless someone knows how to get the crust right .",0.0,-1.0
115,"there was a small wait , but shorter than i expected .",0.0,1.0
116,located at the end of a magnificent block .,1.0,0.0
126,"but when we looked at the menu , there were n ' t a lot of choices , most of them were dumplings in the appetizer section .",0.0,-1.0
136,"admittedly some nights inside the restaurant were rather warm , but the open kitchen is part of the charm .",0.0,1.0
190,the staff is no nonsense .,1.0,0.0


In [64]:
# Hand-assigned labels for batch 0, one per row in the order the batch is
# displayed (rows 25, 39, 49, 61, 72, 115, 116, 126, 136, 190).
target = [-1, 0, 0, 0, -1, 0, 1, -1, 0, 1]
# Positional assignment: the i-th label goes to the i-th row of the batch.
df.loc[batches[0], 'target'] = target

In [65]:
# Show batch 1 of the disagreements: the sentence next to both model ratings,
# for manual review.
review_columns = ["text", "rating_claude", "rating_gpt4"]
df.loc[batches[1], review_columns]

Unnamed: 0,text,rating_claude,rating_gpt4
204,"food is usually very good , though ocasionally i wondered about freshmess of raw vegatables in side orders .",0.0,1.0
226,the bagel was huge .,1.0,0.0
238,okay - i do n ' t mind the oily part ( cause most are cooked that way ) but it was way too bland .,0.0,-1.0
253,"my husbands was perfect , my was well done and dry .",-1.0,0.0
267,the atmosphere is noisy and the waiters are literally walking around doing things as fast as they can .,0.0,-1.0
281,"sometimes i get bad food and bad service , sometimes i get good good and bad service .",0.0,-1.0
283,the place is a bistro which means : simple dishes and wine served efficiently in a bustling atmosphere .,0.0,1.0
292,i started out with a bombay beer which was big enough for two .,1.0,0.0
330,"cute place , nice wait staff but would never go there again .",0.0,-1.0
357,it ' s very spicy but not offensive .,0.0,1.0


In [66]:
# Hand-assigned labels for batch 1, one per row in the order the batch is
# displayed (rows 204, 226, 238, 253, 267, 281, 283, 292, 330, 357).
target = [0, 0, -1, 0, -1, -1, 1, 1, -1, 0]
# Positional assignment: the i-th label goes to the i-th row of the batch.
df.loc[batches[1], 'target'] = target

In [67]:
# Show batch 2 of the disagreements: the sentence next to both model ratings,
# for manual review.
review_columns = ["text", "rating_claude", "rating_gpt4"]
df.loc[batches[2], review_columns]

Unnamed: 0,text,rating_claude,rating_gpt4
366,"with the exception of our lemon salad that had so much pepper on it that our eyes started watering , the food here was decent , not great .",-1.0,0.0
376,if you want something really different than try jekyll and hyde .,1.0,0.0
387,food was good not great not worth the wait or another visit,0.0,-1.0
410,"the atmosphere is nothing special , but it feels like a sushi establishment in tokyo .",0.0,1.0
416,we were n ' t !,0.0,-1.0
417,the prices were cheap compared to the quality of service and food .,1.0,-1.0
419,the location and ambience is ok but the food is what makes up for it .,0.0,1.0
421,try green curry with vegetables .,1.0,0.0
423,"the service is ok , some of the people did n ' t get what they asked for .",0.0,-1.0
428,a little noise but i think that was because of our party !,0.0,1.0


In [68]:
# Hand-assigned labels for batch 2, one per row in the order the batch is
# displayed (rows 366, 376, 387, 410, 416, 417, 419, 421, 423, 428).
target = [0, 1, -1, 0, 0, 1, 1, 0, -1, 1]
# Positional assignment: the i-th label goes to the i-th row of the batch.
df.loc[batches[2], 'target'] = target

In [69]:
# Show batch 3 of the disagreements: the sentence next to both model ratings,
# for manual review.
review_columns = ["text", "rating_claude", "rating_gpt4"]
df.loc[batches[3], review_columns]

Unnamed: 0,text,rating_claude,rating_gpt4
450,"this place is pricey , and yes , the food is worth it ; but the service makes you feel like you should be paying a quater of the price .",0.0,-1.0
452,amma is nothing special .,-1.0,0.0
456,this place is always packed .,0.0,1.0
470,not sure why this restaurant would be rated that highly .,0.0,-1.0
473,"the only things u could really taste are the very salty soy sauce ( even its low sodium ) , the vinegar - soaked rice , and the scallion on top of the fish .",0.0,-1.0
477,they wouldnt even let me finish my glass of wine before offering another .,1.0,-1.0
495,it ' s a rather cramped and busy restaurant and it closes early .,0.0,-1.0
506,the only fallback on this restaurant is the prices .,0.0,1.0
537,it is thick and slightly soggy .,0.0,-1.0
566,"my friends settled for rice dishes , but we came back the following day to try the dim sum , which was good . . . not outstanding , but good .",0.0,1.0


In [70]:
# Hand-assigned labels for batch 3, one per row in the order the batch is
# displayed (rows 450, 452, 456, 470, 473, 477, 495, 506, 537, 566).
target = [0, -1, 0, -1, 0, 0, -1, 0, -1, 0]
# Positional assignment: the i-th label goes to the i-th row of the batch.
df.loc[batches[3], 'target'] = target

In [71]:
# Show batch 4 (the final, shorter batch) of the disagreements: the sentence
# next to both model ratings, for manual review.
review_columns = ["text", "rating_claude", "rating_gpt4"]
df.loc[batches[4], review_columns]

Unnamed: 0,text,rating_claude,rating_gpt4
579,"but , they were too big for the bun .",0.0,-1.0
583,it was $ 14 not really bad for a pound of pastrami - but it did n ' t have much taste - i ' ve had better for less elsewhere !,0.0,-1.0
608,"average to good thai food , but terrible delivery .",0.0,-1.0
626,go here .,1.0,0.0
632,the menu has so many fish items and oysters .,1.0,0.0


In [72]:
# Hand-assigned labels for batch 4, one per row in the order the batch is
# displayed (rows 579, 583, 608, 626, 632).
target = [0, -1, -1, 1, 0]
# Positional assignment: the i-th label goes to the i-th row of the batch.
df.loc[batches[4], 'target'] = target

# Check neutral labels

In [73]:
# Label distribution once the agreed ratings and the hand-assigned labels have
# both been written to `target`.
df['target'].value_counts()

target
 1.0    434
-1.0    153
 0.0     64
Name: count, dtype: int64

In [74]:
# Inspect the sentences whose final label is 0 (the neutral class).
is_neutral = df['target'].eq(0)
df.loc[is_neutral, ['text', 'target']]

Unnamed: 0,text,target
14,"food was okay , nothing great .",0.0
34,the food is decent .,0.0
39,the last time i walked by it looked pretty empty . hmmm .,0.0
49,"in the evening , this place attracted a well dressed , with it , ny crowd .",0.0
58,"i like the somosas , chai , and the chole , but the dhosas and dhal were kinda disappointing .",0.0
59,"the service varys from day to day - sometimes they ' re very nice , and sometimes not .",0.0
61,if you ' ve ever been along the river in weehawken you have an idea of the top of view the chart house has to offer .,0.0
66,"the food was average to above - average ; the french onion soup filling yet not overly impressive , and the desserts not brilliant in any way .",0.0
93,this place is incredibly tiny .,0.0
106,"a large is $ 20 , and toppings are about $ 3 each .",0.0


# Processing

In [75]:
# Keep only the text and label of rows with a definite non-neutral label.
# `!= 0` on its own would also keep rows whose target is still NaN (NaN != 0
# is True), so unlabeled rows are excluded explicitly.
# `.copy()` makes df_final an independent frame, so assigning to its columns
# later does not raise SettingWithCopyWarning.
has_polar_label = df['target'].notna() & (df['target'] != 0)
df_final = df.loc[has_polar_label, ['text', 'target']].copy()

In [76]:
# Re-encode the labels as binary: -1 becomes 0, 1 stays 1.
# `assign` builds a new frame instead of writing into one that was derived from
# a `.loc` selection, which avoids SettingWithCopyWarning; the resulting values
# and column order are unchanged.
df_final = df_final.assign(target=df_final['target'].replace(-1, 0))

In [78]:
# Class balance of the binary label after re-encoding (-1 -> 0).
df_final['target'].value_counts()

target
1.0    434
0.0    153
Name: count, dtype: int64

In [79]:
# Write the final text/target pairs to disk; index=False leaves the row index
# out of the file.
df_final.to_csv('data/processed/final_label.csv', index=False)