In [10]:
import json
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
from ctt import clean

In [11]:
fields = ['ID','A','B','C','Positive']

stopwords_to_keep = {'no', 'not'}
stopwords = clean.nltk_stopwords - stopwords_to_keep

In [12]:
# open the labeled data
with open('unparsedreviews.json', 'r') as f:
    data = json.loads(f.read())

In [13]:
data[0]

{'ID': 'ckla2581k00003f6gov4e79it',
 'DataRow ID': 'ckl04eecfkfz70rhgc2akcxmm',
 'Labeled Data': '{"compare":{"A":"<b>my dogs love them:<br /> </b>Two big dogs....a German Shepherd  and a rott/lab mix.......will chew on them<br />for hours without destroying them.  Some different from other products which are<br />gone in five minutes","B":"<b>Not just Bonez?:<br /> </b>I was at a local toys r us and bought 25 cents worth of bonez and out came the bonez but i found an orange skull with them!!! wow weird? i didn\'t eat it :P","C":"<b>tasty with so many dishes:<br /> </b>it is extreemely rare and difficult to find hot rotels where i live, i can find the original and mild but i like mine hot!. these were a decent price and now i will have some on hand for a while. if you like the spicy foods i recommend cooking with this.","A_id":119485,"B_id":313209,"C_id":195058}}',
 'Label': [{'triplet_label': 'false'}],
 'Created By': 'dmboyer@clemson.edu',
 'Project Name': 'Qualitative Analysis Tripl

In [14]:
with open('review_labels.csv','w') as f:
    csvwriter = csv.writer(f) 
    csvwriter.writerow(fields)
    for i in range(len(data)):
        # Get ID
        data_id = data[i]['ID']

        # Get Text Data
        data_dict = json.loads(data[i]['Labeled Data'])['compare']
        data_a = data_dict['A']
        data_b = data_dict['B']
        data_c = data_dict['C']

        # Get review id da
        data_aid = data_dict['A_id']
        # Get Label Data
        if data[i]['Label'] == 'Skip':
            data_label = 'Skip'
        elif data[i]['Label'][0] == {}:
            data_label = 'Null'
        elif i < 1124:
            data_label = data[i]['Label'][0]['triplet_label']
        else:
            data_label = data[i]['Label'][0]['sentiment']
            
        
        # Write Data to CSV
        csvwriter.writerow([data_id, data_a, data_b, data_c, data_label])
    
print("Data trasnferred!")

Data trasnferred!


In [15]:
df = pd.read_csv('review_labels.csv')
df.head()

Unnamed: 0,ID,A,B,C,Positive
0,ckla2581k00003f6gov4e79it,<b>my dogs love them:<br /> </b>Two big dogs.....,<b>Not just Bonez?:<br /> </b>I was at a local...,<b>tasty with so many dishes:<br /> </b>it is ...,False
1,ckla25jyq00013f6g3tyhf7pa,<b>My cats love it...:<br /> </b>Our cats love...,<b>Love Stash tea:<br /> </b>Bought this assor...,<b>Jeremiah's Coffee Beans:<br /> </b>These be...,True
2,ckla25s3j00023f6g0vmu9gk0,<b>Favorite Tea Ever:<br /> </b>Gypsy rose is ...,<b>Quality Food:<br /> </b>I have been feeding...,<b>COLLEGE STUDENT'S BEST FRIEND:<br /> </b>Th...,False
3,ckla260fw00033f6g4sj9dpto,<b>Very Convincing Meat Replacement:<br /> </b...,<b>Way too overpriced:<br /> </b>I want the ab...,<b>The Perfect Cat Food:<br /> </b>This is the...,False
4,ckla26ftv00043f6gt0x3jenz,"<b>Great Value. Bold taste.:<br /> </b>OK, he...","<b>OK,not the real thing.But...:<br /> </b>You...","<b>Bad taste, tough, not worth it:<br /> </b>I...",False


### Cleaning steps:
1. drop cases where Label='Skip' or NaN
2. combine cases where reviewers agree
3. drop cases where reviewers disagree
4. randomly assign train/val/test split column

In [16]:
# 1. drop cases where Label is not in ['true', 'B', 'C']
is_valid = df.Positive.apply(lambda x: True if x in ['true', 'B','C'] else False)
df = df[is_valid]
df.shape

(249, 5)

In [17]:
# replace all true with B
replace_true = df.Positive.apply(lambda x: True if x in ['true'] else False)
df.loc[replace_true, 'Positive'] = 'B'

In [18]:
df

Unnamed: 0,ID,A,B,C,Positive
1,ckla25jyq00013f6g3tyhf7pa,<b>My cats love it...:<br /> </b>Our cats love...,<b>Love Stash tea:<br /> </b>Bought this assor...,<b>Jeremiah's Coffee Beans:<br /> </b>These be...,B
5,ckla26x6z00053f6ge9xen1e0,<b>The Perfect Afternoon Pick-Me-Up:<br /> </b...,<b>good to find decafe one:<br /> </b>Glad to ...,<b>Excellent product Reasonably priced.Highly ...,B
10,ckla28t9q000a3f6grvtvndjy,<b>Overpowering spices:<br /> </b>I am a big f...,<b>These tastes pretty good:<br /> </b>These t...,<b>Smooth and rich:<br /> </b>Green Mountain C...,B
11,ckla28zrc000b3f6gm4efjaq8,"<b>Cookies, cookies please!:<br /> </b>I reall...",<b>Awesome:<br /> </b>I like almost everything...,<b>Great purchase price for a great item:<br /...,B
17,ckla2do3300003f6gogli1bdo,<b>Yum:<br /> </b>So I was going to the store ...,<b>Coconut Curry Chicken Unexpectedly Amazing!...,<b>Chai-ing to like it:<br /> </b>I am a huge ...,B
22,ckla2fio900053f6gvcj2o6cr,<b>Yummy:<br /> </b>This is my favorite tea. R...,<b>U Bet It's The Best:<br /> </b>This is the ...,<b>My cat is ill because of it:<br /> </b>My 2...,B
27,ckla2hv4l000a3f6gtrtv30xi,<b>Awful is too generous:<br /> </b>Have a co-...,<b>A little too much bitterness & acid:<br /> ...,<b>Like most others:<br /> </b>Clear scalp and...,B
28,ckla2i3by000b3f6g4d3w9xxu,<b>Big & Bold:<br /> </b>BAM - A rich and bold...,"<b>Tastes great? Really?:<br /> </b>OK, let's...",<b>2.5:<br /> </b>Not too thrilled. Was expect...,B
36,ckla2kwbv000j3f6gymzaf6fe,<b>Awesome Beverage:<br /> </b>I bought the Ke...,"<b>Great flavor, great price!:<br /> </b>I've ...","<b>Great Product, but overpriced on this site:...",B
45,ckla2nv9100003f6gpiajk8r7,<b>Destroyed:<br /> </b>My dogs destroyed this...,<b>Wonderful Breakfast Bar!!:<br /> </b>These ...,<b>No Stevia in this Product:<br /> </b>If I r...,B


In [20]:
# clean data
df.A = df.A.apply(lambda x: clean.kitchen_sink(x, stopwords))
df.B = df.B.apply(lambda x: clean.kitchen_sink(x, stopwords))
df.C = df.C.apply(lambda x: clean.kitchen_sink(x, stopwords))

# for i in range(len(df)):
#     print(f'Replacing: {i}')
#     df.loc[i,'A'] = clean.kitchen_sink(df.loc[i,'A'], stopwords)
#     df.loc[i,'B'] = clean.kitchen_sink(df.loc[i,'B'], stopwords)
#     df.loc[i,'C'] = clean.kitchen_sink(df.loc[i,'C'], stopwords)


In [21]:
df

Unnamed: 0,ID,A,B,C,Positive
1,ckla25jyq00013f6g3tyhf7pa,cats love cats love soft cat food not surprise...,love stash tea bought assortment mainly guests...,jeremiah coffee beans beans produce nice tasti...,B
5,ckla26x6z00053f6ge9xen1e0,perfect afternoon pickmeup two jobs requires l...,good find decafe one glad find birthday presen...,excellent product reasonably recommended splen...,B
10,ckla28t9q000a3f6grvtvndjy,overpowering spices big fan chocolate cinnamon...,tastes pretty good tastes good not fried not b...,smooth rich green mountain coffee roasters beg...,B
11,ckla28zrc000b3f6gm4efjaq8,cookies cookies please really love cookies ord...,awesome like almost everything product start p...,great purchase price great item item available...,B
17,ckla2do3300003f6gogli1bdo,yum going store morning saw larabar pocket ate...,coconut curry chicken unexpectedly amazing saw...,chaiing like huge fan tea never liked chai tea...,B
22,ckla2fio900053f6gvcj2o6cr,yummy favorite tea regular green tea tastes li...,u bet best best chacolatei order line every co...,cat ill year old cindy developed allergic reac...,B
27,ckla2hv4l000a3f6gtrtv30xi,awful generous coworker drinks decaf got terri...,little much bitterness acid less bold flavor a...,like others clear scalp hair therapy another p...,B
28,ckla2i3by000b3f6g4d3w9xxu,big bold bam rich bold roast order prefer blen...,tastes great really ok let start important asp...,not thrilled expecting bliss received mediocri...,B
36,ckla2kwbv000j3f6gymzaf6fe,awesome beverage bought keurig husband really ...,great flavor great price purchase natural calm...,great product overpriced site using product ye...,B
45,ckla2nv9100003f6gpiajk8r7,destroyed dogs destroyed ball less minutes che...,wonderful breakfast bar bars great not even pa...,no stevia product realized no stevia product w...,B


In [22]:
# 2. combine duplicates where reviewers agree
is_dupe = df.duplicated(subset=['A', 'B', 'C', 'Positive'], keep='first')
df = df[~is_dupe].sort_values('A')
df.shape

(221, 5)

In [23]:
# 3. drop cases where reviewers disagree
is_dupe = df.duplicated(subset=['A', 'B', 'C'], keep=False)
df=df[~is_dupe]
df.shape

(219, 5)

In [24]:
df.Positive.value_counts()

B    196
C     23
Name: Positive, dtype: int64

In [25]:
# 4. randomly assign train/val/test split column
f_tr, f_vl, f_ts = 0.85, 0.05, 0.1

In [26]:
df_train_val, df_test = train_test_split(df, test_size=f_ts, stratify=df.Positive)

In [27]:
df_train, df_val = train_test_split(df_train_val, test_size = f_vl/(1-f_ts), stratify=df_train_val.Positive)

In [28]:
df_train['split']='train'
df_val['split']='val'
df_test['split']='test'
df = pd.concat([df_train, df_val, df_test])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
df

Unnamed: 0,ID,A,B,C,Positive,split
77,cklcopwjl000e3e5se9zfjdfx,fantastic tea buying tea years time favorite s...,really good sugar free hard candy highly recom...,good look good taste good hit highly recommend...,B,train
451,cklxvuo34000t3e5sb37zftvg,love em gummies perfect flavor opinion not swe...,nicer fragrance relaxing pleasing lavender fra...,expensive product tastes cheap cadylike way co...,B,train
939,ckmnv9ned000z3g6h2ih3fj96,multiple ordersgreat value ordered manitoba he...,heartland granola cereal order groceries onlin...,disappointed hoping reduce cost making cherry ...,B,train
1138,ckqvox4vb0003246g9o07cfti,cheaper walmart kids love thought would look a...,low acid lacks flavor normally got word gourme...,smooth refreshing love water water tastes grea...,B,train
537,cklzdh8vh00063e5s242rxn1h,cat crack bobbaaayyy cat loves stuff ignores r...,best dog food ever switched wellness puppy wel...,flavor not expected labeled green tea touch po...,B,train
899,ckmnujifq00093g6hx6ip5ddu,warning one chocolate creamer made mistake not...,product different box product came plain brown...,like vaseline coconut oil feels looks like vas...,B,train
97,cklcp790p000d3e5s8n6352i2,natural dogs love using dog treats long time d...,addicting tea addicted tea drink unsweetened t...,no flavor straight vinegar chemicals honey dij...,B,train
678,ckm6rziaw00153e5s6ia4082r,joke not matter tasty potato chips person list...,taste terrible way better navitas tasted prett...,fair two medium size dogs love treats thicknes...,B,train
733,ckmaz1ju6000g3e5ssf363b5i,mother loves buy coffee mother keurig likes bo...,vet amazed originally bought product michigan ...,merrick miss porky chews terrific buy little d...,B,train
531,cklzd9f2f00003e5s74ncsbel,great product best dark chocolate ever eat bad...,fantastic purchased past websites love excited...,rip star amazon stars product first false adve...,B,train


In [31]:
df.split.value_counts()/ len(df)

train    0.849315
test     0.100457
val      0.050228
Name: split, dtype: float64

In [32]:
df.to_csv('triplet_data.csv', index = False)