In [42]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import bz2
from transformers import T5ForConditionalGeneration, T5Tokenizer, BartForConditionalGeneration, BartTokenizer, pipeline
import torch
import copy
import re
import string

In [9]:
train_file = bz2.BZ2File('/kaggle/input/amazonreviews/train.ft.txt.bz2')
test_file = bz2.BZ2File('/kaggle/input/amazonreviews/test.ft.txt.bz2')

In [10]:
train_file_lines = train_file.readlines()
test_file_lines = test_file.readlines()

In [11]:
del train_file
del test_file

In [12]:
test_file_lines[:20]

[b'__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"\n',
 b"__label__2 One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not to

In [13]:
train_file_lines = list(map(lambda s: s.decode('utf-8'), train_file_lines))
test_file_lines = list(map(lambda s: s.decode('utf-8'), test_file_lines))

In [15]:
print(test_file_lines[0])

__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"



In [16]:
def row_to_dict(s):
    label = s[:10].strip()
    review = s[11:].strip().split(':')
    title = review[0].strip()
    description = ""
    # In case there is a `:` in the description
    for t in review[1:]:
        description += t
    description = description.strip()
    return {
        "title": title,
        "description": description,
        "label": 1 if '1' in label else 2
    }
row_to_dict(test_file_lines[0])

{'title': 'Great CD',
 'description': 'My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"',
 'label': 2}

In [17]:
train_file_lines = list(map(lambda s: row_to_dict(s), train_file_lines))
test_file_lines = list(map(lambda s: row_to_dict(s), test_file_lines))

In [18]:
test_file_lines[:10]

[{'title': 'Great CD',
  'description': 'My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"',
  'label': 2},
 {'title': "One of the best game music soundtracks - for a game I didn't really play",
  'description': "Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beauti

In [19]:
train_df = pd.DataFrame(train_file_lines)
test_df = pd.DataFrame(test_file_lines)

In [20]:
test_df.head()

Unnamed: 0,title,description,label
0,Great CD,My lovely Pat has one of the GREAT voices of h...,2
1,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...,2
2,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...,1
3,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...,2
4,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...,2


In [21]:
train_df.head()

Unnamed: 0,title,description,label
0,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...,2
1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,2
2,Amazing!,This soundtrack is my favorite music of all ti...,2
3,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,2
4,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...",2


In [22]:
# To not forget and change them later
amazonreviews1_train = train_df
amazonreviews1_test = test_df

In [2]:
amazonreviews2 = pd.read_csv('https://query.data.world/s/4rrp2d74nxz7ptyyhp6aghgxemdeoa?dws=00000')
amazonreviews2.head()

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.didPurchase,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,3,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht..."
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,4,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht..."
2,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,BySharon Lambert,"https://www.barcodable.com/upc/841710106442,ht..."
3,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,Bymark sexson,"https://www.barcodable.com/upc/841710106442,ht..."
4,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,Bylinda,"https://www.barcodable.com/upc/841710106442,ht..."


In [3]:
amazonreviews2['label'] = amazonreviews2['reviews.rating'] // 4 + 1

In [4]:
amazonreviews2.head()

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs,label
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,3,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht...",1
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,4,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht...",2
2,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,BySharon Lambert,"https://www.barcodable.com/upc/841710106442,ht...",2
3,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,Bymark sexson,"https://www.barcodable.com/upc/841710106442,ht...",2
4,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,Bylinda,"https://www.barcodable.com/upc/841710106442,ht...",2


In [19]:
amazonshoes = pd.read_csv('https://query.data.world/s/4bsutouetck5cnyiph3fylefkws666?dws=00000')
amazonshoes.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at
0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these. Was looking for converses and thes...,5.0,True,Reviewed in the United States on 2 June 2020,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/2021 02:26:25
1,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Kenia Rivera,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2.0,True,Reviewed in the United States on 28 October 2021,,f4778bb8-3070-5cb1-b5aa-ffce41a97b57,24/12/2021 02:26:25
2,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Chris Souza,Good quality,Good quality,5.0,True,Reviewed in the United States on 20 January 2021,,db5a7525-d40b-5265-84d8-df4f29837a3b,24/12/2021 02:26:25
3,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Amazon Customer,Good,Great,5.0,True,Reviewed in the United States on 22 April 2021,,75a42851-6462-54b5-988a-27d336221943,24/12/2021 02:26:25
4,https://www.amazon.co.uk/dp/B08SW434MG,"GUESS Women's Bradly Gymnastics Shoe, White, 7 UK",Graziella,PERFETTE!!,Ho scelto il modello bianco con rifinitura die...,5.0,True,Reviewed in Italy on 2 April 2021,2 people found this helpful,232dee43-849e-5d06-ba05-efb3f4814714,24/12/2021 02:26:25


In [20]:
amazonshoes['label'] = amazonshoes['review_rating'] // 4 + 1
amazonshoes['label'] = amazonshoes['label'].astype(int)

In [21]:
amazonshoes.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,label
0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these. Was looking for converses and thes...,5.0,True,Reviewed in the United States on 2 June 2020,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/2021 02:26:25,2
1,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Kenia Rivera,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2.0,True,Reviewed in the United States on 28 October 2021,,f4778bb8-3070-5cb1-b5aa-ffce41a97b57,24/12/2021 02:26:25,1
2,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Chris Souza,Good quality,Good quality,5.0,True,Reviewed in the United States on 20 January 2021,,db5a7525-d40b-5265-84d8-df4f29837a3b,24/12/2021 02:26:25,2
3,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Amazon Customer,Good,Great,5.0,True,Reviewed in the United States on 22 April 2021,,75a42851-6462-54b5-988a-27d336221943,24/12/2021 02:26:25,2
4,https://www.amazon.co.uk/dp/B08SW434MG,"GUESS Women's Bradly Gymnastics Shoe, White, 7 UK",Graziella,PERFETTE!!,Ho scelto il modello bianco con rifinitura die...,5.0,True,Reviewed in Italy on 2 April 2021,2 people found this helpful,232dee43-849e-5d06-ba05-efb3f4814714,24/12/2021 02:26:25,2


In [5]:
#paraphrased

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bart_model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
bart_tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')

def paraphrase(input_sentence):
    model = bart_model.to(device)
    tokenizer = bart_tokenizer
    batch = tokenizer(input_sentence, return_tensors='pt', truncation=True, padding='max_length', max_length=512).to(device)
    generated_ids = model.generate(batch['input_ids'], max_new_tokens=1024)
    generated_sentence = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_sentence

print(paraphrase(
    "I love the style of this, but after a couple years, the DVD is giving me problems. It doesn't even work anymore and I use my broken PS2 Now. I wouldn't recommend this, I'm just going to upgrade to a recorder now. I wish it would work but I guess i'm giving up on JVC. I really did like this one... before it stopped working. The dvd player gave me problems probably after a year of having it."))

config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/332 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



["I love the style of this, but after a couple years the DVD is giving me problems. It doesn't even work anymore and I'm just going to upgrade to a recorder now. I wish it would work but I guess i'm giving up on JVC."]


In [6]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
def summarize(s):
    if len(s.strip()) == 0: return ""
    l = len(s.split())
    if l <= 2: return s
    return summarizer(s, max_length=min(512, l), min_length=1,do_sample=False)[0]['summary_text']
print(summarize("Love these. Was looking for converses and these were half the price and so unique— I’ve never seen clear shoes like these; they fit great. The plastic takes a little getting used to but the style is so worth it."))

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The plastic takes a little getting used to but the style is so worth it. I’ve never seen clear shoes like these; they fit great.


In [24]:
amazonshoes.drop_duplicates(subset=["review_text"], inplace=True)
amazonshoes['label'].value_counts()

label
2    4949
1    1589
Name: count, dtype: int64

In [27]:
balanced_amazon_shoes = []
for i, row in amazonshoes.iterrows():
    balanced_amazon_shoes.append(row)
    if row['label'] == 1:
        res = paraphrase(str(row["review_text"]))
        for t in res:
            r = copy.deepcopy(row)
            r["review_text"] = t
            balanced_amazon_shoes.append(r)


In [None]:
for i, row in amazonshoes.iterrows():
    if row['label'] == 1:
        res = summarize(str(row["review_text"]))
        r = copy.deepcopy(row)
        r["review_text"] = res
        balanced_amazon_shoes.append(r)


In [38]:
balanced_amazon_shoes[:10]

[url                             https://www.amazon.co.uk/dp/B07SBX32T5
 product_name         Klasified Women's Transparent Clear Sneaker Sh...
 reviewer_name                                         Jocelyn McSayles
 review_title                                                   Love em
 review_text          Love these. Was looking for converses and thes...
 review_rating                                                      5.0
 verified_purchase                                                 True
 review_date               Reviewed in the United States on 2 June 2020
 helpful_count                              2 people found this helpful
 uniq_id                           36eae4e5-2894-5279-a0b7-d2b330e2b814
 scraped_at                                         24/12/2021 02:26:25
 label                                                                2
 Name: 0, dtype: object,
 url                             https://www.amazon.co.uk/dp/B07SBX32T5
 product_name         Klasified Women's

In [29]:
balanced_amazon_shoes_df = pd.DataFrame(balanced_amazon_shoes)
balanced_amazon_shoes_df.drop_duplicates(subset=["review_text"], inplace=True)
balanced_amazon_shoes_df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,label
0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these. Was looking for converses and thes...,5.0,True,Reviewed in the United States on 2 June 2020,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/2021 02:26:25,2
1,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Kenia Rivera,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2.0,True,Reviewed in the United States on 28 October 2021,,f4778bb8-3070-5cb1-b5aa-ffce41a97b57,24/12/2021 02:26:25,1
1,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Kenia Rivera,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2.0,True,Reviewed in the United States on 28 October 2021,,f4778bb8-3070-5cb1-b5aa-ffce41a97b57,24/12/2021 02:26:25,1
2,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Chris Souza,Good quality,Good quality,5.0,True,Reviewed in the United States on 20 January 2021,,db5a7525-d40b-5265-84d8-df4f29837a3b,24/12/2021 02:26:25,2
3,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Amazon Customer,Good,Great,5.0,True,Reviewed in the United States on 22 April 2021,,75a42851-6462-54b5-988a-27d336221943,24/12/2021 02:26:25,2


In [30]:
balanced_amazon_shoes_df['label'].value_counts()

label
2    4949
1    3297
Name: count, dtype: int64

In [32]:
balanced_amazon_shoes_df.to_csv("/kaggle/working/amazon-shoes-review.csv")

In [34]:
amazonreviews1_train['label'].value_counts()

label
2    1800000
1    1800000
Name: count, dtype: int64

In [7]:
amazonreviews2['label'].value_counts()

label
2    25545
1     2787
Name: count, dtype: int64

In [8]:
amazonreviews2.head()

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs,label
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,3,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht...",1
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,4,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht...",2
2,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,BySharon Lambert,"https://www.barcodable.com/upc/841710106442,ht...",2
3,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,Bymark sexson,"https://www.barcodable.com/upc/841710106442,ht...",2
4,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,Bylinda,"https://www.barcodable.com/upc/841710106442,ht...",2


In [9]:
balanced_amazon_2 = []
for i, row in amazonreviews2.iterrows():
    balanced_amazon_2.append(row)
    if row['label'] == 1:
        res = paraphrase(str(row["reviews.text"]))
        for t in res:
            r = copy.deepcopy(row)
            r["reviews.text"] = t
            balanced_amazon_2.append(r)


In [10]:
for i, row in amazonreviews2.iterrows():
    balanced_amazon_2.append(row)
    if row['label'] == 1:
        res = summarize(str(row["reviews.text"]))
        for t in res:
            r = copy.deepcopy(row)
            r["reviews.text"] = t
            balanced_amazon_2.append(r)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [12]:
balanced_amazon_2_df = pd.DataFrame(balanced_amazon_2)
balanced_amazon_2_df.drop_duplicates(subset=["reviews.text"], inplace=True)
balanced_amazon_2_df.head()

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs,label
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,3,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht...",1
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,4,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht...",2
2,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,BySharon Lambert,"https://www.barcodable.com/upc/841710106442,ht...",2
3,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,Bymark sexson,"https://www.barcodable.com/upc/841710106442,ht...",2
4,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,Bylinda,"https://www.barcodable.com/upc/841710106442,ht...",2


In [13]:
balanced_amazon_2_df['label'].value_counts()

label
2    16165
1     3573
Name: count, dtype: int64

In [14]:
balanced_amazon_2 = []
for i, row in balanced_amazon_2_df.iterrows():
    balanced_amazon_2.append(row)
    if row['label'] == 1:
        res = paraphrase(str(row["reviews.text"]))
        for t in res:
            r = copy.deepcopy(row)
            r["reviews.text"] = t
            balanced_amazon_2.append(r)
for i, row in balanced_amazon_2_df.iterrows():
    balanced_amazon_2.append(row)
    if row['label'] == 1:
        res = summarize(str(row["reviews.text"]))
        for t in res:
            r = copy.deepcopy(row)
            r["reviews.text"] = t
            balanced_amazon_2.append(r)


In [15]:
balanced_amazon_2_df = pd.DataFrame(balanced_amazon_2)
balanced_amazon_2_df.drop_duplicates(subset=["reviews.text"], inplace=True)
balanced_amazon_2_df.head()

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs,label
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,3,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht...",1
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,4,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht...",2
2,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,BySharon Lambert,"https://www.barcodable.com/upc/841710106442,ht...",2
3,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,Bymark sexson,"https://www.barcodable.com/upc/841710106442,ht...",2
4,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,Bylinda,"https://www.barcodable.com/upc/841710106442,ht...",2


In [16]:
balanced_amazon_2_df['label'].value_counts()

label
2    16165
1     3982
Name: count, dtype: int64

In [22]:
majority_class = balanced_amazon_2_df[balanced_amazon_2_df['label'] == 2]
minority_class = balanced_amazon_2_df[balanced_amazon_2_df['label'] == 1]

In [24]:
majority_class_undersampled = majority_class.sample(len(minority_class))

In [26]:
df_undersampled = pd.concat([majority_class_undersampled, minority_class])
df_undersampled['label'].value_counts()

label
2    3982
1    3982
Name: count, dtype: int64

In [27]:
balanced_amazon_2_df.to_csv("/kaggle/working/amazon2_full.csv")

In [28]:
df_undersampled.to_csv("/kaggle/working/amazon2_undersampeled.csv")

# Loading downloaded files

In [25]:
amazonreviews1_train.head()

Unnamed: 0,title,description,label
0,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...,2
1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,2
2,Amazing!,This soundtrack is my favorite music of all ti...,2
3,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,2
4,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...",2


In [31]:
balanced_amazon_2_df = pd.read_csv('/kaggle/input/amazon-2-undersampled/amazon2_undersampeled.csv')
balanced_amazon_2_df.rename(columns={'reviews.title': 'title','reviews.text': 'description'}, inplace=True)
balanced_amazon_2_df.head()

Unnamed: 0.1,Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,...,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,description,title,reviews.username,sourceURLs,label
0,10761,AVpe7xlELJeJML43ypLz,2015-12-03T01:23:41Z,2019-04-24T02:17:42Z,AmazonBasics AA Performance Alkaline Batteries...,"B00QWO9P0O,B01IB83NZG,B00MNV8E0C",Amazonbasics,"AA,AAA,Electronics Features,Health,Electronics...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,We go through a lot of double a batteries. Aft...,Great deal in my book,ByBobby,"https://www.barcodable.com/upc/841710106411,ht...",2
1,14825,AVpfw2hvilAPnD_xh0rH,2017-01-11T06:58:33Z,2019-03-09T07:13:43Z,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",B018Y226XO,Amazon,"Fire Tablets,Learning Toys,Toys,Tablets,Amazon...","Toys & Games,Electronics",https://pisces.bbystatic.com/image2/BestBuy_US...,...,True,,0.0,5,http://reviews.bestbuy.com/3545/5026100/review...,I loved it. I purchased it for my daughter. We...,Great device for kids,Tanya,http://www.toysrus.com/product/index.jsp?produ...,2
2,16837,AVpfw2hvilAPnD_xh0rH,2017-01-11T06:58:33Z,2019-03-09T07:13:43Z,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",B018Y226XO,Amazon,"Fire Tablets,Learning Toys,Toys,Tablets,Amazon...","Toys & Games,Electronics",https://pisces.bbystatic.com/image2/BestBuy_US...,...,True,,0.0,5,http://reviews.bestbuy.com/3545/5026100/review...,My 4 year old loves it. The amazon free time i...,My son loves it!,Bruzn,http://www.toysrus.com/product/index.jsp?produ...,2
3,2874,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,last a long time,Five Stars,Bypling,"https://www.barcodable.com/upc/841710106442,ht...",2
4,9032,AVpe7xlELJeJML43ypLz,2015-12-03T01:23:41Z,2019-04-24T02:17:42Z,AmazonBasics AA Performance Alkaline Batteries...,"B00QWO9P0O,B01IB83NZG,B00MNV8E0C",Amazonbasics,"AA,AAA,Electronics Features,Health,Electronics...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,...,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Can't beat the price. I love this product. It ...,Five Stars,ByKas M,"https://www.barcodable.com/upc/841710106411,ht...",2


In [32]:
balanced_amazon_shoes_df = pd.read_csv('/kaggle/input/amazon-shoes/amazon-shoes-review.csv')
balanced_amazon_shoes_df.rename(columns={'review_title': 'title','review_text': 'description'}, inplace=True)
balanced_amazon_shoes_df.head()

Unnamed: 0.1,Unnamed: 0,url,product_name,reviewer_name,title,description,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,label
0,0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these. Was looking for converses and thes...,5.0,True,Reviewed in the United States on 2 June 2020,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/2021 02:26:25,2
1,1,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Kenia Rivera,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2.0,True,Reviewed in the United States on 28 October 2021,,f4778bb8-3070-5cb1-b5aa-ffce41a97b57,24/12/2021 02:26:25,1
2,1,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Kenia Rivera,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2.0,True,Reviewed in the United States on 28 October 2021,,f4778bb8-3070-5cb1-b5aa-ffce41a97b57,24/12/2021 02:26:25,1
3,2,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Chris Souza,Good quality,Good quality,5.0,True,Reviewed in the United States on 20 January 2021,,db5a7525-d40b-5265-84d8-df4f29837a3b,24/12/2021 02:26:25,2
4,3,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Amazon Customer,Good,Great,5.0,True,Reviewed in the United States on 22 April 2021,,75a42851-6462-54b5-988a-27d336221943,24/12/2021 02:26:25,2


In [59]:
common_columns = balanced_amazon_shoes_df.columns.intersection(amazonreviews1_train.columns)

train_df = pd.concat([amazonreviews1_train, balanced_amazon_shoes_df, balanced_amazon_2_df], axis=0, ignore_index=True)[common_columns]
train_df.head()

Unnamed: 0,title,description,label
0,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...,2
1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,2
2,Amazing!,This soundtrack is my favorite music of all ti...,2
3,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,2
4,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...",2


In [37]:
train_df['label'].value_counts()

label
2    1808931
1    1807279
Name: count, dtype: int64

In [39]:
train_df.drop_duplicates(subset=['description'], inplace=True)
train_df['label'].value_counts()

label
2    1807600
1    1803478
Name: count, dtype: int64

In [58]:
# process

def preprocess(sentence):
    ret = sentence.lower()
    ret = ret.translate(str.maketrans('', '', string.punctuation))
    NON_ASCII = re.compile(r'[^A-Za-z0-9\s]')
    ret = NON_ASCII.sub(r'', ret)
    return ret.strip()
preprocess("Hello, I'm testing this. لا كلام بالعربية في الناتج")

'hello im testing this'

In [60]:
train_df.isna().sum()
train_df = train_df.dropna()

In [61]:
train_df['title'] = train_df['title'].apply(preprocess)
train_df['description'] = train_df['description'].apply(preprocess)
train_df.head()

Unnamed: 0,title,description,label
0,stuning even for the nongamer,this sound track was beautiful it paints the s...,2
1,the best soundtrack ever to anything,im reading a lot of reviews saying that this i...,2
2,amazing,this soundtrack is my favorite music of all ti...,2
3,excellent soundtrack,i truly like this soundtrack and i enjoy video...,2
4,remember pull your jaw off the floor after hea...,if youve played the game you know how divine t...,2


In [62]:
test_df.isna().sum()

title          0
description    0
label          0
dtype: int64

In [63]:
test_df['title'] = test_df['title'].apply(preprocess)
test_df['description'] = test_df['description'].apply(preprocess)
test_df.head()

Unnamed: 0,title,description,label
0,great cd,my lovely pat has one of the great voices of h...,2
1,one of the best game music soundtracks for a ...,despite the fact that i have only played a sma...,2
2,batteries died within a year,i bought this charger in jul 2003 and it worke...,1
3,works fine but maha energy is better,check out maha energys website their powerex m...,2
4,great for the nonaudiophile,reviewed quite a bit of the combo players and ...,2


In [None]:
train_df.to_csv("/kaggle/working/amazon_reviews_train_cleaned.csv")
test_df.to_csv("/kaggle/working/amazon_reviews_test_cleaned.csv")