In [17]:
import spacy
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
from spacy.util import minibatch
from tqdm import tqdm # loading bar
from spacy.training.example import Example
import random

#spacy.prefer_gpu()

nlp = spacy.load("en_core_web_sm")

config = {
   "threshold": 0.5,
   "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
}


# Create Text categorizer instance
#textcat = nlp.add_pipe("textcat",config=config)
textcat = nlp.add_pipe("textcat")

#Disable all other pipes except Text Categorizer
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
nlp.disable_pipes(*other_pipes)

# Add our desired labels for the Text Categorizer 
textcat.add_label("1 Star")
textcat.add_label("2 Star")
textcat.add_label("3 Star")
textcat.add_label("4 Star")
textcat.add_label("5 Star")


exampleText = "i used to beats headphones but after their partnership with monster cables ended, the quality of their headphones went down hill. i was replacing my beats with new ones every 9-12 months since the headphones keep blowing out. these v-moda headphones are great, never had any issue on the construction and durability of these headphones. the sound quality is top notch and provides a deeper bass sound than beats and bose in-ear headphones. bit on the pricey side, but worth the purchase if you're an avid listener."

annot = {
    "cats":{
        "5 Star" : True,
        "4 Star" : False,
        "3 Star" : False,
        "2 Star" : False,
        "1 Star" : False,
        }
    }

TRAIN_DATA = list()

for i in range(1):
    nlp.make_doc(exampleText)
    exampleTuple = (exampleText,annot)
    TRAIN_DATA.append(exampleTuple)

exampleList = list()
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    exampleList.append(example)

textcat.initialize(lambda: exampleList, nlp=nlp)

#print(train_data)

#Training
optimizer = nlp.resume_training()
for itn in tqdm(range(5)):
    print("Starting iteration " + str(itn))
    random.shuffle(TRAIN_DATA)
    #create batches of training data
    batches = minibatch(TRAIN_DATA, size=50)
    losses = {}
    #Implement batching
    for batch in batches:
        exampleLst = []
        for text, annotations in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            exampleLst.append(example)
        losses = textcat.update(exampleLst, sgd=optimizer)
        print(losses)

doc2 = nlp("Great Energy Level VI tr 4343avel charger! Very compact and fast charging. It is the most convenient product I never seen before ! I like it. I am very happy with this seller . Order packed very well and ship fast .Great energy level VI slimmest travel charger . Courteous service !")
print(doc2.cats)


##print('Iterations',iterations,'ExecutionTime',time.time()-start)


def createExampleObject(Review: str, stars: int):
    print()


100%|██████████| 5/5 [00:00<00:00, 29.79it/s]

Starting iteration 0
{'textcat': 0.1600000113248825}
Starting iteration 1
{'textcat': 0.15161681175231934}
Starting iteration 2
{'textcat': 0.1377629041671753}
Starting iteration 3
{'textcat': 0.11752791702747345}
Starting iteration 4
{'textcat': 0.09192810952663422}
{'1 Star': 0.156964510679245, '2 Star': 0.156964510679245, '3 Star': 0.156964510679245, '4 Star': 0.156964510679245, '5 Star': 0.37214195728302}





In [4]:
import numpy as np
import pandas as pd
import warnings
import re
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import spacy


warnings.filterwarnings('ignore') # Hides warning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Data cleaning

missing_value = ["N/a", "na", np.nan, np.NAN, np.NaN, "null"]
df = pd.read_table('amazon_reviews_us_Electronics_v1_00.tsv', error_bad_lines=False, na_values=missing_value)

df = df.dropna()
data = df.copy()
review_id = len(data["review_id"].unique())
#print("review_id: " + str(review_id))


# Visualizing the distributions of numerical variables:

#data.hist(bins=50, figsize=(20,15))
#plt.show()



# Train/Test Split
training_data, testing_data = train_test_split(data, test_size=0.2, random_state=25)

#data.info()

# Normalization : 1- converting all the characters to lowercase

training_data['review_body'] = training_data['review_body'].str.lower()
training_data['review_headline'] = training_data['review_headline'].str.lower()



# Normalization : 2- converting all whitespace and punctuation into a single space to get rid of any inconsistencies.
review_body = re.sub(' +', ' ', str(training_data['review_body']))
review_headline = re.sub(' +', ' ', str(training_data['review_headline']))



review_body = re.sub(r"""
               [,.;@#?!&$]+  # Accept one or more copies of punctuation
               \ *           # plus zero or more copies of a space,
               """,
               " ",          # and replace it with a single space
               str(training_data['review_body']), flags=re.VERBOSE)
review_headline = re.sub(r"""
               [,.;@#?!&$]+  # Accept one or more copies of punctuation
               \ *           # plus zero or more copies of a space,
               """,
               " ",          # and replace it with a single space
               str(training_data['review_headline']), flags=re.VERBOSE)


# Noise Removal: Removing HTML Tags (using BeautifulSoup’s)

review_body = BeautifulSoup(review_body, "lxml").text



# Noise Removal: Expanding Contractions

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    return phrase

review_body = decontracted(review_body)


print(len(review_body))
print(review_body)


# TODO: Either save cleaned dataset as new file or warp in function and return to model.py

b'Skipping line 9076: expected 15 fields, saw 22\nSkipping line 19256: expected 15 fields, saw 22\nSkipping line 24313: expected 15 fields, saw 22\nSkipping line 47211: expected 15 fields, saw 22\nSkipping line 54295: expected 15 fields, saw 22\nSkipping line 56641: expected 15 fields, saw 22\nSkipping line 63067: expected 15 fields, saw 22\n'
b'Skipping line 93796: expected 15 fields, saw 22\n'
b'Skipping line 132806: expected 15 fields, saw 22\nSkipping line 164631: expected 15 fields, saw 22\nSkipping line 167019: expected 15 fields, saw 22\nSkipping line 167212: expected 15 fields, saw 22\n'
b'Skipping line 198103: expected 15 fields, saw 22\nSkipping line 199191: expected 15 fields, saw 22\nSkipping line 202841: expected 15 fields, saw 22\nSkipping line 218228: expected 15 fields, saw 22\nSkipping line 235900: expected 15 fields, saw 22\n'
b'Skipping line 277761: expected 15 fields, saw 22\nSkipping line 304582: expected 15 fields, saw 22\nSkipping line 312029: expected 15 fields,

672
1509930    i used to beats headphones but after their par 
1036164    it is not difficult to set up the alarm and eve 
240069                           they work great no issuess
2235881    easy mount perfect mount and easy to install  
1503924    worked great until it popped after 9 months p 
                                  
822215                        my grand daughter loves these 
2227643    bought them for a hard to buy older boy who ab 
2713881    these headphones are great but only for mp3 p 
1055292    the sound is great and they are super simple t 
29829      nice and compact comfortable to wear for long 
Name: review_body Length: 2472696 dtype: object
