In [13]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
!pip install lxml
from bs4 import BeautifulSoup 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/choijihyeok/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import urllib.request

In [15]:
# Save Reviews.csv to data
# 10k rows
data = pd.read_csv("./Reviews.csv", nrows = 100000)
print('Total amount of Reviews :', len(data))

Total amount of Reviews : 100000


In [16]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [17]:
# save text and summary column
data = data[['Text', 'Summary']]
data.head()

Unnamed: 0,Text,Summary
0,I have bought several of the Vitality canned d...,Good Quality Dog Food
1,Product arrived labeled as Jumbo Salted Peanut...,Not as Advertised
2,This is a confection that has been around a fe...,"""Delight"" says it all"
3,If you are looking for the secret ingredient i...,Cough Medicine
4,Great taffy at a great price. There was a wid...,Great taffy


In [18]:
# print random sample
# summary is short compared to Text
data.sample(10)

Unnamed: 0,Text,Summary
15291,"These weren't awful, they just weren't good. T...",Not so great
50249,Great texture and balance of sweet to butter i...,Very Good!
83031,When I first got these K-cups I looked at them...,ONE OF THE BEST
2115,You can purchase this for $35.00 including fre...,Much Cheaper through Penta's Website
72927,This is a great tea for autumn drinking. Real...,Wonderful autumn tea
97953,This has change and now contains about 25% or ...,Good Trailmix
44641,These almonds are one of my favorite snacks. I...,Yummilicious Chocolate-y Goodness
21437,"Between 11:30 and noon every day, my dog start...",Greenies...yum!!!
3798,I tried this coffee two years ago and found it...,My favorite cofee
51064,"Well, I was envious when I gave this my dog. ...",yum


In [19]:
# data preprocessing
print('the amount of samples except duplication in Text column:', data['Text'].nunique())
print('the amount of samples except duplication in Summary column:', data['Summary'].nunique())

the amount of samples except duplication in Text column: 88426
the amount of samples except duplication in Summary column: 72348


In [20]:
# remove duplication data in Text column
data.drop_duplicates(subset=['Text'], inplace=True)
print('total amount of sample:', len(data))

total amount of sample: 88426


In [21]:
# check null data
print(data.isnull().sum())

Text       0
Summary    1
dtype: int64


In [22]:
# remove null data
data.dropna(axis=0, inplace=True)
print('total amount of sample:', len(data))

total amount of sample: 88425


In [23]:
# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# word regularation dictionary
contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

In [24]:
# deused word
stop_words = set(stopwords.words('english'))
print('the amount of deused word:', len(stop_words))
print(stop_words)

the amount of deused word: 179
{'or', 'above', 'ain', 'yours', 'should', 'here', 'hadn', "hasn't", 'was', 'had', 'don', 'under', 'after', 'them', 'all', 'being', 'mightn', 'the', 'of', 'there', "doesn't", 'didn', 'have', 'needn', 't', 'an', 'o', 'my', "shan't", 'am', 'ma', 'weren', "couldn't", 'so', 'few', 'your', 'shan', 'ours', 'between', 'into', "weren't", 'these', 'she', "you'd", 'about', 'i', 'themselves', 'very', 'in', 'were', 'doing', "it's", 'than', 'other', 'too', 'wouldn', 'which', 'during', 'why', "haven't", 'and', 'from', 'him', 'won', 'its', 'because', 'y', 'as', 'are', 'hers', 'then', 'any', 'over', 've', 'his', 'hasn', 'myself', 'same', 'their', 'down', 'mustn', 'yourself', 'wasn', 'again', 'more', 'm', 'by', 'how', 'himself', 'once', 's', "don't", 'for', "aren't", 'those', "needn't", 'until', 'each', 'nor', 'aren', 'it', 're', "wasn't", 'if', 'both', 'will', 'haven', "you'll", 'does', 'when', 'only', 'just', 'below', 'but', 'll', 'ourselves', "should've", "she's", 'they

In [25]:
# design the preprocessing function
def preprocess_sentence(sentence, remove_stopwords = True):
    # lower case text
    sentence = sentence.lower()
    # remove html tag
    sentence = BeautifulSoup(sentence, "lxml").text
    # remove broket closed string
    sentence = re.sub(r'\([^)]*\)', '', sentence)
    # remove double quotation mark
    sentence = re.sub('"', '', sentence)
    # abbreviation normalization
    sentence = ' '.join([contractions[t] if t in contractions else t for t in sentence.split(" ")])
    # remove possessive class
    sentence = re.sub(r"'s\b", "", sentence)
    # change number and special characters to space
    sentence = re.sub("[^a-zA-Z]", " ", sentence)
    # change the amount of m if m is over 2
    sentence = re.sub('[m]{2,}', 'mm', sentence)
    
    # remove deused word in Text
    if remove_stopwords:
        tokens = ' '.join(word for word in sentence.split() if not word in stop_words if len(word) > 1)
    # not remove deused word in Sumamry
    else:
        tokens = ' '.join(word for word in sentence.split() if len(word) > 1)
    return tokens

In [27]:
# look at the result with sample
temp_text = 'Everything I bought was great, infact I ordered twice and the third ordered was<br />for my mother and father.'
temp_summary = 'Great way to start (or finish) the day!!!'
print(preprocess_sentence(temp_text))
print(preprocess_sentence(temp_summary, 0))

everything bought great infact ordered twice third ordered wasfor mother father
great way to start the day
