In [1]:
import re # Regular expression
import time

import pandas as pd
import seaborn as sns


import nltk # Natural Language Tool Kit
from nltk.corpus import stopwords

## Load data

In [2]:
# Load the raw reviews from a pickle located relative to the current working directory.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle('./finefood_dataset.pkl')
df.head()  # preview the first rows

Unnamed: 0,helpfulness,productId,profileName,score,summary,text,time,userId
0,1/1,B001E4KFG0,delmartian,5.0,Good Quality Dog Food,I have bought several of the Vitality canned d...,1303862400,A3SGXH7AUHU8GW
1,0/0,B00813GRG4,dll pa,1.0,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,1346976000,A1D87F6ZCVE5NK
2,1/1,B000LQOCH0,"Natalia Corres ""Natalia Corres""",4.0,"""Delight"" says it all",This is a confection that has been around a fe...,1219017600,ABXLMWJIXXAIN
3,3/3,B000UA0QIQ,Karl,2.0,Cough Medicine,If you are looking for the secret ingredient i...,1307923200,A395BORC6FGVXV
4,0/0,B006K2ZZ7K,"Michael D. Bigham ""M. Wassir""",5.0,Great taffy,Great taffy at a great price. There was a wid...,1350777600,A1UQRSCLF8GW1T


In [3]:
# Drop the metadata columns that are not used further down; score, summary and text remain.
# Reassigning (instead of inplace=True) together with errors="ignore" makes this cell
# safe to re-run: the in-place version raised KeyError on a second execution because
# the columns had already been removed.
df = df.drop(columns=["helpfulness", "productId", "profileName", "time", "userId"], errors="ignore")
df.head()

Unnamed: 0,score,summary,text
0,5.0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1.0,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4.0,"""Delight"" says it all",This is a confection that has been around a fe...
3,2.0,Cough Medicine,If you are looking for the secret ingredient i...
4,5.0,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
df.shape  # (rows, columns) after dropping the unused columns

(568454, 3)

## Dataset Cleaning

credit to: https://github.com/hlamba28/Amazon-Food-Review/blob/master/1.%20Exploratory%20Analysis%20and%20Cleaning/EDA%20and%20Cleaning%20-%20Amazon%20Reviwes.ipynb

### Duplicates

In [5]:
# Remove rows that repeat the same (score, summary, text) triple; the default keep='first'
# retains the first occurrence of each.
# `subset` is documented as a column label or a sequence of labels, so pass a list
# rather than an (unordered, non-sequence) set.
# .copy() yields an independent frame, so the column assignments made later do not
# operate on a slice of `df`.
df_dedup = df.drop_duplicates(subset=['score', 'summary', 'text']).copy()
df_dedup.shape

(395006, 3)

### HTML tags 

In [6]:
# Print the first five reviews that contain something looking like an HTML tag,
# i.e. any text starting with '<' and ending with the nearest '>'.
html_tag = re.compile('<.*?>')
shown = 0
for review in df_dedup['text'].values:
    if html_tag.search(review):
        print(review, "\n\n")
        shown += 1
        if shown == 5:
            break

I don't know if it's the cactus or the tequila or just the unique combination of ingredients, but the flavour of this hot sauce makes it one of a kind!  We picked up a bottle once on a trip we were on and brought it back home with us and were totally blown away!  When we realized that we simply couldn't find it anywhere in our city we were bummed.<br /><br />Now, because of the magic of the internet, we have a case of the sauce and are ecstatic because of it.<br /><br />If you love hot sauce..I mean really love hot sauce, but don't want a sauce that tastelessly burns your throat, grab a bottle of Tequila Picante Gourmet de Inclan.  Just realize that once you taste it, you will never want to use any other sauce.<br /><br />Thank you for the personal, incredible service! 


Twizzlers, Strawberry my childhood favorite candy, made in Lancaster Pennsylvania by Y & S Candies, Inc. one of the oldest confectionery Firms in the United States, now a Subsidiary of the Hershey Company, the Company

### Stopwords

In [7]:
# English stopword list from NLTK, stored as a set; data_cleaning tests each word
# for membership in it.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
stop_words = set(stopwords.words('english')) # set of stopwords
print(stop_words)

{'down', 'needn', 'weren', 'of', 'i', 'myself', "wouldn't", 'will', 'yourself', 'against', 'being', "didn't", 'or', 'haven', 'up', "you'd", 'the', 'shan', 'who', 'whom', 'just', 'ain', 'now', "isn't", "you've", 'aren', 'which', 'when', 'had', "shan't", 'our', 'd', 'from', 'some', 'is', 's', 'doing', "shouldn't", 'if', 'off', 'after', 'all', 'yourselves', 'why', "doesn't", 'you', 'any', "aren't", 'shouldn', 'there', 'himself', 'don', 've', 'y', 'in', 'between', 'here', 'few', 'above', 'this', 'do', 'because', 'how', 'ourselves', "don't", 'didn', 'he', 'under', 'each', 'your', 'other', 'ours', 'her', "it's", 'most', 'out', 'be', "couldn't", 'yours', 'his', 'own', 'doesn', 'below', 'does', 'such', 'an', 'before', 'should', 'more', 're', 'hadn', 'we', "haven't", "wasn't", 'to', 'can', "weren't", 'herself', 'him', 'by', 'ma', "won't", 'until', 'has', 'about', 'll', 'as', 'again', 'but', 'themselves', 'they', "mightn't", 'hers', "mustn't", 'me', 'hasn', 'into', 'did', 'only', 'wasn', 'been',

### Stemming

In [8]:
# Snowball stemmer for English; data_cleaning calls sno.stem() on every word it keeps.
sno = nltk.stem.SnowballStemmer('english')

### Function that applies the cleaning steps above

In [9]:
def data_cleaning(series, stopword_set=None, stemmer=None):
    '''Clean every text cell of a Pandas Series and return the result in two shapes.

       Preprocessing steps applied to each cell, in this order:
       1. Replace html tags (anything matching <.*?>) with a space
       2. Replace every character that is not an ASCII letter, a digit or a
          newline with a space (punctuation and special characters are dropped,
          digits are kept)
       3. Collapse runs of whitespace into a single space
       4. Apply lower casing
       5. Remove stopwords and words with length <= 2
       6. Apply stemming to the remaining words

       Parameters:
       series       - Pandas Series whose cells are all strings
       stopword_set - optional container of lower-case words to remove;
                      defaults to the notebook-level `stop_words`
       stemmer      - optional object exposing stem(word);
                      defaults to the notebook-level `sno`

       Return values:
       1. final_string - List of cleaned sentences (stems joined by single spaces)
       2. list_of_sent - List of lists of stems (one inner list per cell) which
                         will be used as input to the W2V model'''
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if stopword_set is None:
        stopword_set = stop_words
    if stemmer is None:
        stemmer = sno

    # Compile once, outside the per-row loop. Raw strings avoid the invalid "\s"
    # escape warning that a plain '\s+' literal triggers.
    html_tag = re.compile(r'<.*?>')
    non_alnum = re.compile(r'[^a-zA-Z0-9\n]')
    whitespace = re.compile(r'\s+')

    final_string = []    ## This list will contain cleaned sentences
    list_of_sent = []    ## This is a list of lists used as input to the W2V model at a later stage

    for sent in series.values:
        sent = html_tag.sub(' ', sent)             # remove html tags
        sent = non_alnum.sub(' ', sent)            # remove special characters
        sent = whitespace.sub(' ', sent).lower()   # single spaces, lower case
        # The stopword test runs on the lower-cased, un-stemmed word; only
        # words that survive the filter are stemmed.
        filtered_sent = [
            stemmer.stem(word)
            for word in sent.split()
            if word not in stopword_set and len(word) > 2
        ]
        list_of_sent.append(filtered_sent)
        final_string.append(" ".join(filtered_sent))
    return final_string, list_of_sent

#### Cleaning Example

In [10]:
# Show the first five raw reviews, each followed by blank lines, for comparison
# with their cleaned counterparts.
print('Before Cleaning\n\n')
for raw_review in df_dedup['text'].head(5):
    print(raw_review, "\n\n")

Before Cleaning


I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. 


Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". 


This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch. 


If you a

In [11]:
# Run data_cleaning on the first five reviews only and print each cleaned sentence.
# Note: this binds the notebook-level names final_string and list_of_sent.
print("After cleaning \n\n")
final_string, list_of_sent = data_cleaning(df_dedup['text'].iloc[:5])
for x in final_string:
    print(x,"\n\n")

After cleaning 


bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better 


product arriv label jumbo salt peanut peanut actual small size unsalt sure error vendor intend repres product jumbo 


confect around centuri light pillowi citrus gelatin nut case filbert cut tini squar liber coat powder sugar tini mouth heaven chewi flavor high recommend yummi treat familiar stori lewi lion witch wardrob treat seduc edmund sell brother sister witch 


look secret ingredi robitussin believ found got addit root beer extract order good made cherri soda flavor medicin 


great taffi great price wide assort yummi taffi deliveri quick taffi lover deal 




### Cleaning the whole texts in dataset

In [12]:
# Clean both free-text columns of the whole dataset and report how long it took.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if the system time
# is adjusted while this long-running cell executes.
start = time.perf_counter()
df_dedup['text_cleaned'], list_of_sent = data_cleaning(df_dedup['text'])
# NOTE: list_of_sent is rebound by the next call, so after this cell it holds the
# token lists of 'summary', not those of 'text'.
df_dedup['summary_cleaned'], list_of_sent = data_cleaning(df_dedup['summary'])
end = time.perf_counter()
print("Time takes in seconds =", end - start)

Time takes in seconds = 208.69525051116943


In [13]:
df_dedup.head()  # check that text_cleaned and summary_cleaned were added

Unnamed: 0,score,summary,text,text_cleaned,summary_cleaned
0,5.0,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought sever vital can dog food product found ...,good qualiti dog food
1,1.0,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arriv label jumbo salt peanut peanut a...,advertis
2,4.0,"""Delight"" says it all",This is a confection that has been around a fe...,confect around centuri light pillowi citrus ge...,delight say
3,2.0,Cough Medicine,If you are looking for the secret ingredient i...,look secret ingredi robitussin believ found go...,cough medicin
4,5.0,Great taffy,Great taffy at a great price. There was a wid...,great taffi great price wide assort yummi taff...,great taffi


## Feature engineering

### Word counts

In [14]:
# Lower-case the raw summary and review text, overwriting the original columns
# (the *_cleaned columns were already lower-cased by data_cleaning).
for text_col in ('summary', 'text'):
    df_dedup[text_col] = df_dedup[text_col].str.lower()

In [15]:
# Add one "word_counts_<column>" feature per text column: the number of
# whitespace-separated tokens in each cell. The tuple order fixes the order in
# which the new columns are appended.
for source_col in ("summary", "text", "summary_cleaned", "text_cleaned"):
    df_dedup["word_counts_" + source_col] = df_dedup[source_col].map(
        lambda cell: len(cell.split())
    )

In [18]:
df_dedup.head()  # check the four word_counts_* columns

Unnamed: 0,score,summary,text,text_cleaned,summary_cleaned,word_counts_summary,word_counts_text,word_counts_summary_cleaned,word_counts_text_cleaned
0,5.0,good quality dog food,i have bought several of the vitality canned d...,bought sever vital can dog food product found ...,good qualiti dog food,4,48,4,23
1,1.0,not as advertised,product arrived labeled as jumbo salted peanut...,product arriv label jumbo salt peanut peanut a...,advertis,3,31,1,18
2,4.0,"""delight"" says it all",this is a confection that has been around a fe...,confect around centuri light pillowi citrus ge...,delight say,4,94,2,39
3,2.0,cough medicine,if you are looking for the secret ingredient i...,look secret ingredi robitussin believ found go...,cough medicin,2,41,2,18
4,5.0,great taffy,great taffy at a great price. there was a wid...,great taffi great price wide assort yummi taff...,great taffi,2,27,2,13


#### Exploratory analysis of the word counts

In [19]:
# Per-score summary statistics of the four word-count features.
# Select the count columns BEFORE aggregating: the previous form aggregated every
# column, including the free-text ones, for which 'mean'/'std' are undefined
# (pandas >= 2.0 raises TypeError instead of silently dropping such columns) and
# which only add wasted work. The resulting table has the same
# (column, statistic) layout as before.
word_count_cols = [
    'word_counts_summary',
    'word_counts_text',
    'word_counts_summary_cleaned',
    'word_counts_text_cleaned',
]
df_dedup.groupby('score')[word_count_cols].agg(['mean', 'sum', 'max', 'min', 'std'])


Unnamed: 0_level_0,word_counts_summary,word_counts_summary,word_counts_summary,word_counts_summary,word_counts_summary,word_counts_text,word_counts_text,word_counts_text,word_counts_text,word_counts_text,word_counts_summary_cleaned,word_counts_summary_cleaned,word_counts_summary_cleaned,word_counts_summary_cleaned,word_counts_summary_cleaned,word_counts_text_cleaned,word_counts_text_cleaned,word_counts_text_cleaned,word_counts_text_cleaned,word_counts_text_cleaned
Unnamed: 0_level_1,mean,sum,max,min,std,mean,sum,max,min,std,mean,sum,max,min,std,mean,sum,max,min,std
score,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1.0,4.213861,153827,29,1,2.834258,82.701466,3019017,2149,3,77.71793,2.676674,97712,16,0,1.665292,40.007396,1460470,1026,1,38.888307
2.0,4.452269,92812,27,1,2.731847,90.323371,1882881,1612,6,81.13666,2.671304,55686,16,0,1.624054,43.397055,904655,816,2,40.478574
3.0,4.72306,140955,26,1,2.816306,96.249497,2872470,3432,7,88.777909,2.843352,84857,14,0,1.676644,46.18141,1378238,1930,2,44.282302
4.0,4.390628,246784,31,1,2.690685,91.973544,5169557,2061,6,87.579048,3.015123,169471,16,0,1.587675,44.721743,2513675,1066,2,43.576232
5.0,3.89989,981228,42,1,2.457054,73.971948,18611638,2520,3,72.373733,2.808894,706729,17,0,1.522305,36.364656,9149493,1527,0,36.47966


## Store Dataset

In [20]:
# Write df_dedup, including the cleaned-text and word-count columns added above,
# to a pickle file relative to the current working directory.
df_dedup.to_pickle(path="./finefood_dataset_cleaned.pkl")