In [1]:
import numpy as np
import pandas as pd
import regex as re

import matplotlib.pyplot as plt
import seaborn as sns



%matplotlib inline

In [2]:
df = pd.read_csv('fake_reviews_dataset.csv')

In [3]:
df.shape

(40432, 4)

In [4]:
df.sample(10)

Unnamed: 0,category,rating,label,text_
22147,Pet_Supplies_5,5.0,CG,The go to meal for a puppy and she loves it to...
16508,Tools_and_Home_Improvement_5,5.0,CG,Nice and easy fix as usual. The only problem i...
37105,Clothing_Shoes_and_Jewelry_5,1.0,OR,Strange white streaking down both sides of the...
2409,Home_and_Kitchen_5,5.0,CG,Purchased this for my fiancee and she loves it...
34957,Toys_and_Games_5,3.0,CG,"The colors are very dull, and the suction cups..."
26513,Kindle_Store_5,4.0,OR,Once again I enjoyed another part of this stor...
1818,Home_and_Kitchen_5,5.0,CG,"I like the thickness of the containers, the lo..."
39154,Clothing_Shoes_and_Jewelry_5,5.0,OR,This is the second pair I have purchased & I l...
10587,Electronics_5,3.0,OR,"Fast delivered, easy to work on, easy to setup..."
13134,Movies_and_TV_5,5.0,CG,Great entertainment and neat special effects. ...


In [5]:
df.label.unique()

array(['CG', 'OR'], dtype=object)

In [6]:
df.label.value_counts()

CG    20216
OR    20216
Name: label, dtype: int64

In [7]:
df.category.unique()

array(['Home_and_Kitchen_5', 'Sports_and_Outdoors_5', 'Electronics_5',
       'Movies_and_TV_5', 'Tools_and_Home_Improvement_5',
       'Pet_Supplies_5', 'Kindle_Store_5', 'Books_5', 'Toys_and_Games_5',
       'Clothing_Shoes_and_Jewelry_5'], dtype=object)

In [8]:
import unicodedata
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [9]:
def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')
    unaccented_text = unicodedata.normalize('NFKD',text).encode('ascii', 'ignore').decode('utf-8', 'ignore').lower()
    words= re.sub(r'[^\w\s]', ' ', unaccented_text).split()
    clean_text = [lemmatizer.lemmatize(word) for word in words if word not in stopwords]
    return ' '.join(clean_text)  

In [10]:
df.rename(columns = {'text_':'text'},inplace=True)
df['is_fake'] = df.label.map({'CG': 1, 'OR':0})
df.drop('label', axis=1, inplace=True)
df['clean_text'] = df['text'].apply(clean_text)
df['word_count'] = df.clean_text.str.len()

In [11]:
df.sample(10)

Unnamed: 0,category,rating,text,is_fake,clean_text,word_count
27347,Kindle_Store_5,3.0,I enjoyed most of this book. I liked how the a...,0,enjoyed book liked author took time develop ch...,410
4788,Sports_and_Outdoors_5,5.0,"Great product! Multi pack, has the wide brim ...",1,great product multi pack wide brim two large p...,63
2340,Home_and_Kitchen_5,5.0,I absolutely love that the spout is wide and y...,1,absolutely love spout wide turn around throw t...,99
11298,Electronics_5,5.0,"Installation was a snap. However, I purposely...",0,installation snap however purposely purchased ...,445
30371,Books_5,3.0,"Lots of big ideas to chew on in this book, but...",0,lot big idea chew book wish multitude plot lin...,155
21927,Pet_Supplies_5,5.0,"love this shear, i bought this to clean the fa...",0,love shear bought clean face maltese shitzu pu...,143
26721,Kindle_Store_5,4.0,What a delightful find. Logan and Juliette are...,1,delightful find logan juliette alike story sta...,201
32310,Books_5,1.0,I hate to give one star to any book but this w...,0,hate give one star book really awful perhaps s...,696
38686,Clothing_Shoes_and_Jewelry_5,5.0,"The bottom of the earrings are a little wide, ...",1,bottom earring little wide nice bonus sure wel...,72
12701,Movies_and_TV_5,4.0,"Got it to view via this video, you will love i...",1,got view via video love dvd player,34


In [12]:
# nltk.download('vader_lexicon')

In [13]:
def sentiment(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_dict = sia.polarity_scores(text)
    if sentiment_dict['compound'] >= 0.05:
        return 'Positve'
    elif sentiment_dict['compound']<= -0.05:
        return 'Negative'
    else:
        return 'Neutral'   

In [14]:
df2 = df.sample(10).copy()

In [15]:
df2['sentiment'] = df2['clean_text'].apply(sentiment)

In [16]:
df2.sample(10)

Unnamed: 0,category,rating,text,is_fake,clean_text,word_count,sentiment
1392,Home_and_Kitchen_5,5.0,It's hard to describe the feeling I get every ...,0,hard describe feeling get every time use thing...,95,Positve
14507,Movies_and_TV_5,4.0,A very discreet movie. Unusual for an action m...,1,discreet movie unusual action movie good movie...,258,Positve
19921,Pet_Supplies_5,5.0,These are nice doggie cookies I am sure I will...,0,nice doggie cooky sure getting future,37,Positve
32970,Toys_and_Games_5,5.0,Works as is supposed to. Talking is clear. Chi...,0,work supposed talking clear child love,38,Positve
30533,Books_5,3.0,In the tradition of The Handmaids Tale this bo...,0,tradition handmaid tale book talk happen relig...,179,Positve
6469,Sports_and_Outdoors_5,5.0,"This is the old style, anodized aluminum. I wi...",1,old style anodized aluminum make new one great...,110,Positve
24352,Kindle_Store_5,5.0,"Decent story aside from the cliffhanger, the b...",1,decent story aside cliffhanger book nice read ...,60,Positve
21283,Pet_Supplies_5,3.0,"From reading previous reviews, I knew the size...",0,reading previous review knew size would bit sm...,129,Positve
33294,Toys_and_Games_5,5.0,Perfect Easter basket stuffer for my three-yea...,0,perfect easter basket stuffer three year old t...,91,Positve
10673,Electronics_5,5.0,"Easy to configure, and expandable too. Nice th...",0,easy configure expandable nice serf extender b...,277,Positve


In [17]:
df['sentiment'] = df['clean_text'].apply(sentiment)

In [18]:
df.sample(10)

Unnamed: 0,category,rating,text,is_fake,clean_text,word_count,sentiment
37025,Clothing_Shoes_and_Jewelry_5,5.0,"I love it. As a woman with short legs, these f...",1,love woman short leg fit perfectly wide foot,44,Positve
6695,Sports_and_Outdoors_5,5.0,"I always have a knife on me. However, working ...",0,always knife however working financial institu...,249,Positve
2921,Home_and_Kitchen_5,5.0,These are absolutely great to use for the summ...,1,absolutely great use summer winter also large ...,153,Positve
12436,Movies_and_TV_5,2.0,A 9 mile hole in the ground and an even bigger...,0,9 mile hole ground even bigger hole wednesday ...,51,Neutral
13992,Movies_and_TV_5,3.0,They have a lot to recover from with all the d...,0,lot recover death young people previous season...,183,Negative
15356,Movies_and_TV_5,5.0,...Famous WW2 saying/slogan for the time. The ...,1,famous ww2 saying slogan time movie good histo...,460,Positve
12823,Movies_and_TV_5,5.0,I saw this movie a long time ago. I loved it. ...,0,saw movie long time ago loved bought funny pri...,54,Positve
12061,Movies_and_TV_5,2.0,"Not a very good copy, not a very good movie",0,good copy good movie,20,Positve
1526,Home_and_Kitchen_5,5.0,I USED THESE TO WASH MY WINDOWS AND WHAT SUPER...,0,used wash window super nice cloth bit lint han...,74,Positve
26512,Kindle_Store_5,5.0,This set of books is an excellent read.... if ...,0,set book excellent read enjoy drama friendship...,285,Positve
