# Proyek PBA oleh Kelompok Impostor

Sentiment Analysis for Amazon Food Review Using SVM

In [None]:
import pandas as pd
import numpy as np
import string
import csv
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer, WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics
pd.set_option('display.max_colwidth', 200)

import matplotlib.pyplot as plt
import seaborn as sns


import warnings
warnings.filterwarnings("ignore")

# Load Data

In [None]:
# Load the reviews CSV, skipping malformed rows instead of raising.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
dataset = pd.read_csv("./Reviews.csv", on_bad_lines="skip")

In [None]:
# Preview the first rows of the loaded reviews.
dataset.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
dataset.shape

In [None]:
# Print the per-column summary of the frame.
# `info` is a method: without the call parentheses the cell only echoes the
# bound-method repr instead of producing the summary.
dataset.info()

In [None]:
# Preview the first few review summaries.
dataset.Summary.head()

In [None]:
# Preview the first few full review texts.
dataset.Text.head()

In [None]:
# Remove the identifier, profile-name, helpfulness and timestamp columns from
# the frame with one in-place call, then preview the result.
dataset.drop(
    columns=[
        "Id",
        "ProductId",
        "ProfileName",
        "HelpfulnessNumerator",
        "HelpfulnessDenominator",
        "Time",
    ],
    inplace=True,
)
dataset.head()

In [None]:
# Recode the star rating in place: scores below 3 become the negative class (0)
# and scores above 3 become the positive class (1). Rows with a score of
# exactly 3 are left untouched by this cell.
# Plain scalars are assigned because a scalar is broadcast to every selected
# row, whereas a one-element list may be rejected as a length mismatch by
# newer pandas versions.
dataset.loc[dataset['Score'] < 3, 'Score'] = 0
dataset.loc[dataset['Score'] > 3, 'Score'] = 1

In [None]:
# Preview the frame after the Score column has been recoded.
dataset.head()

In [None]:
# Sequential 70/30 split without shuffling: the leading 70% of the rows form
# the training set and all remaining rows form the test set.
total_size = dataset.shape[0]
train_size = int(total_size * 0.70)

# Training portion.
train = dataset.iloc[:train_size]
# Test portion.
test = dataset.iloc[train_size:]

In [None]:
# Count how many training rows fall under each Score value.
train.Score.value_counts()

In [None]:
# Count how many test rows fall under each Score value.
test.Score.value_counts()

In [None]:
# Discard every row whose Score equals 3 from both splits.
train = train.loc[train['Score'].ne(3)]
test = test.loc[test['Score'].ne(3)]

In [None]:
# Report the (rows, columns) shape of each split, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
# Per-value row counts of Score in the training split after the Score == 3 rows were removed.
train['Score'].value_counts()

In [None]:
# Per-value row counts of Score in the test split after the Score == 3 rows were removed.
test.Score.value_counts()

# Explore Data

Pada tahap eksplorasi data, review pelanggan divisualisasikan menggunakan word cloud. Word cloud (disebut juga tag cloud) adalah representasi visual dari data teks yang biasanya digunakan untuk menggambarkan metadata atau memvisualisasikan teks bebas. Word cloud menampilkan daftar kata dengan berbagai ukuran font atau warna sehingga istilah yang paling menonjol dapat dikenali dengan cepat.

In [None]:
# Install the wordcloud package via a notebook shell escape.
! pip install wordcloud

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS

In [None]:
# Build the "review" frame used for the exploratory plots below.
# The exploration relies on the original 1-5 star ratings, but `dataset` has
# already been recoded to 0/1 and `df` is not assigned until later in the
# notebook, so running the cells top to bottom raised a NameError here.
# Load a fresh copy of the raw CSV instead.
df = pd.read_csv("./Reviews.csv", on_bad_lines="skip")
review = df
# Remove rows that contain null values.
review.dropna(inplace=True)

In [None]:
# Build one sub-frame per star rating, 1 through 5.
score_1, score_2, score_3, score_4, score_5 = (
    review[review['Score'] == stars] for stars in range(1, 6)
)

In [None]:
# Stack the per-rating frames into a single frame and renumber its index.
sampel = pd.concat(
    [score_1, score_2, score_3, score_4, score_5],
    axis=0,
).reset_index(drop=True)

In [None]:
# WordCloud needs a single input string, so all review summaries are joined
# into one text (the same could be done with the Text column).
# A space separator keeps the last word of one summary from fusing with the
# first word of the next one.
review_str = sampel.Summary.str.cat(sep=' ')
# The original passed the undefined name `reviews_str`, which raised NameError.
wordcloud = WordCloud(background_color='white').generate(review_str)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Split the reviews into bad ones (score 1 and 2) and good ones (score 4 and 5).
bad_reviews = sampel[sampel['Score'].isin([1, 2])]
good_reviews = sampel[sampel['Score'].isin([4, 5])]
# Join each group's summaries into a single string. The space separator keeps
# words of adjacent summaries from being glued together.
bad_reviews_str = bad_reviews.Summary.str.cat(sep=' ')
good_reviews_str = good_reviews.Summary.str.cat(sep=' ')

In [None]:
# Generate one word cloud per sentiment group.
wordcloud_bad = WordCloud(background_color='white').generate(bad_reviews_str)
wordcloud_good = WordCloud(background_color='black').generate(good_reviews_str)
# Plot the bad-review cloud in the upper slot of a two-row layout.
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(211)
# The original referenced the undefined name `wordcloud_negative`; the cloud
# built above is `wordcloud_bad`.
ax1.imshow(wordcloud_bad, interpolation='bilinear')
ax1.axis("off")
ax1.set_title('Review dengan Score Buruk', fontsize=20)

In [None]:
# Plot the good-review cloud in the lower slot of a two-row layout.
fig = plt.figure(figsize=(10, 10))
ax2 = fig.add_subplot(212)
# The original referenced the undefined name `wordcloud_positive`; the cloud
# generated from the good reviews is `wordcloud_good`.
ax2.imshow(wordcloud_good, interpolation='bilinear')
ax2.axis("off")
ax2.set_title('Review dengan Score Baik', fontsize=20)
plt.show()

### Analysis on Score (Target Variable)

In [None]:
# Print the per-column summary of `dataset` before analysing the target.
dataset.info()

In [None]:
# Drop the rows whose Score equals 3 and report what share of the data remains.
# The row count is captured before filtering; the original divided the
# filtered row count by itself, so it always printed 100.
rows_before = dataset.shape[0]
dataset = dataset[dataset['Score'] != 3]
print(f'Remaining data is {(dataset.shape[0] / rows_before) * 100}')

In [None]:
# Encode the target as 1 (rating above 3) or 0 (otherwise).
# An earlier cell already recodes Score to 0/1; applying `x > 3` to those
# values would turn every label into 0. Convert only when values above 3 are
# still present, i.e. the column still holds raw star ratings.
score = dataset['Score']
if score.max() > 3:
    score = score.apply(lambda x: 1 if x > 3 else 0)
dataset['Score'] = score
dataset.head(3)

In [None]:
# Bar chart of how many rows fall under each target value.
plt.figure(figsize=(8, 6))
# Plot the encoded target from `dataset`: `df` is not assigned until a later
# cell. The data is passed by keyword because seaborn no longer accepts it
# positionally.
sns.countplot(x=dataset['Score'])
plt.title('Target')
plt.show()

### Analysis On ProductId and UserId

In [None]:
# Reload the raw CSV into `df`, skipping malformed rows instead of raising.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv("./Reviews.csv", on_bad_lines="skip")

In [None]:
# Count the ProductId entries recorded for each user (one output row per UserId).
purchases = (
    df.groupby('UserId')['ProductId']
    .count()
    .reset_index(name='No_of_products_purchased')
)
purchases.head(2)

In [None]:
# Histogram of the per-user product counts, followed by summary statistics.
fig, ax = plt.subplots(figsize=(5, 5))
purchases['No_of_products_purchased'].hist(ax=ax)
ax.set_xlabel('No of purchases')
ax.set_ylabel('No of users')
plt.show()
print(purchases['No_of_products_purchased'].describe())

### Analysis of Reviews over Time

In [None]:
# Monthly review counts per star rating, plotted as a trend over time.
df['date'] = pd.to_datetime(df['Time'], unit='s')
# Work on an explicit copy so that the month conversion below does not write
# into a slice of `df`.
dff = df[['date', 'Text', 'Score']].copy()
# Reduce every timestamp to its "YYYY-MM" month label.
dff['date'] = dff['date'].dt.strftime('%Y-%m')
dff = dff.sort_values(by=['date']).reset_index(drop=True)

# One sub-frame per star rating.
dff_1 = dff[dff['Score'] == 1]
dff_2 = dff[dff['Score'] == 2]
dff_3 = dff[dff['Score'] == 3]
dff_4 = dff[dff['Score'] == 4]
dff_5 = dff[dff['Score'] == 5]

# Number of reviews per month for each rating.
dff_1 = dff_1.groupby('date')['Score'].count().reset_index()
dff_2 = dff_2.groupby('date')['Score'].count().reset_index()
dff_3 = dff_3.groupby('date')['Score'].count().reset_index()
dff_4 = dff_4.groupby('date')['Score'].count().reset_index()
# The original built `dff_5` from `dff_4`, so the Score=5 series duplicated
# the Score=4 counts.
dff_5 = dff_5.groupby('date')['Score'].count().reset_index()

plt.figure(figsize=(20, 8))

plt.plot_date(x=dff_1['date'], y=dff_1['Score'], label='Score=1')
plt.plot_date(x=dff_2['date'], y=dff_2['Score'], label='Score=2')
plt.plot_date(x=dff_3['date'], y=dff_3['Score'], label='Score=3')
plt.plot_date(x=dff_4['date'], y=dff_4['Score'], label='Score=4')
plt.plot_date(x=dff_5['date'], y=dff_5['Score'], label='Score=5')
plt.grid(linewidth=0.5, alpha=0.75)
plt.xticks(rotation=90)
plt.xlim('2000-01', '2012-10')
plt.xlabel('Date', fontsize=22)
plt.ylabel('Number of review', fontsize=22)
plt.title('Review trend from 2000 to 2012', fontsize=24)
# Add the legend before saving so that the exported image includes it.
plt.legend()
plt.savefig('review_trend.png')
plt.show()

# Text Processing

Text preprocessing menggunakan library TextBlob. Tahapannya meliputi: mengubah teks menjadi huruf kecil, menghapus tanda baca, menghapus stop words, dan lemmatisasi.

In [None]:
# Install the textblob package via a notebook shell escape.
! pip install textblob

In [None]:
from nltk.corpus import stopwords 
from textblob import TextBlob
from textblob import Word

In [None]:
# Lower-case every review and rejoin its whitespace-separated tokens with
# single spaces (punctuation is not touched by this step).
df['Text'] = df['Text'].map(lambda text: " ".join(text.lower().split()))

In [None]:
# Remove every character that is neither a word character nor whitespace.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would leave the text unchanged.
# The pattern is a raw string so the backslashes reach the regex engine as-is.
df['Text'] = df['Text'].str.replace(r'[^\w\s]', "", regex=True)
df.Text.head(7)

In [None]:
# Remove English stop words from every review.
stop = stopwords.words('english')
# Membership is tested once per token across the whole corpus, so look tokens
# up in a set instead of scanning the list each time. `stop` itself is kept as
# the original list.
stop_set = set(stop)
df['Text'] = df['Text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_set)
)
df.Text.head()

In [None]:
# Lemmatize each whitespace-separated token with TextBlob's Word and rejoin
# the tokens with single spaces.
df['Text'] = df['Text'].map(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)
df.Text.head()