In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('review.csv')

In [3]:
# Keep only the review text and its rating. The explicit .copy() makes the
# result an independent frame, so adding new columns to it afterwards cannot
# trigger pandas' SettingWithCopyWarning.
df = df[['summary', 'overall']].copy()

Kita akan membuat sentiment analysis dari data summary. Dalam projek ini, akan dibuat model yang memprediksi apakah sebuah review itu memiliki sentimen yang baik (Great) atau sentimen yang buruk (Bad). Untuk mencapai tujuan ini, kita akan membagi dua kategori data yaitu Bad dan Great berdasarkan kolom rating. Apabila rating lebih kecil dari 3 maka masuk ke dalam kategori Bad. Sedangkan, apabila rating lebih besar dari 3 maka masuk ke dalam kategori Great. Kita tidak akan memakai rating 3. 

In [4]:
def change_overall(ser):
    """Translate one numeric rating into a sentiment label.

    Args:
        ser: a single rating value (despite the name, not a Series).

    Returns:
        'Neutral' when the rating equals 3, 'Bad' when it is below 3,
        and 'Great' for every other value.
    """
    if ser == 3:
        return 'Neutral'
    return 'Bad' if ser < 3 else 'Great'

In [5]:
# Add the 'overall2' column holding the label ('Bad', 'Neutral' or 'Great')
# that change_overall derives from each numeric 'overall' rating.
df['overall2'] = df['overall'].map(change_overall)

In [6]:
df['overall2'].value_counts()

Great      120044
Neutral     17514
Bad         13696
Name: overall2, dtype: int64

In [7]:
# Index labels of every row whose label is 'Neutral'.
is_neutral = df['overall2'] == 'Neutral'
idx_neutral = df.loc[is_neutral].index

In [8]:
# Remove the 'Neutral' reviews by filtering on the label itself instead of
# dropping previously captured index labels in place: those labels go stale as
# soon as the index is reset, so the in-place drop could not be re-run safely.
# .copy() detaches the result from the frame it was filtered from.
df = df[df['overall2'] != 'Neutral'].copy()

In [9]:
# Renumber the remaining rows from 0 and discard the old index labels.
df = df.reset_index(drop=True)

In [10]:
df['overall2'].value_counts()

Great    120044
Bad       13696
Name: overall2, dtype: int64

Jumlah data Great dan Bad tidak seimbang. Jumlah data Great (120.044) hampir 9 kali lipat lebih banyak dibandingkan dengan data Bad (13.696). Apabila kita membuat model dari data seperti ini, maka model tersebut akan lebih cenderung untuk memprediksi Great. Terdapat beberapa cara untuk mengatasi data yang tidak seimbang (imbalanced), salah satunya adalah teknik downsampling, yaitu sebagian data dari kelas yang lebih banyak dihapus sehingga jumlah data kedua kelas menjadi seimbang.

## Data Balancing

Dalam proses data balancing ini, kita akan menggunakan teknik downsample. Artinya, kita akan membuang data Great sehingga jumlahnya tidak jauh dengan data Bad. Teknik ini lebih mudah dilakukan dibandingkan teknik balancing yang lain, tetapi dengan dihapusnya data tentu saja akan ada informasi yang hilang. Teknik downsampling tidak cocok digunakan untuk data yang berjumlah sedikit.

Dalam projek ini, terdapat 150 ribu lebih data review. Apabila kita melakukan downsampling, data yang akan tersisa adalah sekitar 27 ribu (2 × 13.696 = 27.392). Data ini masih cukup banyak untuk digunakan dalam pembuatan model.

In [11]:
# Downsample with pandas' DataFrame.sample: draw exactly as many 'Great' rows
# as there are 'Bad' rows, with a fixed seed so the draw is repeatable.
n_bad = int((df['overall2'] == 'Bad').sum())
df_great = df.loc[df['overall2'] == 'Great'].sample(n=n_bad, random_state=101)

In [12]:
df_bad = df[df['overall2'] == 'Bad']

In [13]:
# Stack the downsampled 'Great' rows on top of all 'Bad' rows and renumber the
# combined frame from 0.
df_balance = pd.concat([df_great, df_bad], ignore_index=True)

In [14]:
df_balance.head()

Unnamed: 0,summary,overall,overall2
0,Pretty Good in The Morning,4.0,Great
1,All I Could Want in A Taco Shell...,5.0,Great
2,Delicious!,4.0,Great
3,Smooooth!!!!,5.0,Great
4,Deliciously Authentic Bold Salsa Flavor,5.0,Great


In [15]:
df_balance.tail()

Unnamed: 0,summary,overall,overall2
27387,Not to my taste at all.,2.0,Bad
27388,This was unnecessarily complicated to cook - y...,2.0,Bad
27389,"Easy, but way too sweet.",2.0,Bad
27390,Meh,2.0,Bad
27391,"Betty, This One's a Boo-Boo",2.0,Bad


In [16]:
df_balance.shape

(27392, 3)

## Clean Summary Text

Seperti yang kita lihat ketika membuat plot kata-kata yang sering muncul dari summary, kategori Bad masih memiliki kata-kata yang seharusnya tidak terdapat di dalam kategori tersebut. Beberapa kata yang secara logika seharusnya tidak terdapat di dalam kategori tersebut adalah:

1. Good
2. Great
3. Like
4. Taste
5. Flavor

Kita akan melihat lebih jauh kata-kata ini pada data dengan kategori Bad untuk mengetahui apa sebenarnya maksud dari kata-kata tersebut di dalam kategori itu.

In [17]:
# Show the 'Bad' reviews whose summary contains "good" (case-insensitive).
bad_reviews = df_balance[df_balance['overall2'] == 'Bad']
# One boolean per 'Bad' row, in row order, used below as a positional mask.
hasil_bad = ["good" in summary.lower() for summary in bad_reviews['summary']]
bad_reviews.loc[hasil_bad]

Unnamed: 0,summary,overall,overall2
13704,Not good,1.0,Bad
13750,Good,1.0,Bad
13754,Product Ok but overpriced and packaging not good.,1.0,Bad
13782,Not good,1.0,Bad
13816,"Not green, not good",1.0,Bad
13821,Product is good but Seller stinks,2.0,Bad
13824,Not Very Good,2.0,Bad
13829,Doesn't Taste Good,2.0,Bad
13845,Not so good....,1.0,Bad
13889,"After branching out and trying other brands, i...",1.0,Bad


Kata-kata good dalam kategori Bad kebanyakan adalah not good, not so good, not that good, dll. Kata-kata ini akan diubah menjadi bad, agar model yang dibuat akan menjadi lebih baik.

In [18]:
# Lower-case every summary so the phrase replacements below match regardless
# of the original capitalisation.
df_balance['summary_lower'] = df_balance['summary'].map(str.lower)

In [19]:
import re

In [20]:
# Negated "good"/"great" phrases. The same twelve templates are expanded first
# for "good" and then for "great", which yields the phrases in the order they
# are tried by change_word_good_great.
_NEGATION_TEMPLATES = (
    "not {}",
    "not so {}",
    "not a {}",
    "not that {}",
    "not as {} as",
    "no {}",
    "not very {}",
    "doesn't taste {}",
    "not taste {}",
    "not nearly as {} as",
    "isn't {}",
    "aren't {}",
)
words_not_good_great = [
    template.format(adjective)
    for adjective in ("good", "great")
    for template in _NEGATION_TEMPLATES
]


def change_word_good_great(ser):
    """Replace every negated "good"/"great" phrase in a string with 'bad'.

    Args:
        ser: a single (already lower-cased) summary string.

    Returns:
        The string after each phrase in ``words_not_good_great`` has been
        substituted, in list order, by 'bad'.

    Matching is plain substring matching via ``str.replace``, so a phrase is
    also replaced when it starts in the middle of a longer word
    (e.g. 'jalapeno good' becomes 'jalapebad').
    """
    rewritten = ser
    for phrase in words_not_good_great:
        if phrase in rewritten:
            rewritten = rewritten.replace(phrase, 'bad')
    return rewritten

In [21]:
'elga not good'.replace('not good', 'bad')

'elga bad'

In [22]:
df_balance['summary_lower_change'] = df_balance['summary_lower'].apply(change_word_good_great)

Dengan cara yang sama dengan apa yang dilakukan dengan kata good, kita akan melakukan hal tersebut terhadap kata flavor, taste, dan like. 

In [23]:
# Phrases that describe a problem with the flavor.
flavor_words = [
    'no flavor', 'weird flavor', 'flavorless',
    'strange flavor', 'lack flavor', 'lacking in flavor',
    'not much flavor', 'weak flavor', 'odd flavor',
]


def change_word_flavor(ser):
    """Replace every phrase from ``flavor_words`` with 'flavor_problem'.

    Args:
        ser: a single summary string.

    Returns:
        The string after all phrases were substituted in list order.
        Matching is by substring (``str.replace``), so 'jalapeno flavor'
        also matches 'no flavor'.
    """
    result = ser
    for phrase in flavor_words:
        result = result.replace(phrase, 'flavor_problem')
    return result

In [24]:
df_balance['summary_lower_change'] = df_balance['summary_lower_change'].apply(change_word_flavor)

In [25]:
# Phrases that describe a problem with the taste.
taste_words = ["doesn't taste", "does not taste", "no taste", 'tasteless']


def change_word_taste(ser):
    """Replace every phrase from ``taste_words`` with 'taste_problem'.

    Args:
        ser: a single summary string.

    Returns:
        The string after all phrases were substituted in list order
        (substring matching via ``str.replace``).
    """
    result = ser
    for phrase in taste_words:
        if phrase in result:
            result = result.replace(phrase, 'taste_problem')
    return result

def change_question_mark(ser):
    """Replace the first '?' in a string with the word 'questionmark'.

    Args:
        ser: a single summary string.

    Returns:
        The string with only its first '?' substituted; any further '?'
        characters are left as they are. Strings without '?' are returned
        unchanged.

    NOTE(review): the word is inserted without surrounding spaces, so
    'why?' becomes 'whyquestionmark' (one fused word) - confirm that this is
    the intended token.
    """
    before, mark, after = ser.partition('?')
    if not mark:
        return ser
    return before + 'questionmark' + after

In [26]:
df_balance['summary_lower_change'] = df_balance['summary_lower_change'].apply(change_word_taste)

In [27]:
df_balance['summary_lower_change'] = df_balance['summary_lower_change'].apply(change_question_mark)

In [28]:
# Negated forms of "like".
like_words = ["didn't like", 'not like', "don't like"]


def change_word_like(ser):
    """Replace every phrase from ``like_words`` with 'hate'.

    Args:
        ser: a single summary string.

    Returns:
        The string after all phrases were substituted in list order
        (substring matching via ``str.replace``).
    """
    result = ser
    for phrase in like_words:
        result = result.replace(phrase, 'hate')
    return result

In [29]:
df_balance['summary_lower_change'] = df_balance['summary_lower_change'].apply(change_word_like)

Kita akan mengubah tanda tanya pada summary menjadi questionmark. Hal ini dilakukan karena pada kategori Bad lebih banyak ditemukan summary dengan tanda tanya.

In [30]:
def change_question_mark(ser):
    """Replace the first '?' in a string with ' questionmark' (leading space).

    Args:
        ser: a single summary string.

    Returns:
        The string with only its first '?' substituted.

    NOTE(review): this redefines a function of the same name that was already
    applied to the data in an earlier cell, and it differs from it: the
    earlier definition (and change_clean_word) inserts 'questionmark' without
    the leading space. Confirm which variant is intended and keep only one.
    """
    return ser.replace('?', ' questionmark', 1)

## Clean Summary Part 2

Summary yang kata-kata nya telah diubah dalam part pertama kemudian akan dibersihkan lebih lanjut dengan beberapa proses. Proses tersebut diantaranya adalah menghilangkan karakter lain selain huruf alfabet, menghilangkan stopword, dan membuat seluruh huruf menjadi huruf kecil.

In [31]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [32]:
stopwords_normal = stopwords.words('english')

In [33]:
def clean_summary(text):
    """Reduce a summary to lower-case alphabetic words without stopwords.

    Steps: every character that is not an ASCII letter becomes a space, the
    text is lower-cased, tokenised with ``word_tokenize``, and tokens found in
    ``stopwords_normal`` are dropped.

    Args:
        text: a single summary string.

    Returns:
        The remaining tokens joined by single spaces.

    NOTE(review): '_' is not a letter, so placeholder words such as
    'flavor_problem' or 'taste_problem' come out as two words
    ('flavor problem') - confirm that this is intended.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    kept_tokens = []
    for token in word_tokenize(letters_only):
        if token not in stopwords_normal:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [34]:
df_balance['clean_summary'] = df_balance['summary_lower_change'].apply(clean_summary)

In [35]:
df_balance.head()

Unnamed: 0,summary,overall,overall2,summary_lower,summary_lower_change,clean_summary
0,Pretty Good in The Morning,4.0,Great,pretty good in the morning,pretty good in the morning,pretty good morning
1,All I Could Want in A Taco Shell...,5.0,Great,all i could want in a taco shell...,all i could want in a taco shell...,could want taco shell
2,Delicious!,4.0,Great,delicious!,delicious!,delicious
3,Smooooth!!!!,5.0,Great,smooooth!!!!,smooooth!!!!,smooooth
4,Deliciously Authentic Bold Salsa Flavor,5.0,Great,deliciously authentic bold salsa flavor,deliciously authentic bold salsa flavor,deliciously authentic bold salsa flavor


kolom clean_summary sudah berisi teks yang siap untuk dijadikan sebagai data untuk memprediksi apakah suatu teks merupakan sentimen yang baik (Great) atau sentimen yang buruk (Bad)

## Modelling

Model yang dibuat untuk sentiment analysis ini diantaranya adalah:
1. Random Forest
2. Logistic Regression
3. K Nearest Neighbors
4. Multinomial Naïve Bayes

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score, cross_val_predict

In [37]:
import warnings
warnings.filterwarnings("ignore")

In [38]:
# Candidate classifiers; each one is scored with 10-fold cross-validation on
# the same CountVectorizer -> TfidfTransformer -> classifier pipeline.
model_classes = [RandomForestClassifier, KNeighborsClassifier, LogisticRegression, MultinomialNB]
# The display names are derived from the classes themselves, so a label can
# never get out of step with the model it describes.
list_model = [model.__name__ for model in model_classes]
for name, model in zip(list_model, model_classes):
    # Seed the estimators that expose `random_state` (same seed as the sampling
    # and train/test split in this notebook) so the printed scores are
    # reproducible from run to run.
    seed_kwargs = {'random_state': 101} if 'random_state' in model().get_params() else {}
    pipe = Pipeline([
        ('count_vec', CountVectorizer()),
        ('tf_idf', TfidfTransformer()),
        ('model', model(**seed_kwargs))
    ])
    scores = cross_val_score(pipe, df_balance["clean_summary"], df_balance["overall2"], cv=10)
    print(f'Score {name} : {scores.mean()}')

Score RandomForestClassifier : 0.810491994262955
Score KNeighborsClassifier : 0.750510149131179
Score LogisticRegression : 0.8265909103026878
Score MultinomialNB : 0.8227933704072982


Skor accuracy tertinggi dari 10-fold cross validation diperoleh Logistic Regression dan Multinomial Naive Bayes, masing-masing dengan accuracy sekitar 82,7% dan 82,3%. Kita dapat menggunakan salah satu dari keduanya karena perbedaannya tidak terlalu besar. Kita akan menggunakan Logistic Regression untuk membuat model kita.

In [39]:
pipe = Pipeline([
        ('count_vec', CountVectorizer()),
        ('tf_idf', TfidfTransformer()),
        ('model', LogisticRegression())])

In [40]:
X = df_balance['clean_summary']
y = df_balance['overall2']

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=101)

In [43]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('count_vec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...)),
                ('tf_idf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [44]:
y_predict = pipe.predict(X_test)

In [45]:
from sklearn.metrics import classification_report, confusion_matrix

In [46]:
confusion_matrix(y_test, y_predict)

array([[3538,  571],
       [ 780, 3329]], dtype=int64)

In [47]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         Bad       0.82      0.86      0.84      4109
       Great       0.85      0.81      0.83      4109

    accuracy                           0.84      8218
   macro avg       0.84      0.84      0.84      8218
weighted avg       0.84      0.84      0.84      8218



**Hyperparameter Tuning**

Kita akan mencoba melakukan tuning hyperparameter terhadap model Logistic Regression untuk mengetahui apakah bisa mendapatkan model yang lebih baik dengan menggunakan Grid Search CV.

In [48]:
from sklearn.model_selection import GridSearchCV

In [49]:
param = {'penalty':['l1', 'l2'], 'C':np.logspace(-4, 4, 20)}

In [50]:
# Pin solver='liblinear': it supports both the 'l1' and the 'l2' penalty that
# are searched, whereas the 'lbfgs' default of newer scikit-learn releases
# rejects 'l1', which would make half of the grid fail to fit.
grid = GridSearchCV(LogisticRegression(solver='liblinear'), param, cv=3, n_jobs=-1)

In [51]:
# Build the TF-IDF matrix of the training summaries by hand (outside a
# Pipeline) so it can be passed straight to the grid search.
count_vec_transformer = CountVectorizer()
count_vec_result = count_vec_transformer.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
tfidf_result = tfidf_transformer.fit_transform(count_vec_result)

In [52]:
grid.fit(tfidf_result, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.00000000e...3,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
     

In [53]:
grid.best_params_

{'C': 4.281332398719396, 'penalty': 'l2'}

In [54]:
grid.best_score_

0.8222593094815897

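Tuning hyperparameter terhadap model Logistic Regression tidak menghasilkan score yang lebih baik: score terbaik dari grid search (3-fold cross validation pada data train) adalah sekitar 0,822, tidak lebih tinggi daripada score 10-fold cross validation model dengan parameter default (sekitar 0,827). Oleh karena itu, kita akan menggunakan model Logistic Regression dengan parameter default.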

In [55]:
# Phrase lists used by change_clean_word. They hold the same phrases, in the
# same order, as the lists defined in the earlier cleaning cells.
words_not_good_great = [
    "not good", "not so good", "not a good", "not that good",
    "not as good as", "no good", "not very good", "doesn't taste good",
    "not taste good", "not nearly as good as", "isn't good", "aren't good",
    "not great", "not so great", "not a great", "not that great",
    "not as great as", "no great", "not very great", "doesn't taste great",
    "not taste great", "not nearly as great as", "isn't great", "aren't great",
]

flavor_words = [
    'no flavor', 'weird flavor', 'flavorless', 'strange flavor', 'lack flavor',
    'lacking in flavor', 'not much flavor', 'weak flavor', 'odd flavor',
]

taste_words = ["doesn't taste", "does not taste", "no taste", 'tasteless']

like_words = ["didn't like", 'not like', "don't like"]


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, reading them only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        # Imported inside the function so that merely defining this cell does
        # not require NLTK to be importable.
        from nltk.corpus import stopwords
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def change_clean_word(ser):
    """Turn one raw review summary into the cleaned text passed to the model.

    The text is lower-cased, the phrase lists above are rewritten (negated
    good/great -> 'bad', flavor phrases -> 'flavor_problem', taste phrases ->
    'taste_problem', negated like -> 'hate'), the first '?' becomes
    'questionmark', every non-letter character becomes a space, and English
    stopwords are removed.

    Args:
        ser: a single summary string (despite the name, not a Series).

    Returns:
        The remaining tokens joined by single spaces.

    The stopword list is loaded on the first call and reused afterwards, so
    applying this function to many rows does not re-read the corpus per row.

    NOTE(review): 'questionmark' is inserted without surrounding spaces and
    '_' is stripped by the letter filter, so 'why?' yields 'whyquestionmark'
    and 'flavor_problem' yields 'flavor problem'. This is left as is so the
    text stays identical to what the model was trained on; confirm whether it
    is intended.
    """
    from nltk.tokenize import word_tokenize

    # change
    text = ser.lower()
    # Looked up at call time, in the order the replacements must be applied.
    rewrite_rules = (
        (words_not_good_great, 'bad'),
        (flavor_words, 'flavor_problem'),
        (taste_words, 'taste_problem'),
        (like_words, 'hate'),
    )
    for phrases, replacement in rewrite_rules:
        for phrase in phrases:
            text = text.replace(phrase, replacement)
    text = text.replace('?', 'questionmark', 1)

    # clean
    text = re.sub('[^a-zA-Z]', ' ', text)
    stop_words = _english_stopwords()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

In [56]:
ser_to_predict = pd.Series(["Not good"]).apply(change_clean_word)

In [57]:
pipe.predict_proba(ser_to_predict)

array([[0.96019307, 0.03980693]])

In [58]:
pipe.predict(ser_to_predict)

array(['Bad'], dtype=object)

In [59]:
import pickle

In [60]:
# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('pipe.sav', 'wb') as model_file:
    pickle.dump(pipe, model_file)