IMPORT LIBRARIES & DATA FILE

In [1]:
# import libraries / install packages (if required)

import os
import pandas as pd
import numpy as np
import re

from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

import nltk
from nltk.corpus import stopwords


In [2]:
# mount drive, upload text_mining_reviews.csv (HBB reviews file)

from google.colab import drive
drive.mount('/content/drive')

from google.colab import files
uploaded = files.upload()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving text_mining_reviews.csv to text_mining_reviews (1).csv


In [280]:
# CSV to dataframe from os.path

path = '/content/text_mining_reviews.csv'
df = pd.read_csv(path, encoding = "ISO-8859-1") #couldn't read utf-8

In [281]:
# preliminary exploratoin

# print(df.columns)
# print(df.shape)
# print(df.isna().sum())

df = df[['rating', 'Review']]

print(df.rating.value_counts(normalize = True))
print(df.rating.value_counts())

5.0    0.609526
1.0    0.155143
4.0    0.107326
3.0    0.067938
2.0    0.060067
Name: rating, dtype: float64
5.0    65674
1.0    16716
4.0    11564
3.0     7320
2.0     6472
Name: rating, dtype: int64


In [282]:
# trim df / rename columns / drop dups

cols = {df.columns[1]:'review'}
        
df.rename(columns = cols, inplace=True)

df.drop_duplicates(inplace= True)

In [283]:
# remove null reviews / remove null star rating / remove 3-star ratings

df.dropna(subset = ['review', 'rating'], inplace= True)

df = df[df.rating != 3]

df.reset_index(drop = True, inplace = True)

df.shape

(47955, 2)

In [None]:
''' ENTITY RECOGNITION '''

In [285]:
# apply ER to text blob

import polyglot
from polyglot.text import Text

    # # test set to ensure working:
    # blob1 = """The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world"."""
    # text = Text(blob1)

# combining random reviews

import random

rand = [random.randint(1,1000) for i in range(0,10)]
b = [f'df.review[{r}]' for r in rand]
c = ', '.join(b)

# copy / paste c into .join
blob = '. '.join([df.review[304], df.review[114], df.review[85], 
           df.review[367], df.review[194], df.review[462], 
           df.review[492], df.review[749], df.review[827], df.review[668]])

text = Text(blob)

print(blob)
print(text.entities)

# ONE ENTITY FOUND; 'Hamilton' and while it is a location, it appears this classified it as a city name

Bought this last year and now it not longer works. I barely used it and it still spins but has not power to mix. Its basically a piece of junk now. I like everything about it I got it specially to grind my coffee And there?ÇÖs nothing I don?ÇÖt like about it I love this little grinder. I have not needed to try this out yet, but it does match all my other Hamilton Beach small appliances in red and the glass pitcher is quite substancial and looks so clear and clean.  The pour spout on the pitcher seem a like an innovative and welcome feature. The electric cord is just exactly long enough.  I trust H.B. appliances to give me long lasting dependability.. Easy to use. Not only was this a fabulous mixer but I was able to use my 20% coupon! The store held this mixer for me.. I used this in our RV. Very convenient!. This is my third brewmaster. The model changes every time and this one, after using only a couple of months, leaks!. I LOVE this opener. At 80 years of age, My hands don't have the

In [286]:
''' SENTIMENT ANALYSIS:

    round 1: remove special characters
    
    round 2: HTML tag removal, tokenization, lemitization, stop word removal
        
'''

' SENTIMENT ANALYSIS:\n\n    round 1: remove special characters\n    \n    round 2: HTML tag removal, tokenization, lemitization, stop word removal\n        \n'

In [287]:
# function to mark rating as positive (>3) = 1, or negative (<3) = 0

def pos_review(rating):
    try:
        if rating > 3:
            return 1 # pos
        else:
            return 0 # neg
    except:
        return None   

df['pos_rating'] = df['rating'].apply(pos_review)
df.head()

df[['rating' , 'pos_rating']]

print(df.rating.value_counts())
print(df.pos_rating.value_counts())

5.0    31547
1.0     7688
4.0     5671
2.0     3049
Name: rating, dtype: int64
1    37218
0    10737
Name: pos_rating, dtype: int64


In [288]:
# clean reviews of special characters

def clean_review_text(raw_val):
    try:
        re_pattern = "[^A-Za-z0-9\s&-/,!.\\\\]+"
        val = re.sub(re_pattern, '', raw_val)
        return val
    except:
        return None

df['reviews_clean'] = df['review'].apply(clean_review_text)

In [289]:
# 1: train-test-split / decision-tree pipeline 

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

# t-t-s

X = df['reviews_clean']
y = df['pos_rating']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 17325)

# pipe

clf = Pipeline(steps =[
          ('preprocessing', CountVectorizer()),
          ('classifier', DecisionTreeClassifier(class_weight='balanced'))
                      ])

clf.fit(x_train,y_train)

# score

clf.score(x_test,y_test)


0.8429832487662473

In [290]:
clf1 = clf.score(x_test,y_test)

print(f'Score with special characters removed: {100 * clf1:.1f}%')

Score with special characters removed: 84.3%


In [291]:
# copy data set / further processing

dfa = df

dfa.head()

Unnamed: 0,rating,review,pos_rating,reviews_clean
0,5.0,I bought this for my bedroom and has really he...,1,I bought this for my bedroom and has really he...
1,5.0,"It is in the corner of my small living room, i...",1,"It is in the corner of my small living room, i..."
2,5.0,I love this thing. Recently adopted two very y...,1,I love this thing. Recently adopted two very y...
3,4.0,The product works. However the amazon bag arri...,1,The product works. However the amazon bag arri...
4,5.0,I have an Aussie and he sleeps in my room and ...,1,I have an Aussie and he sleeps in my room and ...


In [292]:
dfb = dfa

In [293]:
dfb.head()

Unnamed: 0,rating,review,pos_rating,reviews_clean
0,5.0,I bought this for my bedroom and has really he...,1,I bought this for my bedroom and has really he...
1,5.0,"It is in the corner of my small living room, i...",1,"It is in the corner of my small living room, i..."
2,5.0,I love this thing. Recently adopted two very y...,1,I love this thing. Recently adopted two very y...
3,4.0,The product works. However the amazon bag arri...,1,The product works. However the amazon bag arri...
4,5.0,I have an Aussie and he sleeps in my room and ...,1,I have an Aussie and he sleeps in my room and ...


In [294]:
''' cleaning function for text processing;
    
    SOURCE: these cleaning functions were sourced from 
    https://www.kaggle.com/sid321axn/amazon-alexa-reviews
    and modified for use on my data set in this notebook;

'''

#HTML tag removal
dfb.reviews_clean = dfb.reviews_clean.apply(lambda words: re.sub('<.*?>','',words))

# # tokenize reviews - new column name
dfb['reviews_tok'] = dfb.reviews_clean.apply(nltk.word_tokenize)

#Upper to lower case
dfb.reviews_tok = dfb.reviews_tok.apply(lambda words: [x.lower() for x in words])

#Lemmatization
dfb.reviews_tok = dfb.reviews_tok.apply(lambda words: " ".join([Word(x).lemmatize() for x in words]))


In [None]:
dfa.reviews_tok.head()

In [296]:
# 2: train-test-split / decision-tree pipeline 

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

# t-t-s

X = dfb['reviews_tok']
y = dfb['pos_rating']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# pipe

clf = Pipeline(steps =[
          ('preprocessing', CountVectorizer()),
          ('classifier', DecisionTreeClassifier(class_weight='balanced'))
                      ])

clf.fit(x_train,y_train)

# score

clf2 = clf.score(x_test,y_test)

In [298]:
print(f'SCORE removing special characters: {100 * clf1:.1f}%; \n'
f'SCORE adding corrected spelling, lemitization: {100 * clf2:.1f}%;'
      )

print('''SUMMARY OF EXERCISE:It appears the changes made a small 
        difference in the accuracy of the review outcome. 
        It may be due to the nature of the product reviews having 
        "good" words preceived as "bad" words
        (e.g. "hot" or "I like my toast 'burnt' and this toaster 
        is powerful enough to do that quickly.")
        
        As I continue to explore this I would investigate misclassifications 
        more to look for additional words to add to a custom stopwords list.

      ''')

SCORE removing special characters: 84.3%; 
SCORE adding corrected spelling, lemitization: 85.1%;
SUMMARY OF EXERCISE:It appears the changes made a small 
        difference in the accuracy of the review outcome. 
        It may be due to the nature of the product reviews having 
        "good" words preceived as "bad" words
        (e.g. "hot" or "I like my toast 'burnt' and this toaster 
        is powerful enough to do that quickly.")
        
        As I continue to explore this I would investigate misclassifications 
        more to look for additional words to add to a custom stopwords list.

      


In [274]:
dfb.to_csv('text_mining_clean2.csv', index = False)