### 1 - Import packages

In [1]:
import pandas as pd
import html

import nltk
from nltk.stem import WordNetLemmatizer

import sys
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.parse.malt import MaltParser
from nltk.corpus import words
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 2 - Import data

In [2]:
reviews_df = pd.read_csv("../sa-data/shopee-reviews.csv")

In [3]:
reviews_df.head()

Unnamed: 0,username,rating,comment
0,s*****y,5,Gorgeous tanzanite pieces! Quality of the jewe...
1,s*****8,5,Well received via my doorstep. Have been buyin...
2,n*****9,5,Received the bracelet within in 6 days. Love i...
3,s*****y,5,Gorgeous tanzanite pieces! Quality of the jewe...
4,merielyng.salvador,5,"Super nice and elegant, worth it and very good..."


In [4]:
reviews_df.isnull().sum()

username     0
rating       0
comment     57
dtype: int64

### 3 - Filter out reviews without comments

In [5]:
real_reviews_df = reviews_df[~reviews_df.comment.isnull()]

In [6]:
real_reviews_df.isnull().sum()

username    0
rating      0
comment     0
dtype: int64

### 4 - Helper functions for data preprocessing

In [18]:
"""
Code references:
    https://pythonguides.com/remove-unicode-characters-in-python/
    https://www.kite.com/python/answers/how-to-decode-html-entities-in-python
"""
def decode_text(text):
    # remove non-ASCII characters in string
    decoded_text = text.encode('ascii', 'ignore').decode('utf-8')

    # decode HTML entities
    decoded_html = html.unescape(decoded_text)
    return ''.join([word for word in decoded_html if word.isprintable()])

"""
Code reference:
    https://catriscode.com/2021/03/02/extracting-or-removing-mentions-and-hashtags-in-tweets-using-python/
"""
# def remove_mentions(text):
#     return re.sub("@[A-Za-z0-9_]+","", text)

def remove_stopwords(words_list):
    stop_list = stopwords.words("english")
    return [word for word in words_list if word not in stop_list]

def pos_to_wordnet(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_words(word_list):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # POS (part-of-speech) tagging
    # nltk_tagged -> a list of tuples (word, pos tag)
    nltk_tagged = nltk.pos_tag(word_list)

    # returns a list of tuples of words and their wordnet_tag (after conversion from NLTK tag)
    wordnet_tagged = list(map(lambda x: (x[0], pos_to_wordnet(x[1])), nltk_tagged))

    # lemmatizing
    lemmatized_words = []
    for word, tag in wordnet_tagged:
        if tag is not None:
            # need POS tag as 2nd argument as it helps lemmatize the words more accurately
            lemmatized_words.append(lemmatizer.lemmatize(word, tag))
        elif tag in [wordnet.NOUN] or word == "lavval" or word == "newagefsg":
            lemmatized_words.append(lemmatizer.lemmatize(word))
    return lemmatized_words

def clean_original_text(text):
    text = text.lower()
    clean_list = []
    sentence_list = nltk.sent_tokenize(text)
    for sentence in sentence_list:
        decoded_sentence = decode_text(sentence)
        words_list = nltk.RegexpTokenizer(r'\w+').tokenize(decoded_sentence)
        lemmatized_words = lemmatize_words(words_list)
        useful_words = remove_stopwords(lemmatized_words)

        if len(useful_words) > 0:
            clean_list.extend(useful_words)
    clean_text = ' '.join(clean_list)
    
    return clean_text

### 5 - Preprocess the data to obtain clean reviews

In [19]:
real_reviews_df["clean_comment"] = real_reviews_df["comment"].apply(clean_original_text)
real_reviews_df["clean_tokens"] = real_reviews_df["clean_comment"].apply(nltk.word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_reviews_df["clean_comment"] = real_reviews_df["comment"].apply(clean_original_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_reviews_df["clean_tokens"] = real_reviews_df["clean_comment"].apply(nltk.word_tokenize)


In [20]:
real_reviews_df

Unnamed: 0,username,rating,comment,clean_comment,clean_tokens
0,s*****y,5,Gorgeous tanzanite pieces! Quality of the jewe...,gorgeous tanzanite piece quality jewelry box good,"[gorgeous, tanzanite, piece, quality, jewelry,..."
1,s*****8,5,Well received via my doorstep. Have been buyin...,well receive doorstep buy many bracelet jamsto...,"[well, receive, doorstep, buy, many, bracelet,..."
2,n*****9,5,Received the bracelet within in 6 days. Love i...,receive bracelet day love colour beautiful abl...,"[receive, bracelet, day, love, colour, beautif..."
3,s*****y,5,Gorgeous tanzanite pieces! Quality of the jewe...,gorgeous tanzanite piece quality jewelry box good,"[gorgeous, tanzanite, piece, quality, jewelry,..."
4,merielyng.salvador,5,"Super nice and elegant, worth it and very good...",super nice elegant worth good idea gift someone,"[super, nice, elegant, worth, good, idea, gift..."
...,...,...,...,...,...
562,mozz19,5,very fast delivery! super pretty bracelets w v...,fast delivery super pretty bracelet w nice she...,"[fast, delivery, super, pretty, bracelet, w, n..."
563,thepersiancat,5,Repeated purchases. Always buying from thi seller,repeat purchase always buy thi seller,"[repeat, purchase, always, buy, thi, seller]"
564,thepersiancat,5,Excellent workmanship!,excellent workmanship,"[excellent, workmanship]"
565,n*****l,5,Ordered on 23 Jan & received on 29 Jan.This is...,order jan receive jan order jamstones love item,"[order, jan, receive, jan, order, jamstones, l..."


### 6 - Save the cleaned data to a new csv file in a separate directory

In [25]:
real_reviews_df.to_csv("../sa-data/clean-shopee-reviews.csv", encoding="UTF-8")