In [1]:
# Imports
import numpy as np
import pandas as pd

In [2]:
# Create a dataframe of all reviews (with comments and star ratings only)
github_url = 'https://raw.githubusercontent.com/csbanon/bert-product-rating-predictor/master/data/reviews_comments_stars.csv'
df = pd.read_csv(github_url)
df = df[['comment', 'stars']]
df.head()

Unnamed: 0,comment,stars
0,I could sit here and write all about the specs...,5
1,A very reasonably priced laptop for basic comp...,4
2,"This is the best laptop deal you can get, full...",5
3,A few months after the purchase....It is still...,5
4,BUYER BE AWARE: This computer has Microsoft 10...,1


In [3]:
# Create review dataframes for each category of star rating
df1 = df[df['stars']==1]
df2 = df[df['stars']==2]
df3 = df[df['stars']==3]
df4 = df[df['stars']==4]
df5 = df[df['stars']==5]

df1.head()

Unnamed: 0,comment,stars
4,BUYER BE AWARE: This computer has Microsoft 10...,1
14,Why Acer install this version of Windows 10 wi...,1
15,As I write this I am on the phone with Microso...,1
23,"Just abominable, at the absolute worst time in...",1
26,Beware of Windows 10s. I’ve been on live chat ...,1


In [4]:
# Required modules
!pip install Unidecode
!pip install word2number



In [5]:
# More imports
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 

import string
import unidecode
import re
import unidecode

# !python -m spacy download en_core_web_md
# import spacy
# from word2number import w2n

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Tokenizer instantiation
tokenizer = RegexpTokenizer(r"\w+[a-zA-Z]")

In [7]:
# TODO?: 
# Convert number words to numeric characters

# load spacy model
# nlp = spacy.load('en_core_web_md')

# def words_to_nums(text):
#   doc = nlp(text)
#   tokens = [w2n.word_to_num(token.text) if token.pos_ == 'NUM' else token for token in doc]
#   return tokens

In [8]:
# Remove numeric characters
def remove_nums(text):
  text = re.sub("\d+", " ", text)
  return text

In [9]:
# Remove stop words
def remove_stopwords(text):
  stoplist = stopwords.words('english')
  not_stopwords = {'no', 'not'} # should not remove these if doing sentiment analysis
  stoplist = set([word for word in stoplist if word not in not_stopwords])
  words = [w for w in text if w not in stoplist]
  return words

In [10]:
# Lemmatization
lemmatizer = WordNetLemmatizer() 
def lemmatize(text):
  lem_text = [lemmatizer.lemmatize(i) for i in text]
  return lem_text

In [11]:
# Convert accented characters to ascii
def remove_accents(text):
  text = unidecode.unidecode(text)
  return text

In [12]:
# Perform preprocessing for each category of star rating
df_list = [df1, df2, df3, df4, df5]

for frame in df_list:
  frame['comment'] = frame['comment'].apply(lambda x: remove_accents(x))
  frame['comment'] = frame['comment'].apply(lambda x: remove_nums(x))
  frame['comment'] = frame['comment'].apply(lambda x: tokenizer.tokenize(x.lower())) # tokenizes and convert to lowercase
  frame['comment'] = frame['comment'].apply(lambda x: remove_stopwords(x))
  frame['comment'] = frame['comment'].apply(lambda x: lemmatize(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

In [13]:
# Test
df1.head()

Unnamed: 0,comment,stars
4,"[buyer, aware, computer, microsoft, scam, soft...",1
14,"[acer, install, version, window, forced, mode,...",1
15,"[write, phone, microsoft, attempting, get, mul...",1
23,"[abominable, absolute, worst, time, life, poss...",1
26,"[beware, window, live, chat, microsoft, hour, ...",1


In [14]:
# Test
df2.head()

Unnamed: 0,comment,stars
18,"[update, computer, fried, simply, boot, anymor...",2
34,"[using, acer, laptop, decade, fact, still, usi...",2
44,"[result, shelter, place, initiative, bought, t...",2
52,"[initially, loved, laptop, would, give, star, ...",2
60,"[laptop, received, review, opened, prior, actu...",2


In [15]:
# Test: Inspect one set of tokens for one review comment
# stringg = df2.iloc[0:1,0:1]
# stringg.to_csv('sample.csv')

In [16]:
# Test
df5.head()

Unnamed: 0,comment,stars
0,"[could, sit, write, spec, computer, already, d...",5
2,"[best, laptop, deal, get, full, stop, touchesc...",5
3,"[month, purchase, still, running, good, bought...",5
5,"[update, labtop, completely, died, near, end, ...",5
7,"[run, amazing]",5


In [17]:
# Create composite dataframe of tokens for all reviews, for all star ratings
mega_df = pd.concat(objs=[df1,df2,df3,df4,df5])

In [18]:
# Save as csv files
mega_df.to_csv('all_tokens_stars.csv')

for idx, frame in enumerate(df_list):
  filename = 'tokens_rating_'+str(idx+1)+'.csv'
  frame.to_csv(filename)

#References:
1. https://towardsdatascience.com/nlp-text-preprocessing-a-practical-guide-and-template-d80874676e79
2. https://towardsdatascience.com/nlp-for-beginners-cleaning-preprocessing-text-data-ae8e306bef0f