In [1]:
import nltk
import string
from textblob import TextBlob

# Fetch the NLTK stop-word corpus so it can be read below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words, extended with 'though'.
stoplist = [*stopwords.words('english'), 'though']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Importing Data Files
import pandas as pd
import numpy  as np
# Both inputs are tab-separated files whose first row holds the column names.
products_df = pd.read_csv(
    '/content/products.tsv',
    sep='\t',
    header=0,
)
reviews_df = pd.read_csv(
    '/content/reviews.tsv',
    sep='\t',
    header=0,
)

In [None]:
# Lift the row limit so displayed frames/series are never truncated.
pd.options.display.max_rows = None

In [3]:
## Data Cleaning
# Patch one specific record before anything is derived from it: its language
# codes are set to Russian and an English translation is supplied by hand.
patched_row = reviews_df['id']=='46952c82-d750-41c5-b8f2-92579bb8039c'
reviews_df.loc[patched_row,'languageCode'] = 'ru-RU'
reviews_df.loc[patched_row,'languageCode.1'] = 'ru-RU'
reviews_df.loc[patched_row,'translation.reviewText'] = "The mask, in principle, is cool. Made of pleasant natural fabric, thin. But!!! you need to know that after the first wash it does not significantly decrease in size. Suitable for a child or a girl with a small face. Therefore, for a small face - 5+++! For a standard adult - 4 not because of quality, but because of size"

# English text of every review: the original text when it is already en-US,
# otherwise the translation. This reads 'languageCode.1' and
# 'translation.reviewText', so it has to run AFTER the patch above; deriving it
# any earlier would bake the pre-patch values into the patched row.
reviews_df['EnglishReview'] = np.where(
    reviews_df['languageCode.1']=='en-US',
    reviews_df['reviewText'],
    reviews_df['translation.reviewText'],
)

reviews_df = reviews_df.drop_duplicates(inplace=False, ignore_index=False)
product_id = 101955
# NOTE: despite its name, `bad_reviews` holds the reviews rated 40 or 50 for
# this product. The name is kept because later cells refer to it.
bad_reviews = reviews_df.loc[
    (reviews_df['productId']==product_id) & reviews_df['ratingValue'].isin([40, 50]),
    'EnglishReview',
]
bad_reviews

2884    The mask is comfortable. Fits tightly to the f...
2885    I do not know what is there with the certifica...
2886    The masks are pretty good. I ordered after wea...
2887          It fits well to the face and does not slip.
2888    The masks are great. Sits comfortably on the f...
                              ...                        
3534                                    wonderful product
3535                                           Very good 
3536                                        I love I Herb
3538            They are fine, the same as any other KN95
3539                                           Just right
Name: EnglishReview, Length: 565, dtype: object

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count every bigram and trigram that remains after stop-word removal.
# NOTE(review): the n-gram "39 hard" in the saved output looks like a leftover
# of an undecoded "&#39;" HTML entity in the review text - TODO confirm.
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,3))
# Sparse document-term matrix: one row per review, one column per n-gram.
ngrams = c_vec.fit_transform(bad_reviews)
# Corpus-wide frequency of each n-gram. The sparse matrix is summed directly,
# so the full dense reviews x n-grams array is never materialised.
count_values = np.asarray(ngrams.sum(axis=0)).ravel()
# Mapping of n-gram text -> column index in `ngrams`.
vocab = c_vec.vocabulary_
# Most frequent first; equal counts fall back to descending n-gram text.
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram/trigram'})

In [None]:
# Score each n-gram once: TextBlob's `sentiment` carries both the polarity and
# the subjectivity, so there is no need to analyse every n-gram twice.
sentiments = [TextBlob(ngram).sentiment for ngram in df_ngram['bigram/trigram']]
df_ngram['polarity'] = [score.polarity for score in sentiments]
df_ngram['subjective'] = [score.subjectivity for score in sentiments]

In [None]:
# Keep only the n-grams whose polarity score is below -0.1.
df_ngram[df_ngram['polarity'].lt(-0.1)]

Unnamed: 0,frequency,bigram/trigram,polarity,subjective
11,13,tightly face,-0.178571,0.285714
38,6,small face,-0.25,0.4
55,5,hard breathe,-0.291667,0.541667
70,4,much expensive,-0.5,0.7
72,4,masks tight,-0.178571,0.285714
93,4,face small,-0.25,0.4
103,4,39 hard,-0.291667,0.541667
107,3,turned small,-0.25,0.4
109,3,times expensive,-0.5,0.7
110,3,tightly face slip,-0.178571,0.285714


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline
# TF-IDF over bigrams/trigrams, factorised into three topics with NMF.
tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, ngram_range=(2,3))
# Fixed seed so the factorisation (and the topics printed from it) is
# reproducible from one run to the next.
nmf = NMF(n_components=3, random_state=42)
pipe = make_pipeline(tfidf_vectorizer, nmf)
pipe.fit(bad_reviews)
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    Args:
        model: object exposing ``components_``, an iterable with one weight
            vector per topic (each vector must support ``argsort``).
        feature_names: sequence mapping a column index to its feature name.
        n_top_words: how many features to list per topic.

    One line ``Topic #<idx>: name, name, ...`` is printed per topic, followed
    by a single blank line after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Column indices ordered by descending weight, cut to the top entries.
        best_columns = weights.argsort()[::-1][:n_top_words]
        best_names = [feature_names[col] for col in best_columns]
        print("Topic #%d: " % topic_idx + ", ".join(best_names))
    print()
# Show the three strongest n-grams of every NMF topic.
print_top_words(
    nmf,
    tfidf_vectorizer.get_feature_names_out(),
    n_top_words=3,
)

Topic #0: good quality, good quality mask, quality mask
Topic #1: good product, good product like, product recommend
Topic #2: good masks, nice masks, reasonable price



**Review Tagging** 

Each review's matching tags are stored as a list in a new `Tags` column of the reviews dataframe.

In [18]:
# Tag name -> keywords/phrases that trigger the tag. Entries may be single
# words, hyphenated words or multi-word phrases; `check_in` normalises them the
# same way as the review text before matching.
chkdict = {
  'damage_words' : ['broke','tore','break','tear','torn','snapped','snap','fraying','fray'],
  'costly_words' : ['expensive','costly','pricy'],
  'economical_words': ['inexpensive','discount','bargain','cheap','affordable','economical','steal','offer'],
  'skin_words' : ['allergy','makeup','make-up','rashes','rash','swollen','cut','blood','sweat'],
  'fit_words' : ['small','fit','right','loose','big','large','little','smaller','larger'],
  'quality_words' : ['thick','thin','cotton','nylon','elastic','band'],
  'shipping_words' : ['shipped','late','arrived','early','packed','tax','import'],
  'fogging_words' : ['lenses','eyewear','glasses','fogging','mist'],
  'washing_words' : ['wash','washing','detergent','water','rinse','soap'],
  'water_test_words' : ['water test','water'],
  'injury_words' : ['injured','injure','blood','bruised','bruise'],
  'makeup_words' : ['makeup','make-up','make up','lipstick','mascara','eyeliner','smudge']
}

def _normalize_phrase(raw):
  """Strip ASCII punctuation, lower-case and collapse whitespace to single spaces."""
  stripped = raw.translate(str.maketrans('', '', string.punctuation))
  return ' '.join(stripped.lower().split())

def check_in(text):
  """Return the `chkdict` tags whose keywords occur in `text`.

  Both the text and every keyword are normalised with `_normalize_phrase`, and
  a keyword only matches on whole-word boundaries. As a result multi-word
  entries such as 'water test' or 'make up' match as phrases, and hyphenated
  entries such as 'make-up' match their punctuation-free form.

  Args:
    text: the review text. Anything that is not a string (e.g. a NaN for a
      missing review) yields no tags instead of raising.

  Returns:
    List of tag names, in `chkdict` order, each at most once.
  """
  if not isinstance(text, str):
    return []
  # Pad with spaces so that a substring test only succeeds on whole words.
  padded_text = ' ' + _normalize_phrase(text) + ' '
  rtags = []
  for key, keywords in chkdict.items():
    phrases = (_normalize_phrase(keyword) for keyword in keywords)
    # Skip keywords that normalise to nothing (e.g. pure punctuation).
    if any(phrase and ' ' + phrase + ' ' in padded_text for phrase in phrases):
      rtags.append(key)
  return rtags

# Tag every review with the keyword categories found in its English text.
reviews_df['Tags'] = reviews_df['EnglishReview'].apply(check_in)
pd.set_option('max_colwidth', None)

# Reviews that mention price in either direction (costly or economical).
price_tags = {'costly_words', 'economical_words'}
mentions_price = reviews_df['Tags'].map(lambda tags: not price_tags.isdisjoint(tags))

# Positive reviews are those rated 40 or 50.
# (For the negative slice, filter on ratings 10, 20 and 30 instead.)
is_positive = reviews_df['ratingValue'].isin([40, 50])
tag_df = reviews_df[mentions_price & is_positive]

# Reviews of mask 101955 that say positive things and mention price.
tagdf = tag_df.loc[tag_df['productId'] == 101955, ['productId', 'EnglishReview', 'Tags']]
tagdf

Unnamed: 0,productId,EnglishReview,Tags
2898,101955,"Affordable, good quality",[economical_words]
2915,101955,good product and very inexpensive,[economical_words]
2948,101955,"Mask with a high degree of protection KN95, I add this product to almost every parcel, the quality is excellent, the elastic bands are soft, I breathe easier in it than in a regular medical mask. Attractive price for iHerb, in Russia such masks are much more expensive.","[costly_words, quality_words]"
2955,101955,Good inexpensive masks,[economical_words]
2973,101955,"Great masks at a great price. In our city, such masks are 4 times more expensive. Convenient packing of 10 pieces. Excellent nose fixer and comfortable rubber bands.",[costly_words]
2997,101955,"Super masks, and for such a price, just a gift. Our similar is much more expensive.",[costly_words]
3022,101955,"My brother introduced me to this style, and now that I have my own supply, I've shared some with my elderly Mom. These masks fit us better and are more comfortable than the blue and white medical masks that were so prevalent at the beginning of the epidemic. The adjustable nose piece keeps the mask safely anchored over my nose, and the sides fit the contours of my face without gaps. Breathing is more comfortable than with other masks. The four-pack discount makes them very affordable.","[economical_words, fit_words]"
3026,101955,"Nice, well-made masks, excellent metal rivets! Considering that in Russia such masks are several times more expensive, I am very pleased with the purchase!",[costly_words]
3045,101955,"Masks of high quality and quite comfortable, at the time of purchase on the site, one such one could be bought in stores for about 200 rubles. And here are 10 pieces for 461 rubles without a discount. I will take more if necessary.",[economical_words]
3144,101955,Nice masks. The elastic bands will not come off. In Russia they sell more expensive,"[costly_words, quality_words]"
