# Daria Morgan

# Supplementary jupyter notebook with the code for ScatterText

In [1]:
import pandas as pd
import numpy as np

In [77]:
from nltk.corpus import stopwords

In [2]:
# Load the review dataset from the pickle file 'counter_dataset' in the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
df = pd.read_pickle('counter_dataset')

In [5]:
# Show the distinct values of the 'category' column.
df['category'].unique()

array(["Women's Shoes", 'Hair Care', 'MakeUp', 'SkinCare',
       "Men's Clothing", 'Headphones', "Men's Watches", "Women's Watches",
       "Women's Handbags", "Women's Clothing", "Men's Shoes"],
      dtype=object)

In [7]:
# Show the distinct brand labels among rows whose category is "Women's Shoes".
df[df['category'] == "Women's Shoes"]['brand'].unique()

array(['Birkenstock', 'TOMS', 'Converse', 'JIMMY CHOO',
       'Polo Ralph Lauren', 'Diane von Furstenberg', 'New', 'ASICS',
       'Timberland', 'NIKE', 'adidas', 'Vans', 'Reebok', 'PUMA', 'Cole',
       'Pumas', 'Lauren by Ralph Lauren', 'Stuart', 'BALLY',
       'Ralph Lauren Polo'], dtype=object)

In [63]:
# Keep only the rows whose brand is TOMS or Birkenstock.

df_toms_brst = df.loc[df['brand'].isin(['TOMS', 'Birkenstock'])]

In [86]:
# Restrict to reviews rated 3 stars or lower (rating strictly below 4).

df_toms_brst = df_toms_brst.loc[df_toms_brst['rating'].lt(4)]

In [87]:
# Tag every row with its comparison-group label. `assign` returns a new DataFrame,
# so the column is not written into a boolean-filtered slice of `df` (plain item
# assignment on such a slice triggers pandas' SettingWithCopyWarning).
df_toms_brst = df_toms_brst.assign(brand_combination='TOMS and Birkenstock')

In [88]:
# Keep only the ASICS and New Balance rows; the brand column holds the label 'New'
# (see the unique brand values printed earlier), which the original comment
# identifies as New Balance.

df_ASICS_new = df.loc[df['brand'].isin(['ASICS', 'New'])]

In [89]:
# Restrict to reviews rated 3 stars or lower (rating strictly below 4).

df_ASICS_new = df_ASICS_new.loc[df_ASICS_new['rating'].lt(4)]

In [90]:
# Tag every row with its comparison-group label. `assign` returns a new DataFrame,
# so the column is not written into a boolean-filtered slice of `df` (plain item
# assignment on such a slice triggers pandas' SettingWithCopyWarning).
df_ASICS_new = df_ASICS_new.assign(brand_combination='ASICS and New Balance')

In [91]:
# Stack the two labelled brand groups into a single frame (original row labels kept).
df_all = pd.concat((df_toms_brst, df_ASICS_new))

In [92]:
# Print the column dtypes and non-null counts of the combined frame.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13293 entries, 5 to 359042
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   asin                        13293 non-null  object 
 1   review_post_id              13293 non-null  object 
 2   body                        13284 non-null  object 
 3   review_text                 13293 non-null  object 
 4   helpful_count               13293 non-null  int64  
 5   num_images                  13293 non-null  int64  
 6   rating                      13293 non-null  int64  
 7   date_posted                 13293 non-null  object 
 8   is_verified_purchase        13293 non-null  bool   
 9   is_vine_voice               13293 non-null  bool   
 10  num_comments                13293 non-null  int64  
 11  has_video                   13293 non-null  bool   
 12  category                    13293 non-null  object 
 13  list_price                  0 

In [94]:
# Discard reviews that have no 'body' text; they cannot contribute terms to the corpus.
df_all = df_all.dropna(subset=['body'])

### ScatterText

In [95]:
# Show the column names available in the combined frame.
df_all.columns

Index(['asin', 'review_post_id', 'body', 'review_text', 'helpful_count',
       'num_images', 'rating', 'date_posted', 'is_verified_purchase',
       'is_vine_voice', 'num_comments', 'has_video', 'category', 'list_price',
       'price_low', 'price_high', 'is_add_on', 'is_prime', 'is_prime_pantry',
       'is_prime_exclusive', 'is_fresh', 'has_sns', 'offer_fulfiller',
       'offer_merchant', 'lowest_price_new_condition', 'brand', 'num_reviews',
       'num_images_pdp', 'brand_combination'],
      dtype='object')

In [96]:
# Replace the inherited row labels with a fresh 0..n-1 index, discarding the old labels.
df_all = df_all.reset_index(drop=True)

In [97]:
import scattertext as st

In [98]:
# Count how many reviews fall into each comparison group.
df_all['brand_combination'].value_counts()

ASICS and New Balance    12148
TOMS and Birkenstock      1136
Name: brand_combination, dtype: int64

In [108]:
from nltk.corpus import stopwords

In [15]:
# Start from NLTK's English stop-word list; extra terms are appended to it below.
stop_words_ = stopwords.words('english')

In [117]:
# Additional terms to strip from the corpus on top of the NLTK list: brand names
# such as 'toms' / 'asics' plus other tokens picked by the author.
# NOTE(review): the int entries (42, 80, ...) presumably never match string terms - confirm.
new_stop_words = [
    'birkenstock', 'birkenstocks', 'toms', 'asics', 42, '42', '4e', '2e',
    '80', 80, '22s', 'gt', 'nimbus', '2000', 2000, 'e',
    'v1', 'v2', '18s', 'ee', 'shoe', 'balance', 'new', 'kayano',
    'shoes', 'sneakers', 'like', 'made', 'tom', '#', 'xl', 'small',
    39, '39', 45, '45', 0, '0', '23', 23,
    22, '22', 'kayanos', 'sandal', 'sandals', 'cumulus', 'velcro', 'asic',
    'nb', 'birks', 'cork', 'buckle', 'sneaker', '17', '16', '1st',
    'v3',
]

In [118]:
# Append the extra terms to the stop-word list in place.
stop_words_ += new_stop_words

In [119]:
# Build a Scattertext corpus from the review bodies, grouped by 'brand_combination',
# then remove the stop words (terms missing from the corpus are skipped silently).
corpus = (
    st.CorpusFromPandas(
        df_all,
        category_col='brand_combination',
        text_col='body',
        nlp=st.whitespace_nlp_with_sentences,
    )
    .build()
    .remove_terms(stop_words_, ignore_absences=True)
)

In [133]:
# Render the Scattertext explorer for 'TOMS and Birkenstock' against the other group.
html = st.produce_scattertext_explorer(
    corpus,
    category='TOMS and Birkenstock',
    category_name='TOMS and Birkenstock',  # label now spelled in full, matching `category`
    not_category_name='ASICS and New Balance',
    minimum_term_frequency=10,
    pmi_threshold_coefficient=5,
    width_in_pixels=1000,
    metadata=df_all['review_post_id'],
)

# Write the page through a context manager so the file handle is closed (and the
# bytes flushed) as soon as the write completes.
with open('ScatterText_Amazon_Ascics_Toms.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

# Word frequency within TOMS & New Balance vs. BURBERRY, Coach, & LV

Now I want to compare word frequencies between different product categories among 1-3 star reviews, to avoid category-specific terms, e.g. "buckle" (Birkenstock sandals).

In [134]:
# Build a separate frame for the additional brands to compare, keeping only
# reviews rated below 4 stars (4 itself is excluded).

df_other_brands = df.loc[
    df['brand'].isin(['BURBERRY', 'Louis Vuitton', 'Coach', 'Anastasia'])
    & df['rating'].lt(4)
]

In [137]:
# Tag every row with its comparison-group label. `assign` returns a new DataFrame,
# so the column is not written into a boolean-filtered slice of `df` (plain item
# assignment on such a slice triggers pandas' SettingWithCopyWarning).
df_other_brands = df_other_brands.assign(brand_combination='BURBERRY and others')

In [160]:
# Add the extra-brand reviews underneath the previously combined frame.
df_all_other = pd.concat((df_all, df_other_brands))

In [161]:
# Replace the mixed row labels left by the concat with a fresh 0..n-1 index.
df_all_other = df_all_other.reset_index(drop=True)

In [162]:
# Discard reviews without 'body' text, then renumber the rows. Dropping rows leaves
# gaps in the index, so the index is reset afterwards - the same drop-then-reset
# order used for `df_all`.
# NOTE(review): a contiguous index presumably keeps the metadata Series aligned
# with the corpus documents by position - confirm against scattertext.
df_all_other = df_all_other.dropna(subset=['body']).reset_index(drop=True)

In [163]:
# Relabel the 'BURBERRY and others' rows as 'Known grey market brands'.

is_burberry_group = df_all_other['brand_combination'].eq('BURBERRY and others')
df_all_other.loc[is_burberry_group, 'brand_combination'] = 'Known grey market brands'

In [164]:
# Relabel the 'TOMS and Birkenstock' rows as 'Known grey market brands' as well.
is_toms_group = df_all_other['brand_combination'].eq('TOMS and Birkenstock')
df_all_other.loc[is_toms_group, 'brand_combination'] = 'Known grey market brands'

In [165]:
# Check which group labels remain after the relabelling.
df_all_other['brand_combination'].unique()

array(['Known grey market brands', 'ASICS and New Balance'], dtype=object)

In [135]:
from nltk.corpus import stopwords

In [172]:
# Rebuild the stop-word list from NLTK's English stop words, replacing the earlier
# extended list bound to the same name.
stop_words_ = stopwords.words('english')

In [187]:
# Additional terms to strip from the second corpus on top of the NLTK list: brand
# and retailer names such as 'burberry' / 'coach' plus other tokens picked by the author.
# NOTE(review): the int entries (0, 22, ...) presumably never match string terms - confirm.
new_stop_words_ = [
    0, 'made', 'sephora', 'anastasia', '22s', 'xl', 'small', 'burberry',
    'kayanos', 'asic', 'beautiful', 22, 'shoe', 23, 'cork', '16',
    '42', 'coach', '17', 39, 'gt', 42, 'nimbus', 'tom',
    45, '22', 'v2', 'buckle', 'palette', 'like', '45', 'bag',
    '80', 'sneakers', 'velcro', 'sneaker', 'v3', 'louis vuitton', 'asics', 'e',
    '2000', 'balance', '4e', '39', 'brush', 80, 2000, 'kayano',
    'birkenstock', '23', 'cumulus', 'birkenstocks', '#', '18s', '0', 'sandals',
    'shoes', '1st', 'nb', '2e', 'new', 'nice', 'v1', 'toms',
    'feet', 'sandal', 'ee', 'birks', 'purse', 'flip', 'thong', 'canvas',
    'gold', 'pigment', '95', 95, 'shiny', 'ulta',
]

In [188]:
# Append the extra terms to the stop-word list in place.
stop_words_ += new_stop_words_

In [189]:
# Build a Scattertext corpus from the review bodies of the widened dataset, grouped by
# 'brand_combination', then remove the stop words (absent terms are skipped silently).
corpus = (
    st.CorpusFromPandas(
        df_all_other,
        category_col='brand_combination',
        text_col='body',
        nlp=st.whitespace_nlp_with_sentences,
    )
    .build()
    .remove_terms(stop_words_, ignore_absences=True)
)

In [190]:
# Render the Scattertext explorer for 'Known grey market brands' against the other group.
html = st.produce_scattertext_explorer(
    corpus,
    category='Known grey market brands',
    category_name='Known grey market brands',
    not_category_name='ASICS and New Balance',
    minimum_term_frequency=10,
    pmi_threshold_coefficient=5,
    width_in_pixels=1000,
    metadata=df_all_other['review_post_id'],
)

# Write the page through a context manager so the file handle is closed (and the
# bytes flushed) as soon as the write completes.
with open('ScatterText_Amazon_other_brands.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))