# Analysis of Category Variable

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [3]:
amazon_data = pd.read_csv('../Datasets/clean_amazon_data.csv').drop(columns = ['Unnamed: 0'])

In [4]:
amazon_data.columns

Index(['category', 'title', 'price', 'rating', 'No of reviews'], dtype='object')

In [5]:
amazon_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9972 entries, 0 to 9971
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   category       9972 non-null   object 
 1   title          9972 non-null   object 
 2   price          9972 non-null   float64
 3   rating         9972 non-null   float64
 4   No of reviews  9972 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 389.7+ KB


In [6]:
amazon_data.head()

Unnamed: 0,category,title,price,rating,No of reviews
0,"['Kitchen & Dining', 'Kitchen Utensils & Gadge...",leifheit comfortline gourmet garlic slicer | f...,15.99,4.4,403
1,"['Health & Household', 'House Supplies', 'Hous...",food grade mineral oil cutting boards countert...,19.8,4.9,321
2,"['Automotive', 'RV Parts & Accessories', ' Lig...",srrb direct 1139 1383 led replacement r12 ba15...,15.99,4.4,321
3,"['Toys & Games', 'Hobbies', 'Trains & Accessor...",bachmann industries 50 sliding door box santa ...,53.99,4.5,9
4,"['Sports & Outdoors', 'Outdoor Recreation', 'C...",unisex cycling cap breathable anti-sweat helme...,29.95,4.0,11


In [7]:
amazon_data.columns = ['category', 'title', 'price', 'rating', 'num_reviews']
amazon_data

Unnamed: 0,category,title,price,rating,num_reviews
0,"['Kitchen & Dining', 'Kitchen Utensils & Gadge...",leifheit comfortline gourmet garlic slicer | f...,15.99,4.4,403
1,"['Health & Household', 'House Supplies', 'Hous...",food grade mineral oil cutting boards countert...,19.80,4.9,321
2,"['Automotive', 'RV Parts & Accessories', ' Lig...",srrb direct 1139 1383 led replacement r12 ba15...,15.99,4.4,321
3,"['Toys & Games', 'Hobbies', 'Trains & Accessor...",bachmann industries 50 sliding door box santa ...,53.99,4.5,9
4,"['Sports & Outdoors', 'Outdoor Recreation', 'C...",unisex cycling cap breathable anti-sweat helme...,29.95,4.0,11
...,...,...,...,...,...
9967,"['Sports & Outdoors', 'Fan Shop', 'Home & Kitc...",gtei dallas cowboys desk lamp,39.99,4.1,6
9968,"['Beauty & Personal Care', 'Skin Care', ' Suns...",jergens 20938 natural glow instant sun body mo...,10.68,4.3,8185
9969,"['Automotive', 'Performance Parts & Accessorie...",arp 201-6303 connecting rod bolt kit,8.68,5.0,2
9970,"['Automotive', 'Performance Parts & Accessorie...",walker 31533 exhaust gasket,10.20,4.2,55


In [8]:
amazon_data.iloc[0].title

'leifheit comfortline gourmet garlic slicer | finely cut slice vegetableswhite red'

## Formatting Category Variable For Easier Analysis

In [9]:
amazon_data['category'].dtype

dtype('O')

In [10]:
se = amazon_data['category'].iloc[0]
se

"['Kitchen & Dining', 'Kitchen Utensils & Gadgets', 'Graters, Peelers & Slicers', ' Mandolines & Slicers']"

In [11]:
def format_category(s):
    res = s[2:-2].replace("'", "").replace('"',"").split(", ")  # convert string to list 
    result = [s.strip() for s in res]           # noticed some leading and trailing whitespace in entries (see above cell)
    return set(result)                          # convert to set() for O(1) search 

In [12]:
format_category(amazon_data['category'][1])

{'Health & Household',
 'House Supplies',
 'Household Cleaning',
 'Waxes & Oils',
 'Wood Conditioners'}

In [13]:
type(format_category(amazon_data['category'][1]))

set

In [14]:
amazon_data['clean_category'] = amazon_data['category'].apply(format_category)
amazon_data['clean_category']

0       {Mandolines & Slicers, Kitchen & Dining, Kitch...
1       {Waxes & Oils, Wood Conditioners, Health & Hou...
2          {RV Parts & Accessories, Lighting, Automotive}
3       {Boxcars, Trains & Accessories, Train Cars, To...
4       {Outdoor Recreation, Clothing, Cycling, Sports...
                              ...                        
9967    {Lamps, Sports & Outdoors, Home & Kitchen, Fan...
9968    {Beauty & Personal Care, Sunscreens & Tanning ...
9969    {Bolts & Nuts, Engines & Engine Parts, Perform...
9970    {Performance Parts & Accessories, Automotive, ...
9971    {Accessories, Outdoor Recreation, Skateboards ...
Name: clean_category, Length: 9972, dtype: object

In [15]:
categories = amazon_data['clean_category']

In [16]:
categories.describe()

count                                                  9972
unique                                                 7482
top       {Teachers Calendars & Planners, Office Product...
freq                                                      6
Name: clean_category, dtype: object

## Extracting and Examining Unique Category Values

In [17]:
category_list = set()
for cats in categories.values:
    category_list = category_list.union(cats)
category_list = sorted(list(category_list))

In [18]:
category_list = {}
for cats in categories.values:
    for c in cats:
        category_list[c] = category_list.setdefault(c, 0) + 1

In [19]:
category_list.items()

sories', 1), ('Noise Filters', 1), ('L-Lysine', 1), ('Parking Brake Shoes', 1), ('Pick-Up Tubes & Screens', 1), ('Temperature Probes & Sensors', 1), ('Raised Bowls', 2), ('Iron-on Transfers', 1), ('Students Scissors', 1), ('Marching Snare Drums', 1), ('Coffee Cups & Mugs', 1), ('Taco Sauce', 2), ('Bases & Melts', 2), ('Video Glasses', 1), ('Floss', 1), ('Fluorescent Lamps', 1), ('Commercial Food Preparation Equipment', 2), ('Bag Sealers', 1), ('Computer Paper', 1), ('Single Ear Bluetooth Headsets', 2), ('Chefs Knives', 1), ('Airlocks', 1), ('Lantern Accessories', 1), ('Dried Figs', 1), ('Grooving Inserts', 1), ('Incense Holders', 1), ('Multipurpose Bathroom Cleaners', 2), ('Holders', 1), ('Faucet Valves', 1), ('Home Digital Pianos', 1), ('Electronic Keyboards', 1), ('Picks', 1), ('Solar Battery Chargers & Charging Kits', 1), ('Spare & Replacement Parts', 1), ('Shampoos Plus Conditioners', 1), ('Feed Necks', 1), ('Lab Chemicals', 1), ('Backpacking Stoves', 1), ('Optical Drives', 1), ('P

In [20]:
category_df = pd.DataFrame.from_dict(category_list.items())
category_df.columns = ['category', 'frequency']
category_df.head()

Unnamed: 0,category,frequency
0,Mandolines & Slicers,1
1,Kitchen & Dining,447
2,Kitchen Utensils & Gadgets,83
3,Peelers & Slicers,2
4,Graters,2


In [21]:
category_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6656 entries, 0 to 6655
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   category   6656 non-null   object
 1   frequency  6656 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 104.1+ KB


In [35]:
category_df.sort_values('frequency', ascending = False, inplace = True, ignore_index = True)
category_df

Unnamed: 0,category,frequency,avg_rating
0,Sports & Outdoors,1390,
1,Clothing,1358,
2,Shoes & Jewelry,1111,
3,Automotive,1109,
4,Industrial & Scientific,886,
...,...,...,...
6651,Ginger Snaps,1,
6652,Motion Controllers,1,
6653,Umbrella Lights,1,
6654,Fruit Leather,1,


## Top Categories

In [36]:
print(category_df['category'].values[0:5])

['Sports & Outdoors' 'Clothing' 'Shoes & Jewelry' 'Automotive'
 'Industrial & Scientific']


## Average Review by Category

In [37]:
'Graters' in amazon_data['clean_category'] 

False

In [38]:
amazon_data['clean_category'].apply(lambda x: 'Graters' in x)

0        True
1       False
2       False
3       False
4       False
        ...  
9967    False
9968    False
9969    False
9970    False
9971    False
Name: clean_category, Length: 9972, dtype: bool

In [39]:
amazon_data[amazon_data['clean_category'].apply(lambda x: 'Graters' in x)]['rating'].mean()

3.6

In [42]:
category_df['category'].values

array(['Sports & Outdoors', 'Clothing', 'Shoes & Jewelry', ...,
       'Umbrella Lights', 'Fruit Leather', 'Bolts & Nuts'], dtype=object)

In [43]:
category_df.index

RangeIndex(start=0, stop=6656, step=1)

In [44]:
category_df.iloc[0]

category      Sports & Outdoors
frequency                  1390
avg_rating                  NaN
Name: 0, dtype: object

In [54]:
for i in category_df.index:
    row = category_df.loc[i]
    category = row['category']
    contains_category = amazon_data['clean_category'].apply(lambda x: category in x)
   
    category_df.at[i, 'avg_rating'] =  amazon_data[contains_category]['rating'].mean()

In [55]:
category_df

Unnamed: 0,category,frequency,avg_rating
0,Sports & Outdoors,1390,4.376691
1,Clothing,1358,4.347054
2,Shoes & Jewelry,1111,4.341404
3,Automotive,1109,4.411903
4,Industrial & Scientific,886,4.429684
...,...,...,...
6651,Ginger Snaps,1,4.700000
6652,Motion Controllers,1,4.500000
6653,Umbrella Lights,1,4.400000
6654,Fruit Leather,1,4.700000


## Average value for any variable by category

In [60]:
def store_average_value(variable):

    for i in category_df.index:
        row = category_df.loc[i]        # row in df corresponding to category
        category = row['category']      # category name 
        contains_category = amazon_data['clean_category'].apply(lambda x: category in x)    # does the product fall in the category?
    
        category_df.at[i, "avg_"+variable] =  amazon_data[contains_category][variable].mean()  # if so, average the variable we desire

In [61]:
store_average_value("price")

In [62]:
category_df

Unnamed: 0,category,frequency,avg_rating,avg_price
0,Sports & Outdoors,1390,4.376691,49.290036
1,Clothing,1358,4.347054,36.191775
2,Shoes & Jewelry,1111,4.341404,36.406724
3,Automotive,1109,4.411903,59.004734
4,Industrial & Scientific,886,4.429684,54.669944
...,...,...,...,...
6651,Ginger Snaps,1,4.700000,31.020000
6652,Motion Controllers,1,4.500000,27.990000
6653,Umbrella Lights,1,4.400000,11.490000
6654,Fruit Leather,1,4.700000,17.800000
