# Data Research on Product, Class, and Recommended Content

In [None]:
'''
Import packages and display settings
'''
## suppress warnings
# NOTE: 'ignore' silences every warning category for the whole session,
# including pandas deprecation notices.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
## display settings
# Raise pandas' display limits so wide/long frames and .info() output are not truncated.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 10000000)
pd.set_option('max_info_columns', 10000000)
import os
## Assign dataset path
# NOTE(review): machine-specific absolute path; the notebook only runs where this folder exists.
# The working directory is switched to it so later cells can read the data files by bare file name.
path_data = 'C:/Users/Cody_Black/JupyterNotebook/Dataset'
os.chdir(path_data)
## Data viz lib
# Alternative interactive backend, left disabled:
# %matplotlib notebook
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.pyplot import xticks

In [2]:
## Load the two source datasets (file names are resolved against the current working directory).
# Pass the workbook path rather than an open file object so that pandas
# opens and closes the file itself and no handle is left dangling.
df_A_Test = pd.read_excel('Babson Hackathon _ Exhibits A _ B.xlsx',
                          sheet_name='Exhibit A | Test Results')
df_ContentRecomm = pd.read_csv('ContentRecommendationData.csv')

In [3]:
## Normalise column labels: every space in a column name becomes an underscore
for df in (df_A_Test, df_ContentRecomm):
    df.columns = df.columns.str.replace(" ", "_", regex=False)

## Tag each frame's columns with a dataset prefix
## (AT_ for df_A_Test, CR_ for df_ContentRecomm) so their origin stays obvious
df_A_Test = df_A_Test.add_prefix('AT_')
df_ContentRecomm = df_ContentRecomm.add_prefix('CR_')

In [4]:
## Convert identifier / label columns to pandas 'category' dtype.
# With a categorical column, value_counts() in later cells also lists the
# categories that have zero rows in the selected subset.
for col in ['AT_clid', 'AT_clmkcid', 'AT_clname', 'AT_biccontenttypename', 'AT_mkcname']:
    df_A_Test[col] = df_A_Test[col].astype('category')
# Each column is listed exactly once (a repeated name would only trigger a redundant conversion).
for col in ['CR_prsku', 'CR_clid', 'CR_biccontenttypename', 'CR_hasbiccontenttype']:
    df_ContentRecomm[col] = df_ContentRecomm[col].astype('category')

# Research on class and content

In [5]:
'''
Take the class 'Beds' as an example and count how many rows each content has under this class.

The result shows that each content appears only once.
'''
## In df_A_Test dataset
is_beds = df_A_Test['AT_clname'] == 'Beds'
df_A_Test.loc[is_beds, 'AT_biccontenttypename'].value_counts()

Materials / How Its Made           1
Compatibility & Adjustability      1
Dimensions                         1
Product Feature                    1
What's In the Box                  0
Assembly & Installation            0
Awards                             0
Cleaning & Care                    0
How to Use                         0
Product Capacity                   0
Weight                             0
Relative Size & Fit                0
Safety Certifications & Details    0
Touch & Feel                       0
Visual Details                     0
Warranty & Guarantees              0
About the Brand                    0
Name: AT_biccontenttypename, dtype: int64

In [6]:
## In df_ContentRecomm dataset
# df_ContentRecomm has no clname attribute, so look up the class id of 'Beds' in df_A_Test first
beds_clid = df_A_Test.loc[df_A_Test['AT_clname'] == 'Beds', 'AT_clid'].values[0]
print('The class id associated with Beds is', beds_clid)

# Filter on the looked-up id instead of a hard-coded literal, so the printed id
# and the id used for filtering can never disagree.
df_ContentRecomm.loc[df_ContentRecomm['CR_clid'] == beds_clid, 'CR_biccontenttypename'].value_counts()

The class id associated with Beds is 12


Relative Size & Fit                16743
Product Feature                    16743
Dimensions                         16743
Materials / How Its Made           16743
What's In the Box                      0
How to Use                             0
Assembly & Installation                0
Awards                                 0
Cleaning & Care                        0
Compatibility & Adjustability          0
Product Capacity                       0
Weight                                 0
Safety Certifications & Details        0
Touch & Feel                           0
Visual Details                         0
Warranty & Guarantees                  0
About the Brand                        0
Name: CR_biccontenttypename, dtype: int64

<h4>Every content has the same number of rows. Does this mean that, under the same class, every product has the same recommended contents?</h4>

In [7]:
'''
Multiplying the number of unique products (CR_prsku) by the number of
unique contents (CR_biccontenttypename) gives 16743 * 4 = 66972,
which equals the number of rows for this class.
This verifies that
1. There are multiple recommended contents
2. The recommended contents of a class are assigned to all products under that class
'''
beds_rows = df_ContentRecomm.loc[df_ContentRecomm['CR_clid'] == 12]
beds_rows.describe(include='all')

Unnamed: 0,CR_prsku,CR_clid,CR_percentilerank,CR_biccontenttypename,CR_hasbiccontenttype
count,66972,66972.0,58204.0,66972,66972.0
unique,16743,1.0,,4,2.0
top,DHOM1113,12.0,,Relative Size & Fit,0.0
freq,4,66972.0,,16743,66518.0
mean,,,0.858281,,
std,,,0.171722,,
min,,,0.0,,
25%,,,0.8,,
50%,,,0.91,,
75%,,,0.96,,


In [8]:
### Sort by products
# Order the class-12 rows by product, then by content, and preview the first two products (4 rows each)
beds_mask = df_ContentRecomm['CR_clid'] == 12
(
    df_ContentRecomm.loc[beds_mask]
    .sort_values(by=['CR_prsku', 'CR_biccontenttypename'])
    .head(8)
)

Unnamed: 0,CR_prsku,CR_clid,CR_percentilerank,CR_biccontenttypename,CR_hasbiccontenttype
40525,AAFM1014,12,0.96,Dimensions,0
7793,AAFM1014,12,0.96,Materials / How Its Made,0
236630,AAFM1014,12,0.96,Product Feature,0
269158,AAFM1014,12,0.96,Relative Size & Fit,0
68458,AAFM1015,12,0.94,Dimensions,0
227305,AAFM1015,12,0.94,Materials / How Its Made,0
218097,AAFM1015,12,0.94,Product Feature,0
2701,AAFM1015,12,0.94,Relative Size & Fit,0


<h4>Let's check all classes, including Beds</h4>

In [10]:
'''
The result shows that every class has 4 contents
and each content has the same number of products
'''
# Count the CR_prsku entries for each (class, content) pair
content_groups = df_ContentRecomm.groupby(['CR_clid', 'CR_biccontenttypename'])
temp = content_groups.agg(num_product=('CR_prsku', 'count'))
# Show only the pairs that actually have products
temp.loc[temp['num_product'] != 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,num_product
CR_clid,CR_biccontenttypename,Unnamed: 2_level_1
1,Materials / How Its Made,15949
1,Product Capacity,15949
1,Relative Size & Fit,15949
1,Visual Details,15949
2,Compatibility & Adjustability,16233
2,Dimensions,16233
2,Materials / How Its Made,16233
2,Product Feature,16233
3,Assembly & Installation,16683
3,Dimensions,16683


In [16]:
'''
Check whether every product has the same contents under the same class.
Since the dataset is large, we only test the first class as an example.
'''

df_ContentRecomm = df_ContentRecomm.sort_values(by=['CR_clid', 'CR_prsku'])

# Only test first class for example
for class_id in [df_ContentRecomm['CR_clid'].unique()[0]]:
    # Slice the class once; every lookup below works on this smaller frame
    # instead of re-filtering the full dataset for each product.
    df_class = df_ContentRecomm[df_ContentRecomm['CR_clid'] == class_id]
    _1stProduct = df_class['CR_prsku'].iloc[0]

    # Build the content set of every product in a single pass over the rows.
    # A dict preserves insertion order, so products are visited in the sorted row order.
    contents_by_product = {}
    for sku, content in zip(df_class['CR_prsku'], df_class['CR_biccontenttypename']):
        contents_by_product.setdefault(sku, set()).add(content)

    # The first product of the class is the reference every other product is compared to
    setContent_1stProduct = contents_by_product[_1stProduct]
    print('Contents of first product,', _1stProduct,
          'under class', class_id,
          'is:', setContent_1stProduct)
    for item, setContent in contents_by_product.items():
        if setContent != setContent_1stProduct:
            # Name the reference product of *this* class, not the first row of the whole frame
            print('product', item, 'does not have same content as', _1stProduct)
            break
    else:
        # Reached only when the loop finished without hitting a mismatch
        print('Prodcuts under', class_id, 'all have contents:', setContent_1stProduct)

Contents of first product, AAGE1000 under class 1 is: {'Materials / How Its Made', 'Product Capacity', 'Relative Size & Fit', 'Visual Details'}
Prodcuts under 1 all have contents: {'Materials / How Its Made', 'Product Capacity', 'Relative Size & Fit', 'Visual Details'}


# Conclusion

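Every product under the same class has the **same** four recommended contents. This was checked product by product for the first class; for the other classes, the equal per-content row counts are consistent with the same pattern.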