In [23]:
'''
Import packages and display settings
'''
## supress warnings
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
## display settings
pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 10000000)
# pd.set_option('max_info_columns', 10000000)
import os
## Assign dataset path
path_data = 'C:/Users/Cody_Black/JupyterNotebook/Dataset'
os.chdir(path_data)

In [24]:
'''
Load data sets
'''

df_A_Test = pd.read_excel(open('Babson Hackathon _ Exhibits A _ B.xlsx', 'rb'), sheet_name='Exhibit A | Test Results') 
# df_B_Content = pd.read_excel(open('Babson Hackathon _ Exhibits A _ B.xlsx', 'rb'), sheet_name='Exhibit B | Content Type Defini')  
df_ContentRecomm = pd.read_csv('ContentRecommendationData.csv')
df_ModuleCount = pd.read_csv('ModuleCountData.csv')
# df_Supplier = pd.read_csv('SupplierData.csv')
# df_Supplier_UK = pd.read_csv('SupplierData_Wayfair UK.csv')
df_Supplier_US = pd.read_csv('SupplierData_Wayfair.csv')

In [25]:
'''
Revise column names
'''
for df in [df_A_Test, df_ContentRecomm, df_ModuleCount, df_Supplier_US]:
    df.columns = df.columns.str.replace(" ", "_")


## Add prefix to columns in each df
df_Supplier_US = df_Supplier_US.add_prefix('SP_')
df_ContentRecomm = df_ContentRecomm.add_prefix('CR_')
df_ModuleCount = df_ModuleCount.add_prefix('MC_')
df_A_Test = df_A_Test.add_prefix('AT_')

In [26]:
'''
Merging purpose: Create class name for df_ContentRecomm
'''

### Since there is no class name in df_ContentRecomm, we use supplier data set to fill in class name by using clid as keys
df_Supplier = pd.read_csv('SupplierData.csv')
df_Supplier = df_Supplier.add_prefix('SP_')
df_Supplier_subset = df_Supplier[['SP_clid', 'SP_clname']]
df_Supplier_subset.drop_duplicates(subset =['SP_clid', 'SP_clname'],
                                      keep = 'first', inplace = True) 
df_merge = pd.merge(df_ContentRecomm, df_Supplier_subset,
                   how='left',
                   left_on='CR_clid', right_on='SP_clid')
# print('Check if there is any row in CR can not match with SP: ')
# print(df_merge[df_merge['SP_clname'].isnull()])
df_ContentRecomm['CR_clname'] = df_merge['SP_clname']
del df_Supplier, df_Supplier_subset, df_merge

'''
Re-order columns
'''
# cols = df_ContentRecomm.columns.tolist()
cols= ['CR_prsku',
       'CR_clid',
       'CR_clname',
       'CR_percentilerank',
       'CR_biccontenttypename',
       'CR_hasbiccontenttype']
df_ContentRecomm = df_ContentRecomm[cols]

<code>Check if there is any row in CR can not match with SP: 
Empty DataFrame
Columns: [CR_prsku, CR_clid, CR_percentilerank, CR_biccontenttypename, CR_hasbiccontenttype, SP_clid, SP_clname]
Index: []</code>

In [27]:
'''
recast types
'''

for col in ['AT_clmkcid']:
    df_A_Test[col] = df_A_Test[col].astype('category')
for col in ['CR_prsku', 'CR_clid', 'CR_hasbiccontenttype']:
    df_ContentRecomm[col] = df_ContentRecomm[col].astype('category')
for col in ['MC_prsku', 'MC_clid']:
    df_ModuleCount[col] = df_ModuleCount[col].astype('category')
for col in ['SP_soid', 'SP_clid', 'SP_suid']: 
    df_Supplier_US[col] = df_Supplier_US[col].astype('category')
# df_Supplier_US['SP_iscurrent'] = df_Supplier_US['SP_iscurrent'].astype('int')

<code>array(['Bedding Sets', 'Vanities', 'Outdoor Conversation Sets',
       'End Tables', 'Area Rugs', 'Cribs', 'Kids Dressers & Chests',
       'Wall Art', 'TV Stands & Entertainment Centers', 'Headboards',
       'Accent Chests / Cabinets', 'Desks', 'Sofas', 'Dressers & Chests',
       'Bar Stools', 'Dining Table Sets', 'Garage Storage Cabinets',
       'Beds', 'Chandeliers', 'Air Fryers', 'Mattress Toppers and Pads',
       'Classroom Storage', 'Adjustable Beds', 'Pool Tables',
       'Filing Cabinets', 'Outdoor Fireplaces', 'Bathroom Storage',
       'Kitchen Islands', 'Interior Doors', 'Reception Seating Chairs',
       'Ceiling Fans', 'Gliders', 'Adirondack Chairs', 'Patio Sofas',
       'Kids Beds', 'Pantry Cabinets', 'Charcoal Grills',
       'Wine Refrigerators', 'Tubs And Whirlpools',
       'Innerspring Mattresses', 'Gas Grills',
       'Cat Litter Boxes & Litter Box Enclosures', 'Ranges', 'Mantels',
       'Electric Grills', 'Swing Sets & Playgrounds', 'Smokers',
       'Wood Pellet Grills'], dtype=object)</code>

In [28]:
'''
Filtering for analysis purpose
'''

### Drop class id since it can not match with class in in supplier data set 
# df_A_Test = df_A_Test.drop(columns='AT_clid')

### Filter out suid = 1, since it is aggregation of the other suppliers
## Save it to new dataframe which uses _m as modified
df_Supplier_US_m = df_Supplier_US[df_Supplier_US['SP_suid']!=1]

### Select rows with is_current = 1 where is our current interest target
df_Supplier_US_m = df_Supplier_US_m[df_Supplier_US_m['SP_iscurrent']==1]

### Select product under class TV Stands & Entertainment Centers, since it is
# a simpler component compared to other classes
df_Supplier_US_m = df_Supplier_US_m[df_Supplier_US_m['SP_clname']=='TV Stands & Entertainment Centers']
df_ContentRecomm_m = df_ContentRecomm[df_ContentRecomm['CR_clname']=='TV Stands & Entertainment Centers']
df_A_Test_m = df_A_Test[df_A_Test['AT_clname']=='TV Stands & Entertainment Centers']
df_ModuleCount_m = df_ModuleCount[df_ModuleCount['MC_clname']=='TV Stands & Entertainment Centers']

### Only select products including in df_Supplier_US data set 
df_ContentRecomm_m = df_ContentRecomm_m[df_ContentRecomm_m['CR_prsku'].isin(set(df_Supplier_US_m['SP_prsku'].values).intersection(set(df_ContentRecomm_m['CR_prsku'].values)))]
df_ModuleCount_m = df_ModuleCount_m[df_ModuleCount_m['MC_prsku'].isin(set(df_Supplier_US['SP_prsku'].values).intersection(set(df_ModuleCount_m['MC_prsku'].values)))]

In [29]:
[df_Supplier_US_m.SP_prsku.nunique(), df_ContentRecomm_m.CR_prsku.nunique(), df_ModuleCount_m.MC_prsku.nunique()]

[5562, 5562, 531]

<code>[(7344, 20), (22248, 6), (491, 5), (4, 5)]<code/>

<h3>Merge all data sets

<h5>Merge df_ModuleCount with df_A_Test</h5>
    
First, let's see a row from each dataframe

In [30]:
pd.concat([df_ModuleCount_m.head(1),
          df_A_Test_m.head(1)])

Unnamed: 0,MC_prsku,MC_modulecount,MC_clid,MC_clname,MC_mkcname,AT_clid,AT_clname,AT_biccontenttypename,AT_Conversion_Rate_Change,AT_clmkcid,AT_mkcname
408,BYDT9752,8.0,1.0,TV Stands & Entertainment Centers,Entertainment Furniture,,,,,,
0,,,,,,6.0,TV Stands & Entertainment Centers,Materials / How Its Made,0.03,7.0,Entertainment Furniture


In [31]:
'''
Merge df_ModuleCount with df_A_Test
'''

df_merge = pd.merge(df_ModuleCount_m.rename(columns={'MC_clname':'clname'}),
                    df_A_Test_m.rename(columns={'AT_clname':'clname'}),
                    how = 'left',
                    on = 'clname').sort_values(by=['MC_prsku'])

In [32]:
'''
See few rows from the merging result
'''
df_merge.iloc[[0,5,9], :]

Unnamed: 0,MC_prsku,MC_modulecount,MC_clid,clname,MC_mkcname,AT_clid,AT_biccontenttypename,AT_Conversion_Rate_Change,AT_clmkcid,AT_mkcname
4,AAGE1000,4,1,TV Stands & Entertainment Centers,Entertainment Furniture,6,Materials / How Its Made,0.03,7,Entertainment Furniture
1206,AAGE1001,5,1,TV Stands & Entertainment Centers,Entertainment Furniture,6,Relative Size & Fit,0.05,7,Entertainment Furniture
2001,AAGE1002,5,1,TV Stands & Entertainment Centers,Entertainment Furniture,6,Product Capacity,-0.01,7,Entertainment Furniture


<code>True    1964
dtype: int64</code>

In [33]:
'''
Drop column MC_mkcname
'''
df_merge.drop(columns=['MC_mkcname'], inplace=True)

<h5>Next, merge df_ContentRecomm_US with df_merge</h5>
    
Again, let's first see a row from each dataframe

In [34]:
pd.concat([df_ContentRecomm_m.head(1),
          df_merge.head(1)])

Unnamed: 0,CR_prsku,CR_clid,CR_clname,CR_percentilerank,CR_biccontenttypename,CR_hasbiccontenttype,MC_prsku,MC_modulecount,MC_clid,clname,AT_clid,AT_biccontenttypename,AT_Conversion_Rate_Change,AT_clmkcid,AT_mkcname
139,AAGE1020,1.0,TV Stands & Entertainment Centers,,Materials / How Its Made,0.0,,,,,,,,,
4,,,,,,,AAGE1000,4.0,1.0,TV Stands & Entertainment Centers,6.0,Materials / How Its Made,0.03,7.0,Entertainment Furniture


In [35]:
df_merge = pd.merge(df_ContentRecomm_m.rename(columns={'CR_prsku':'prsku', 'CR_clid':'clid', 'CR_clname': 'clname'}), 
                    df_merge.rename(columns={'MC_prsku':'prsku', 'MC_clid':'clid'}),
                    how = 'outer',
                    left_on = ['prsku', 'clid', 'clname', 'CR_biccontenttypename'], right_on = ['prsku', 'clid', 'clname', 'AT_biccontenttypename']).sort_values(by=['prsku'])

In [36]:
'''
See few rows from the merging result
'''

pd.concat([df_merge[df_merge['AT_biccontenttypename'].notnull()].head(4),
           df_merge[df_merge['AT_biccontenttypename'].isnull()].head(4)])

Unnamed: 0,prsku,clid,clname,CR_percentilerank,CR_biccontenttypename,CR_hasbiccontenttype,MC_modulecount,AT_clid,AT_biccontenttypename,AT_Conversion_Rate_Change,AT_clmkcid,AT_mkcname
152,AAGE1000,1,TV Stands & Entertainment Centers,,Relative Size & Fit,1,4.0,6.0,Relative Size & Fit,0.05,7.0,Entertainment Furniture
176,AAGE1000,1,TV Stands & Entertainment Centers,,Visual Details,0,4.0,6.0,Visual Details,0.02,7.0,Entertainment Furniture
1047,AAGE1000,1,TV Stands & Entertainment Centers,,Materials / How Its Made,0,4.0,6.0,Materials / How Its Made,0.03,7.0,Entertainment Furniture
373,AAGE1000,1,TV Stands & Entertainment Centers,,Product Capacity,0,4.0,6.0,Product Capacity,-0.01,7.0,Entertainment Furniture
307,AAI1628,1,TV Stands & Entertainment Centers,,Product Capacity,0,,,,,,
682,AAI1628,1,TV Stands & Entertainment Centers,,Materials / How Its Made,0,,,,,,
935,AAI1628,1,TV Stands & Entertainment Centers,,Relative Size & Fit,0,,,,,,
542,AAI1628,1,TV Stands & Entertainment Centers,,Visual Details,0,,,,,,


In [37]:
'''
Check how many unique values in each column in order to inspect the merging result
'''
df_merge.nunique(dropna=True)

prsku                        5602
clid                            1
clname                          1
CR_percentilerank              44
CR_biccontenttypename           4
CR_hasbiccontenttype            2
MC_modulecount                 14
AT_clid                         1
AT_biccontenttypename           4
AT_Conversion_Rate_Change       4
AT_clmkcid                      1
AT_mkcname                      1
dtype: int64

<h5>Next, merge df_Supplier_US with df_merge</h5>
    
Again, let's first see a row from each dataframe

In [38]:
pd.concat([df_Supplier_US_m.head(1),
          df_merge.head(1)])

Unnamed: 0,SP_prsku,SP_soid,SP_soname,SP_clid,SP_clname,SP_mkcname,SP_suid,SP_iswaymore,SP_addedtocart,SP_placedorder,SP_trafficcount,SP_iscurrent,SP_videocount,SP_grs1month,SP_grs2month,SP_grs3month,SP_grs12month,SP_weightedavgscore,SP_percentilerank,SP_expectedgrs,prsku,clid,clname,CR_percentilerank,CR_biccontenttypename,CR_hasbiccontenttype,MC_modulecount,AT_clid,AT_biccontenttypename,AT_Conversion_Rate_Change,AT_clmkcid,AT_mkcname
96,AAMD1072,1.0,Wayfair,1.0,TV Stands & Entertainment Centers,Entertainment Furniture,37.0,0.0,7.0,0.0,235.0,1.0,0.0,0.0,554.5812,554.5812,2744.168132,1.42,0.92,4123.9774,,,,,,,,,,,,
152,,,,,,,,,,,,,,,,,,,,,AAGE1000,1.0,TV Stands & Entertainment Centers,,Relative Size & Fit,1.0,4.0,6.0,Relative Size & Fit,0.05,7.0,Entertainment Furniture


In [39]:
'''
Merge df_Supplier_US with df_merge
'''

df_merge = pd.merge(df_Supplier_US_m.rename(columns={'SP_prsku':'prsku', 'SP_clid':'clid', 'SP_clname': 'clname'}), 
                    df_merge,
                    how = 'outer',
                    on = ['prsku', 'clid', 'clname']
                   ).sort_values(by=['prsku'])

# df_merge2 = pd.merge(df_Supplier_US.rename(columns={'SP_prsku':'prsku', 'SP_clid':'clid', 'SP_clname': 'clname', 'SP_mkcname': 'mkcname'}), 
#                     df_merge.rename(columns={'AT_mkcname': 'mkcname'}),
#                    how = 'outer',
#                    on = ['prsku', 'clid', 'clname', 'mkcname']).sort_values(by=['prsku'])

In [40]:
'''
Check how many unique values in each column in order to inspect the merging result
'''

df_merge.nunique(dropna=True)
# df_merge2.info()
# df_merge2.nunique(dropna=False)
# df_merge2['AT_mkcname'].value_counts(dropna=False)
# df_Supplier_US['SP_mkcname'].value_counts(dropna=False)

prsku                        5602
SP_soid                         1
SP_soname                       1
clid                            1
clname                          1
SP_mkcname                      1
SP_suid                       469
SP_iswaymore                    2
SP_addedtocart                459
SP_placedorder                169
SP_trafficcount              1761
SP_iscurrent                    1
SP_videocount                  10
SP_grs1month                 2923
SP_grs2month                 3763
SP_grs3month                 4243
SP_grs12month                5984
SP_weightedavgscore            68
SP_percentilerank              44
SP_expectedgrs               3281
CR_percentilerank              44
CR_biccontenttypename           4
CR_hasbiccontenttype            2
MC_modulecount                 14
AT_clid                         1
AT_biccontenttypename           4
AT_Conversion_Rate_Change       4
AT_clmkcid                      1
AT_mkcname                      1
dtype: int64

In [41]:
'''
Drop columns with identical value as well as zero information
'''

df_merge.drop(columns=['SP_soid', 'SP_soname', 'AT_mkcname', 'AT_clmkcid'], inplace=True)

In [42]:
'''
Re order columns
'''
cols = \
['prsku',
 'clname',
 'SP_suid',
 'SP_iswaymore',
 'CR_percentilerank',
 'CR_biccontenttypename',
 'CR_hasbiccontenttype',
 'AT_biccontenttypename', 
 'AT_Conversion_Rate_Change',
 'SP_addedtocart',
 'SP_placedorder',
 'SP_trafficcount',
 'SP_videocount',
 'MC_modulecount',
 'SP_grs1month',
 'SP_grs2month',
 'SP_grs3month',
 'SP_grs12month',
 'SP_weightedavgscore',
 'SP_percentilerank',
 'SP_expectedgrs',
 'clid',
 'SP_mkcname',
 'SP_iscurrent',
 ]
df_merge = df_merge[cols]
# df_merge.columns.to_list()

In [24]:
'''
Delete unncessary data
'''
df = df_merge
del [col,
     cols,
     df_ContentRecomm_m,
     df_ModuleCount_m,
     df_Supplier_US_m,
     df_merge,]
# %who_ls