# Imports

In [2]:
import os
import boto3
import json
import numpy as np
import pandas as pd

# Data Processing

### Convert CSV into JSONs

In [3]:
columns = ['name','description','itemid','colorname','Classification']

df = pd.read_csv('productsclassified.csv')[columns]
df.head()

Unnamed: 0,name,description,itemid,colorname,Classification
0,Men's Vasque Talus Trek Waterproof Hiking Boots,['These waterproof hiking boots for men are ru...,['297896'],['Slate Brown/Chili Pepper'],Boots
1,Ridge Runner Light-Up Running Jacket,"[""The next level of weather protection. This l...",['502860'],['Platinum'],Jackets
2,Men's Merrell Moab 2 Ventilated Trail Shoes,['Great grip and extra breathability make thes...,['304809'],['Walnut'],Shoes
3,Streamlight Ultra II Switch Fly Rods,"[""This lightweight fly rod delivers outstandin...",[],['Green'],Fishing Tools
4,"Wool Hooked Throw Pillow, Wooden Paddles","['Add a pop of paddling fun to your bed, chair...",['507102'],[],Pillows


In [4]:
df['itemid'] = df['itemid'].apply(lambda x: x.replace('"','').replace("'",'').replace('[','').replace(']',''))
df['itemid'] = df['itemid'].apply(lambda x: x.split(',')[0] if len(x)>0 else 0)
df.head()

Unnamed: 0,name,description,itemid,colorname,Classification
0,Men's Vasque Talus Trek Waterproof Hiking Boots,['These waterproof hiking boots for men are ru...,297896,['Slate Brown/Chili Pepper'],Boots
1,Ridge Runner Light-Up Running Jacket,"[""The next level of weather protection. This l...",502860,['Platinum'],Jackets
2,Men's Merrell Moab 2 Ventilated Trail Shoes,['Great grip and extra breathability make thes...,304809,['Walnut'],Shoes
3,Streamlight Ultra II Switch Fly Rods,"[""This lightweight fly rod delivers outstandin...",0,['Green'],Fishing Tools
4,"Wool Hooked Throw Pillow, Wooden Paddles","['Add a pop of paddling fun to your bed, chair...",507102,[],Pillows


In [5]:
df['colorname'] = df['colorname'].apply(lambda x: x.replace('"','').replace("'",'').replace('[','').replace(']',''))
df['colorname'] = df['colorname'].apply(lambda x: x.split(',') if len(x)>0 else [])
df.head()

Unnamed: 0,name,description,itemid,colorname,Classification
0,Men's Vasque Talus Trek Waterproof Hiking Boots,['These waterproof hiking boots for men are ru...,297896,[Slate Brown/Chili Pepper],Boots
1,Ridge Runner Light-Up Running Jacket,"[""The next level of weather protection. This l...",502860,[Platinum],Jackets
2,Men's Merrell Moab 2 Ventilated Trail Shoes,['Great grip and extra breathability make thes...,304809,[Walnut],Shoes
3,Streamlight Ultra II Switch Fly Rods,"[""This lightweight fly rod delivers outstandin...",0,[Green],Fishing Tools
4,"Wool Hooked Throw Pillow, Wooden Paddles","['Add a pop of paddling fun to your bed, chair...",507102,[],Pillows


In [6]:
df['description'] = df['description'].apply(lambda x: x.replace('"','').replace("'",'').replace('[','').replace(']',''))
df['description'] = df['description'].apply(lambda x: x.split(',')[0] if len(x)>0 else '')
df.head()

Unnamed: 0,name,description,itemid,colorname,Classification
0,Men's Vasque Talus Trek Waterproof Hiking Boots,These waterproof hiking boots for men are rugg...,297896,[Slate Brown/Chili Pepper],Boots
1,Ridge Runner Light-Up Running Jacket,The next level of weather protection. This lig...,502860,[Platinum],Jackets
2,Men's Merrell Moab 2 Ventilated Trail Shoes,Great grip and extra breathability make these ...,304809,[Walnut],Shoes
3,Streamlight Ultra II Switch Fly Rods,This lightweight fly rod delivers outstanding ...,0,[Green],Fishing Tools
4,"Wool Hooked Throw Pillow, Wooden Paddles",Add a pop of paddling fun to your bed,507102,[],Pillows


In [7]:
df = df.rename(columns={'name':'product','description':'description','itemid':'id','colorname':'colors','Classification':'category'})
df.head()

Unnamed: 0,product,description,id,colors,category
0,Men's Vasque Talus Trek Waterproof Hiking Boots,These waterproof hiking boots for men are rugg...,297896,[Slate Brown/Chili Pepper],Boots
1,Ridge Runner Light-Up Running Jacket,The next level of weather protection. This lig...,502860,[Platinum],Jackets
2,Men's Merrell Moab 2 Ventilated Trail Shoes,Great grip and extra breathability make these ...,304809,[Walnut],Shoes
3,Streamlight Ultra II Switch Fly Rods,This lightweight fly rod delivers outstanding ...,0,[Green],Fishing Tools
4,"Wool Hooked Throw Pillow, Wooden Paddles",Add a pop of paddling fun to your bed,507102,[],Pillows


### Attach Mock User Profile

In [8]:
user_profiles = ['reseller', 'consumer', 'distributer']
df['user_profile'] = np.random.choice(user_profiles, len(df))
df.head()

Unnamed: 0,product,description,id,colors,category,user_profile
0,Men's Vasque Talus Trek Waterproof Hiking Boots,These waterproof hiking boots for men are rugg...,297896,[Slate Brown/Chili Pepper],Boots,consumer
1,Ridge Runner Light-Up Running Jacket,The next level of weather protection. This lig...,502860,[Platinum],Jackets,distributer
2,Men's Merrell Moab 2 Ventilated Trail Shoes,Great grip and extra breathability make these ...,304809,[Walnut],Shoes,distributer
3,Streamlight Ultra II Switch Fly Rods,This lightweight fly rod delivers outstanding ...,0,[Green],Fishing Tools,distributer
4,"Wool Hooked Throw Pillow, Wooden Paddles",Add a pop of paddling fun to your bed,507102,[],Pillows,distributer


### Store Documents in S3

1. Save and upload the Document-JSON to S3
2. Create, save, and upload the metadata-JSON to S3 in a folder

In [9]:
bucket = 'platform-product-data-20230313'
s3 = boto3.resource('s3')


for i,row in df.iterrows():
    # Process Document
    filename = f'doc_{i}_prod_{row.id}.json'
    js = {'id':row.id, 'product':row['product'], 'description':row.description}
    #js = {'id':row.id, 'product':row['product'], 'description':row.description, 'colors':row.colors, 'category':row.category}
    json.dump(js, open(f'jsons/{filename}', 'w'), indent=4)
    s3.meta.client.upload_file(f'jsons/{filename}', bucket, filename)
    
    # Process Metadata
    colors_list = row.colors[:10]
    filename = f'{filename}.metadata.json'
    md = {
        "DocumentId": f'doc_{i}_prod_{row.id}.json',
        "Attributes": {
            "_category": row.category,
            'colors':colors_list  if len(colors_list) > 0 else ['No color was provided'],
            #"description": row.description if len(row.description) > 10 else 'There was no description provided.',
            "user_profile": row.user_profile
        },
        "Title": row['product'],
    }
    json.dump(md, open(f'jsons/{filename}', 'w'), indent=4)
    s3.meta.client.upload_file(f'jsons/{filename}', bucket, f'metadata/{filename}')