In [1]:
import pandas as pd

In [2]:
# loading the data (might take some time)
df = pd.read_excel('https://drive.google.com/uc?export=download&id=1poIGWFAlRwCpSVmCYYyv9eDrsEVMkFli')

# overwriting column names
guessed_columns = [
  'category', 'quality', 'item_id', 'color', 'size',
  'description', 'color_long', '_', 'current_price', 
  'original_price', 'stock_quantity', 'interal_price', 
  '_', 'cost', '_', 'simple_sku', '_', 
  'enabled/disabled', 'image'
]
df.columns = guessed_columns

# removing unuseful columns
df.drop(columns=['_'], inplace=True)   # equivalent to `df = df.drop(columns=['_'])`

In [3]:
# keeping only active items
mask = df['enabled/disabled'] == "Enabled"
df = df[mask]

# removing sentitive and useless columns
df = df.drop(columns=['cost', 'description', 'original_price', 'interal_price', 'simple_sku', 'enabled/disabled'])

# renaming only remaning price
df = df.rename(columns={'current_price': 'price'})

# anonymizing item_id
df['item_id'] = df['item_id'].apply(hash)

In [4]:
# defining random images (same for same item_id)
def create_image_url(x):
    return f"https://picsum.photos/seed/{x}/200/300"

df['image'] = df.item_id.apply(create_image_url)

In [5]:
df

Unnamed: 0,category,quality,item_id,color,size,color_long,price,stock_quantity,image
64,AC,Other,5724536001677445461,DTE,EACH,DEEP TEAL,124.99,1439,https://picsum.photos/seed/5724536001677445461...
65,JK,DLX,6281272229728998706,KHA,S,KHAKI,124.99,0,https://picsum.photos/seed/6281272229728998706...
66,JK,DLX,6281272229728998706,KHA,XXS,KHAKI,124.99,0,https://picsum.photos/seed/6281272229728998706...
67,JK,DLX,-4191911794694246473,BBL,S,BRIGHT BLUE,124.99,0,https://picsum.photos/seed/-419191179469424647...
68,JK,DLX,-4191911794694246473,BLK,S,BLACK,124.99,1,https://picsum.photos/seed/-419191179469424647...
...,...,...,...,...,...,...,...,...,...
84230,AC,Other,-8604726361771542596,RED,EACH,RED,0.99,0,https://picsum.photos/seed/-860472636177154259...
84234,AC,Other,-2208939360132860094,GRN,EACH,GREEN,0.99,0,https://picsum.photos/seed/-220893936013286009...
84235,AC,Other,2468234437424912844,BLU,EACH,BLUE,0.49,0,https://picsum.photos/seed/2468234437424912844...
84236,AC,Other,-6919946680968433554,GRN,EACH,GREEN,0.50,0,https://picsum.photos/seed/-691994668096843355...


In [6]:
df.to_csv('dataset.csv', index=False)