# DataAnalysis_Sales

In this notebook, the AMZ_sales table is analysed and cleaned. 
1. The data contains null values in various features. But only the desired features: quantity and item_price are handled. 
2. A new feature is generated by calculating the average price of a product (item_price /quantity). This helps in replacing null values when quantity is null but item_price is available or vice-versa for a product. 
3. The data points where either isn't available and when there isn't enough data available to calculate the item_price_avg, are dropped.

In [109]:
# Import Packages
import pandas as pd
import numpy as np

In [110]:
# Read data
sales_df = pd.read_csv("Data/sales.csv")

In [111]:
# Feature generation

# item_price_avg gives the average price of one product
sales_df["item_price_avg"] = sales_df["item_price"] / sales_df["quantity"]

In [112]:
# An example data
sales_df[sales_df["asin"] == "c685af0af33fc8f8e80894b91f073f4d"]

Unnamed: 0,mws_region,marketplace,purchase_date,asin,order_status,ship_country,fulfillment_channel,quantity,item_price,item_tax,item_promotion_discount,gift_wrap_price,gift_wrap_tax,shipping_price,shipping_tax,ship_promotion_discount,item_price_avg
0,mws_europe,DE,2020-01-03,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,,,,,,,16.99
6,mws_europe,DE,2020-01-21,c685af0af33fc8f8e80894b91f073f4d,Shipped,,Amazon,2.0,33.98,,,,,3.98,,3.98,16.99
95,mws_europe,DE,2020-02-12,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.99
115,mws_europe,DE,2020-02-15,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.99
139,mws_europe,DE,2020-02-18,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67111,mws_europe,DE,2021-12-25,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,4.0,47.96,,,,,4.00,,4.00,11.99
67224,mws_europe,DE,2021-12-26,c685af0af33fc8f8e80894b91f073f4d,Shipped,AT,Amazon,,23.98,,,,,,,,
67225,mws_europe,DE,2021-12-26,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,23.98,,,,,,,,11.99
67495,mws_europe,DE,2021-12-28,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,23.98,,,,,,,,11.99


### Fill Nan in item_price_avg 

In [113]:
# In order to fill Nan in the data, the average price of each asin is created.
asin_price_df = pd.DataFrame(sales_df.groupby("asin")["item_price_avg"].mean())

asin_price_df.reset_index(inplace=True)

asin_price_df

Unnamed: 0,asin,item_price_avg
0,0063d8d5ed3c6e718687c45cc3ca9559,14.990000
1,0149e42b4f2d5a620c4efa216cee8426,14.990000
2,018cd764d4b819f3ab1bbfd3562796f0,8.390000
3,019d12418a3350d3257f1efb336a26f8,9.990000
4,01f4981d25ec804a3e0cf220272ff1be,12.990000
...,...,...
565,fd366f08deeee435446593f5d1e31daa,9.704500
566,fd66c297fa92101ccd23c2d8b20abd84,14.323333
567,fdcadcd1098566e56a645e866c3419f8,9.900000
568,fef81f9f061688c9a15ac5f450fb1707,12.490000


In [114]:
# The Dataframes are merged to fill Nans
merged_df = (sales_df.merge(asin_price_df, left_on='asin', right_on='asin'))

In [115]:
# Example data
merged_df[merged_df["asin"] == "c685af0af33fc8f8e80894b91f073f4d"]

Unnamed: 0,mws_region,marketplace,purchase_date,asin,order_status,ship_country,fulfillment_channel,quantity,item_price,item_tax,item_promotion_discount,gift_wrap_price,gift_wrap_tax,shipping_price,shipping_tax,ship_promotion_discount,item_price_avg_x,item_price_avg_y
0,mws_europe,DE,2020-01-03,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,,,,,,,16.99,14.578621
1,mws_europe,DE,2020-01-21,c685af0af33fc8f8e80894b91f073f4d,Shipped,,Amazon,2.0,33.98,,,,,3.98,,3.98,16.99,14.578621
2,mws_europe,DE,2020-02-12,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.99,14.578621
3,mws_europe,DE,2020-02-15,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.99,14.578621
4,mws_europe,DE,2020-02-18,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.99,14.578621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574,mws_europe,DE,2021-12-25,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,4.0,47.96,,,,,4.00,,4.00,11.99,14.578621
575,mws_europe,DE,2021-12-26,c685af0af33fc8f8e80894b91f073f4d,Shipped,AT,Amazon,,23.98,,,,,,,,,14.578621
576,mws_europe,DE,2021-12-26,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,23.98,,,,,,,,11.99,14.578621
577,mws_europe,DE,2021-12-28,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,23.98,,,,,,,,11.99,14.578621


In [116]:
# Making a copy of the dataframe
df = merged_df.copy()

In [117]:
# Replacing Nans in the item_price_avg column in the actual dataframe by the created dataframe
df["item_price_avg_x"].fillna(df["item_price_avg_y"], inplace=True)

In [118]:
df[df["asin"] == "c685af0af33fc8f8e80894b91f073f4d"]

Unnamed: 0,mws_region,marketplace,purchase_date,asin,order_status,ship_country,fulfillment_channel,quantity,item_price,item_tax,item_promotion_discount,gift_wrap_price,gift_wrap_tax,shipping_price,shipping_tax,ship_promotion_discount,item_price_avg_x,item_price_avg_y
0,mws_europe,DE,2020-01-03,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,,,,,,,16.990000,14.578621
1,mws_europe,DE,2020-01-21,c685af0af33fc8f8e80894b91f073f4d,Shipped,,Amazon,2.0,33.98,,,,,3.98,,3.98,16.990000,14.578621
2,mws_europe,DE,2020-02-12,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.990000,14.578621
3,mws_europe,DE,2020-02-15,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.990000,14.578621
4,mws_europe,DE,2020-02-18,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.990000,14.578621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574,mws_europe,DE,2021-12-25,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,4.0,47.96,,,,,4.00,,4.00,11.990000,14.578621
575,mws_europe,DE,2021-12-26,c685af0af33fc8f8e80894b91f073f4d,Shipped,AT,Amazon,,23.98,,,,,,,,14.578621,14.578621
576,mws_europe,DE,2021-12-26,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,23.98,,,,,,,,11.990000,14.578621
577,mws_europe,DE,2021-12-28,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,23.98,,,,,,,,11.990000,14.578621


### Fill Nan in Quantity 

In [119]:
# Making a copy
sales_df = df.copy()

In [121]:
# Dropping the other column
sales_df.drop(["item_price_avg_y"], axis=1, inplace=True)

# Rename the column
sales_df.rename({'item_price_avg_x': 'item_price_avg'}, axis=1, inplace=True)

In [125]:
# Fill Nans by calculating quatity from item_price and item_price_avg
sales_df["quantity"].fillna(round(sales_df["item_price"]/sales_df["item_price_avg"]), inplace=True)

In [126]:
sales_df[sales_df["asin"] == "c685af0af33fc8f8e80894b91f073f4d"]

Unnamed: 0,mws_region,marketplace,purchase_date,asin,order_status,ship_country,fulfillment_channel,quantity,item_price,item_tax,item_promotion_discount,gift_wrap_price,gift_wrap_tax,shipping_price,shipping_tax,ship_promotion_discount,item_price_avg
0,mws_europe,DE,2020-01-03,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,,,,,,,16.990000
1,mws_europe,DE,2020-01-21,c685af0af33fc8f8e80894b91f073f4d,Shipped,,Amazon,2.0,33.98,,,,,3.98,,3.98,16.990000
2,mws_europe,DE,2020-02-12,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.990000
3,mws_europe,DE,2020-02-15,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.990000
4,mws_europe,DE,2020-02-18,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,33.98,,1.7,,,,,,16.990000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574,mws_europe,DE,2021-12-25,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,4.0,47.96,,,,,4.00,,4.00,11.990000
575,mws_europe,DE,2021-12-26,c685af0af33fc8f8e80894b91f073f4d,Shipped,AT,Amazon,2.0,23.98,,,,,,,,14.578621
576,mws_europe,DE,2021-12-26,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,23.98,,,,,,,,11.990000
577,mws_europe,DE,2021-12-28,c685af0af33fc8f8e80894b91f073f4d,Shipped,DE,Amazon,2.0,23.98,,,,,,,,11.990000


In [127]:
# Checking if there are more null values
sales_df[sales_df['quantity'].isnull()]

Unnamed: 0,mws_region,marketplace,purchase_date,asin,order_status,ship_country,fulfillment_channel,quantity,item_price,item_tax,item_promotion_discount,gift_wrap_price,gift_wrap_tax,shipping_price,shipping_tax,ship_promotion_discount,item_price_avg
34504,mws_europe,DE,2020-04-02,112675c4d2634444eaa46367ea34d0eb,Shipped,AT,Amazon,,13.98,,,,,,,,
35351,mws_europe,DE,2020-05-04,12f056b3b69e59c26ba07104b8f29f17,Shipped,LU,Amazon,,13.98,,,,,,,,
36733,mws_europe,DE,2020-05-27,1b7445450513d668209922f65a5c9ce1,Shipped,AT,Amazon,,39.98,,,,,,,,
44618,mws_europe,DE,2020-10-14,32d43cd9ac572c3b811fb0bcb2600d23,Shipped,AT,Amazon,,13.98,,,,,,,,


In [128]:
# Dropping the null values since they have only one entry and this data is not useful
sales_df.dropna(subset = ['quantity'], inplace=True)

### Fill item_price_avg

Some values in item_price_avg are 0.0 since the quantity was Nan earlier, now we replace them with the mean_value for the specific 'Asin'

In [134]:
# Replacing 0.0 in item_price_avg by the mean price of the "asin"
sales_df['item_price_avg'] = sales_df['item_price_avg'].replace(0.0, np.nan)
sales_df['item_price_avg'] = sales_df['item_price_avg'].fillna(sales_df.groupby('asin')['item_price_avg'].transform('mean'))

### Fill item_price

In [130]:
# Check for item_price = 0
sales_df[sales_df['item_price']==0.0]

Unnamed: 0,mws_region,marketplace,purchase_date,asin,order_status,ship_country,fulfillment_channel,quantity,item_price,item_tax,item_promotion_discount,gift_wrap_price,gift_wrap_tax,shipping_price,shipping_tax,ship_promotion_discount,item_price_avg
27047,mws_europe,DE,2021-04-28,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,0.0,,,,,,,,0.0
27048,mws_europe,DE,2021-04-29,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,8.0,0.0,,,,,,,,0.0
27049,mws_europe,DE,2021-05-01,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,6.0,0.0,,,,,,,,0.0
27050,mws_europe,DE,2021-05-09,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,0.0,,,,,,,,0.0
27051,mws_europe,DE,2021-05-12,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,4.0,0.0,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67735,mws_europe,DE,2021-12-17,6cc3169f6be587701122893ca9e5cab9,Shipped,DE,Amazon,2.0,0.0,,,,,,,,0.0
67736,mws_europe,DE,2021-12-18,6cc3169f6be587701122893ca9e5cab9,Shipped,DE,Amazon,2.0,0.0,,,,,,,,0.0
67738,mws_europe,DE,2021-12-23,6cc3169f6be587701122893ca9e5cab9,Shipped,DE,Amazon,2.0,0.0,,,,,,,,0.0
67739,mws_europe,DE,2021-12-29,6cc3169f6be587701122893ca9e5cab9,Shipped,DE,Amazon,4.0,0.0,,,,,,,,0.0


In [136]:
sales_df[sales_df["asin"] == "1bcb5ecc056b792b2dcaad68557992db"]

Unnamed: 0,mws_region,marketplace,purchase_date,asin,order_status,ship_country,fulfillment_channel,quantity,item_price,item_tax,item_promotion_discount,gift_wrap_price,gift_wrap_tax,shipping_price,shipping_tax,ship_promotion_discount,item_price_avg
27038,mws_europe,DE,2020-02-29,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,119.98,,83.98,,,7.98,,7.98,59.99
27039,mws_europe,DE,2020-05-01,1bcb5ecc056b792b2dcaad68557992db,Shipped,AT,Amazon,2.0,119.98,,,,,,,,50.055385
27040,mws_europe,DE,2020-06-28,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,119.98,,,,,3.98,,3.98,59.99
27041,mws_europe,DE,2020-07-18,1bcb5ecc056b792b2dcaad68557992db,Shipped,AT,Amazon,2.0,119.98,,,,,,,,50.055385
27042,mws_europe,DE,2020-10-01,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,64.96,,,,,7.78,,7.78,32.48
27043,mws_europe,DE,2020-12-14,1bcb5ecc056b792b2dcaad68557992db,Shipped,AT,Amazon,2.0,119.98,,,,,,,,50.055385
27044,mws_europe,DE,2021-01-31,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,119.98,,,,,9.98,,9.98,59.99
27045,mws_europe,DE,2021-03-04,1bcb5ecc056b792b2dcaad68557992db,Shipped,AT,Amazon,2.0,119.98,,,,,7.98,,7.98,50.055385
27046,mws_europe,DE,2021-03-16,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,119.98,,,,,,,,59.99
27047,mws_europe,DE,2021-04-28,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,0.0,,,,,,,,57.039857


In [137]:
# # Replacing 0.0 in item_price by the mean price of the "asin"
sales_df['item_price'] = sales_df['item_price'].replace(0.0, np.nan)
sales_df['item_price'] = sales_df['item_price'].fillna(sales_df["quantity"] * sales_df["item_price_avg"])

In [138]:
sales_df[sales_df["asin"] == "1bcb5ecc056b792b2dcaad68557992db"]

Unnamed: 0,mws_region,marketplace,purchase_date,asin,order_status,ship_country,fulfillment_channel,quantity,item_price,item_tax,item_promotion_discount,gift_wrap_price,gift_wrap_tax,shipping_price,shipping_tax,ship_promotion_discount,item_price_avg
27038,mws_europe,DE,2020-02-29,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,119.98,,83.98,,,7.98,,7.98,59.99
27039,mws_europe,DE,2020-05-01,1bcb5ecc056b792b2dcaad68557992db,Shipped,AT,Amazon,2.0,119.98,,,,,,,,50.055385
27040,mws_europe,DE,2020-06-28,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,119.98,,,,,3.98,,3.98,59.99
27041,mws_europe,DE,2020-07-18,1bcb5ecc056b792b2dcaad68557992db,Shipped,AT,Amazon,2.0,119.98,,,,,,,,50.055385
27042,mws_europe,DE,2020-10-01,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,64.96,,,,,7.78,,7.78,32.48
27043,mws_europe,DE,2020-12-14,1bcb5ecc056b792b2dcaad68557992db,Shipped,AT,Amazon,2.0,119.98,,,,,,,,50.055385
27044,mws_europe,DE,2021-01-31,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,119.98,,,,,9.98,,9.98,59.99
27045,mws_europe,DE,2021-03-04,1bcb5ecc056b792b2dcaad68557992db,Shipped,AT,Amazon,2.0,119.98,,,,,7.98,,7.98,50.055385
27046,mws_europe,DE,2021-03-16,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,119.98,,,,,,,,59.99
27047,mws_europe,DE,2021-04-28,1bcb5ecc056b792b2dcaad68557992db,Shipped,DE,Amazon,2.0,114.079714,,,,,,,,57.039857


In [140]:
# Save as a CSV file
sales_df.to_csv('Data/Cleaned_sales.csv')