# Explore merchant data

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib.ticker as tick
import seaborn as sb

### Import data

In [93]:
orders = pd.read_csv("/Users/daniellaframboise/Documents/Internships/2019/data/orders.csv")
order_items = pd.read_csv("/Users/daniellaframboise/Documents/Internships/2019/data/order_items.csv")

In [94]:
orders.head()

Unnamed: 0,id,customer,placed_on,total_cost,total_tax,subtotal
0,1,4,2017-03-05 16:55:52,884.34,101.74,782.6
1,2,22,2015-09-05 10:20:52,972.4,111.87,860.53
2,3,19,2016-01-24 11:38:27,105.91,12.18,93.73
3,4,10,2016-04-25 10:43:25,647.2,74.46,572.74
4,5,4,2017-12-02 06:27:31,343.27,39.49,303.78


In [95]:
order_items.head()

Unnamed: 0,id,order,product_variation,price,quantity
0,1,1,66,9.61,3
1,2,1,1,11.53,2
2,3,1,30,15.35,3
3,4,1,60,72.47,2
4,5,1,13,125.26,3


### Explore merging data

In [96]:
data = orders.merge(order_items, left_on='id', right_on='order')

In [97]:
data.head()

Unnamed: 0,id_x,customer,placed_on,total_cost,total_tax,subtotal,id_y,order,product_variation,price,quantity
0,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,1,1,66,9.61,3
1,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,2,1,1,11.53,2
2,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,3,1,30,15.35,3
3,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,4,1,60,72.47,2
4,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,5,1,13,125.26,3


In [98]:
data.dtypes

id_x                   int64
customer               int64
placed_on             object
total_cost           float64
total_tax            float64
subtotal             float64
id_y                   int64
order                  int64
product_variation      int64
price                float64
quantity               int64
dtype: object

In [99]:
data['placed_on'] = pd.to_datetime(data['placed_on'])

In [100]:
data.dtypes

id_x                          int64
customer                      int64
placed_on            datetime64[ns]
total_cost                  float64
total_tax                   float64
subtotal                    float64
id_y                          int64
order                         int64
product_variation             int64
price                       float64
quantity                      int64
dtype: object

In [101]:
data = data.rename(index=str, columns={"id_x": "id","id_y": "order_item_id"})

In [102]:
data.head()

Unnamed: 0,id,customer,placed_on,total_cost,total_tax,subtotal,order_item_id,order,product_variation,price,quantity
0,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,1,1,66,9.61,3
1,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,2,1,1,11.53,2
2,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,3,1,30,15.35,3
3,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,4,1,60,72.47,2
4,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,5,1,13,125.26,3


### Import product data

In [103]:
products = pd.read_csv("/Users/daniellaframboise/Documents/Internships/2019/data/products.csv")
product_variations = pd.read_csv("/Users/daniellaframboise/Documents/Internships/2019/data/product_variations.csv")

In [104]:
products.head()

Unnamed: 0,id,merchant,name,description,price,sku
0,1,2,Large table,A great table,102.12,8437152631
1,2,11,Cotton t-shirt,100% cotton t-shirt,11.99,1437367336
2,3,3,Wool coat,Sturdy 100% wool,400.5,3977592277
3,4,4,See-through corded telephone,,26.78,8241503020
4,5,3,V-neck t-shirt,100% cotton,10.0,4556771762


In [105]:
product_variations.head()

Unnamed: 0,id,product,name
0,1,2,Red
1,2,11,Small
2,3,10,Green
3,4,23,Checkered
4,5,10,Blue


In [106]:
# Drop columns that won't be used
products.drop(['description', 'sku'], inplace=True, axis=1)

In [107]:
products.head()

Unnamed: 0,id,merchant,name,price
0,1,2,Large table,102.12
1,2,11,Cotton t-shirt,11.99
2,3,3,Wool coat,400.5
3,4,4,See-through corded telephone,26.78
4,5,3,V-neck t-shirt,10.0


### Merge data with product_variations

In [108]:
data = data.merge(product_variations, left_on='product_variation', right_on='id')

In [109]:
data.head()

Unnamed: 0,id_x,customer,placed_on,total_cost,total_tax,subtotal,order_item_id,order,product_variation,price,quantity,id_y,product,name
0,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,1,1,66,9.61,3,66,37,Sandalwood
1,22,18,2014-08-21 21:26:03,200.7,23.09,177.61,120,22,66,9.08,1,66,37,Sandalwood
2,26,22,2015-05-22 03:54:27,676.1,77.78,598.32,141,26,66,9.29,1,66,37,Sandalwood
3,29,20,2017-10-06 09:21:19,120.93,13.91,107.02,157,29,66,9.74,2,66,37,Sandalwood
4,30,3,2017-01-26 18:49:36,221.29,25.46,195.83,160,30,66,9.63,1,66,37,Sandalwood


In [110]:
data = data.rename(index=str, columns={"id_x": "id","id_y": "prod_var_id"})

In [111]:
data.head()

Unnamed: 0,id,customer,placed_on,total_cost,total_tax,subtotal,order_item_id,order,product_variation,price,quantity,prod_var_id,product,name
0,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,1,1,66,9.61,3,66,37,Sandalwood
1,22,18,2014-08-21 21:26:03,200.7,23.09,177.61,120,22,66,9.08,1,66,37,Sandalwood
2,26,22,2015-05-22 03:54:27,676.1,77.78,598.32,141,26,66,9.29,1,66,37,Sandalwood
3,29,20,2017-10-06 09:21:19,120.93,13.91,107.02,157,29,66,9.74,2,66,37,Sandalwood
4,30,3,2017-01-26 18:49:36,221.29,25.46,195.83,160,30,66,9.63,1,66,37,Sandalwood


### Merge data on products

In [112]:
data = data.merge(products, left_on='product_variation', right_on='id')

In [113]:
data.head()

Unnamed: 0,id_x,customer,placed_on,total_cost,total_tax,subtotal,order_item_id,order,product_variation,price_x,quantity,prod_var_id,product,name_x,id_y,merchant,name_y,price_y
0,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,2,1,1,11.53,2,1,2,Red,1,2,Large table,102.12
1,6,6,2016-10-04 06:48:12,503.86,57.97,445.89,33,6,1,11.38,2,1,2,Red,1,2,Large table,102.12
2,9,17,2015-01-15 17:52:33,948.16,109.08,839.08,52,9,1,10.98,2,1,2,Red,1,2,Large table,102.12
3,21,23,2016-08-11 06:25:35,182.79,21.03,161.76,115,21,1,11.36,3,1,2,Red,1,2,Large table,102.12
4,27,28,2015-07-19 22:17:30,835.91,96.17,739.74,148,27,1,11.06,1,1,2,Red,1,2,Large table,102.12


In [114]:
data = data.rename(index=str, columns={
    "id_x": "id",
    "price_x": "price",
    'name_x': 'variation_name',
    'name_y': 'name',
    'price_y': 'current_price',
    'product': 'product_id'
})

In [115]:
data.head()

Unnamed: 0,id,customer,placed_on,total_cost,total_tax,subtotal,order_item_id,order,product_variation,price,quantity,prod_var_id,product_id,variation_name,id_y,merchant,name,current_price
0,1,4,2017-03-05 16:55:52,884.34,101.74,782.6,2,1,1,11.53,2,1,2,Red,1,2,Large table,102.12
1,6,6,2016-10-04 06:48:12,503.86,57.97,445.89,33,6,1,11.38,2,1,2,Red,1,2,Large table,102.12
2,9,17,2015-01-15 17:52:33,948.16,109.08,839.08,52,9,1,10.98,2,1,2,Red,1,2,Large table,102.12
3,21,23,2016-08-11 06:25:35,182.79,21.03,161.76,115,21,1,11.36,3,1,2,Red,1,2,Large table,102.12
4,27,28,2015-07-19 22:17:30,835.91,96.17,739.74,148,27,1,11.06,1,1,2,Red,1,2,Large table,102.12


In [118]:
# Drop unused columns
data.drop(['order', 'id_y'], inplace=True, axis=1)