In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Show floats with two decimals in rendered tables.
pd.set_option("display.precision", 2)

# Load the raw tables and drop the index column that was written out with the CSVs.
products = pd.read_csv("products.csv").drop(columns="Unnamed: 0")
sales = pd.read_csv("sales.csv").drop(columns="Unnamed: 0")

# Parse the invoice date once and derive the calendar features used later on.
sales["invoice_date"] = pd.to_datetime(sales["invoice_date"])
invoice_dt = sales["invoice_date"].dt
# `DatetimeIndex.week` no longer exists in pandas >= 2.0; the ISO calendar
# week is its replacement. It comes back as a nullable UInt32 column, so it
# is cast to a plain integer (assumes no missing invoice dates).
sales["week"] = invoice_dt.isocalendar().week.astype("int64")
sales["month"] = invoice_dt.month
sales["year"] = invoice_dt.year
# Monday is 0 and Sunday is 6.
sales["weekday"] = invoice_dt.day_of_week

In [None]:
# Per-product totals, computed once and reused by both rankings below.
# numeric_only=True keeps date/text columns out of the sum, which pandas >= 2.0
# would otherwise try (and fail) to add up.
temp = sales.groupby(by="ASIN").sum(numeric_only=True).reset_index()

# Products above the 75th percentile in terms of total revenue.
# The cut-off is derived from the data instead of being a pasted-in constant.
revenue_cutoff = temp["total_sale"].quantile(0.75)
sales100 = temp.loc[temp["total_sale"] > revenue_cutoff]
sales100 = sales100.drop(
    columns=["InvoiceNo", "Quantity", "price", "invoice_time", "CustomerID"],
    errors="ignore",  # some of these may not survive the numeric-only sum
)

# Products above the 75th percentile in terms of volume sold.
quantity_cutoff = temp["Quantity"].quantile(0.75)
quantity100 = temp.loc[temp["Quantity"] > quantity_cutoff]
quantity100 = quantity100.drop(
    columns=["InvoiceNo", "price", "total_sale", "invoice_time", "CustomerID"],
    errors="ignore",
)

# Find the common items between the two lists
top100 = pd.merge(sales100, quantity100, on="ASIN")

In [None]:
def churn(x,i):
    """Return the month-start to month-end change in "Customers", in percent.

    Parameters
    ----------
    x : DataFrame
        Must provide the columns "Customers", "month", "month_end" and
        "month_beg"; the last two are used as boolean row masks.
    i : int
        Zero-based month position; month number ``i + 1`` is evaluated.

    Returns
    -------
    float or int
        ``(end - start) / end * 100`` rounded to two decimals, where ``end``
        and ``start`` are the "Customers" values on the rows flagged
        ``month_end`` / ``month_beg`` for that month. Returns 0 when the
        month does not yield exactly one value.
    """
    month_no = i + 1
    in_month = x["month"] == month_no
    at_end = x.loc[in_month & x["month_end"], "Customers"].values
    at_start = x.loc[in_month & x["month_beg"], "Customers"].values
    change = (at_end - at_start) / at_end
    # A missing start or end row broadcasts to an empty array: nothing to report.
    if change.size != 1:
        return 0
    return round(change.item() * 100, 2)

In [None]:
# Let's calculate the customer attrition on a monthly basis for 2019
attrition = pd.DataFrame({"month": range(1, 13)})

# One row per 2019 invoice date. "Customers" is the number of sales rows
# recorded on that date; counting rows directly avoids summing every other
# column only to throw the result away.
# NOTE(review): this counts sales rows, not distinct CustomerID values —
# confirm that this is the intended measure.
temp = (
    sales.loc[sales["year"] == 2019]
    .groupby("invoice_date")
    .size()
    .reset_index(name="Customers")
)
temp["month"] = temp["invoice_date"].dt.month
temp["month_end"] = temp["invoice_date"].dt.is_month_end
temp["month_beg"] = temp["invoice_date"].dt.is_month_start

# Keep only dates that fall on the first or the last day of a month
temp = temp.loc[temp["month_end"] | temp["month_beg"]]

# Keep only months for which both the start and the end date are known
temp = temp.loc[temp.duplicated(subset="month", keep=False)].reset_index(drop=True)

# churn() takes a zero-based month position and returns 0 for months that
# were filtered out above.
attrition["change"] = [float(churn(temp, i)) for i in range(len(attrition))]

In [None]:
# Monthly change from the `attrition` table, with a zero baseline for reference.
fig, ax = plt.subplots(figsize=(12, 4))
sns.lineplot(x="month", y="change", data=attrition, color="crimson", ax=ax)
ax.axhline(0)
ax.set_title("Customer attrition rate for each month")
ax.set_xlabel("Month of the year")
ax.set_ylabel("Change in %")

In [None]:
# Find the revenue per customer for each country.
# "Customers" is the number of distinct CustomerID values seen in the country
# (rows with a missing CustomerID are not counted). Counting one "customer"
# per sales row would turn the ratio into revenue per line item instead.
temp = sales.groupby(by="Country").agg(
    **{
        "Total Sales": ("total_sale", "sum"),
        "Customers": ("CustomerID", "nunique"),
    }
)
temp["Revenue per customer"] = temp["Total Sales"] / temp["Customers"]
temp.sort_values(by="Revenue per customer", ascending=False)

In [None]:
# Calculate average order size and revenue for each country

# Step 1: one row per (country, invoice) with the number of sales rows on the
# invoice ("Items") and the invoice revenue.
temp = (
    sales.groupby(by=["Country", "InvoiceNo"])
    .agg(Items=("total_sale", "size"), total_sale=("total_sale", "sum"))
    .reset_index()
)

# Step 2: roll the invoices up to country level; "no_inv" is the invoice count.
temp = temp.groupby(by="Country").agg(
    Items=("Items", "sum"),
    no_inv=("InvoiceNo", "size"),
    total_sale=("total_sale", "sum"),
)

temp["Items per order"] = temp["Items"] / temp["no_inv"]
temp["Revenue per order"] = temp["total_sale"] / temp["no_inv"]
# Revenue per item is total revenue over total items; dividing the per-order
# revenue by the country's total item count would shrink it by the number of
# invoices.
temp["Revenue per item"] = temp["total_sale"] / temp["Items"]

temp = temp[["Items per order", "Revenue per order", "Revenue per item"]]
temp.sort_values(by="Revenue per order", ascending=False)

In [None]:
# Numeric totals per (country, invoice). numeric_only=True keeps the date and
# text columns out of the sum, which pandas >= 2.0 no longer skips by default.
temp = (
    sales.groupby(by=["Country", "InvoiceNo"])
    .sum(numeric_only=True)
    .reset_index()
)
temp

In [None]:
# Share of catalogue entries per product type, drawn as a pie chart.
fig, ax = plt.subplots(figsize=(5, 5), facecolor="snow")
temp = products["product_type"].value_counts()
# Capitalised type names are used as wedge labels.
label = [name.capitalize() for name in temp.index]
temp.plot.pie(ax=ax, labels=label, autopct="%1.0f%%")
# Remove the axis label that the pandas pie plot adds.
ax.set_ylabel("")

In [None]:
# Let's plot the total sales for each day of the week.
# Interestingly enough, there are no orders on Tuesdays.
# That cannot be correct, so we assume it is due to date corruption.
temp = (
    sales.groupby(by="weekday")["total_sale"]
    .sum()
    # Make sure all seven weekdays are present; days without any sales get 0
    # while days that do have sales keep their real total.
    .reindex(range(7), fill_value=0)
    .rename_axis("weekday")
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=temp, x="weekday", y="total_sale", color="crimson", ax=ax)
ax.set_xticks(range(7))
ax.set_xticklabels(["Monday", "Tuesday", "Wednesday",
                    "Thursday", "Friday", "Saturday", "Sunday"])
ax.set_title("Sales distribution on the days of the week")
ax.set_ylabel("Sales in 100s of millions")
ax.set_xlabel("Day of the week")

In [None]:
# Do it again but with better grouping and rolling averages
fig, ax = plt.subplots(2, 1, figsize=(17, 8.5), sharex=True)

# Daily revenue, restricted to sales whose ASIN appears in the product table.
# NOTE(review): the merge is an inner join, so sales without a matching
# product are left out — confirm this is wanted for a revenue chart.
temp = pd.merge(sales, products, on="ASIN")
temp = temp.groupby(by="invoice_date")["total_sale"].sum().reset_index()

# The windows span 7 and 30 *rows* (dates that have sales), and use the mean so
# that the curves really are the averages the titles announce.
temp["avg"] = temp["total_sale"].rolling(7).mean()
sns.lineplot(ax=ax[0], data=temp, x="invoice_date", y="avg", color="crimson")
temp["avg"] = temp["total_sale"].rolling(30).mean()
sns.lineplot(ax=ax[1], data=temp, x="invoice_date", y="avg", color="crimson")

ax[0].set(title="Weekly average revenue", xlabel="",
          ylabel="Average daily revenue")
ax[1].set(title="Monthly average revenue", xlabel="Date",
          ylabel="Average daily revenue")

In [None]:
# Daily sales throughout the year, with Black Friday denoted by a vertical line
fig, ax = plt.subplots(figsize=(17, 8.5))
temp = sales.groupby(by="invoice_date")["total_sale"].sum().reset_index()
sns.lineplot(data=temp, x="invoice_date", y="total_sale", color="crimson", ax=ax)

# Black Friday 2019 fell on 29 November (the day after Thanksgiving, which was
# Thursday 28 November). axvline expects a single x position, so a scalar
# timestamp is passed rather than a filtered Series.
black_friday = pd.Timestamp("2019-11-29")
ax.axvline(black_friday)

ax.set_title("Daily Sales")
ax.set_ylabel("Sales in millions")
ax.set_xlabel("Date")

In [None]:
# Weekly quantity sold, one line per product type.
fig, ax = plt.subplots(figsize=(12, 5))
temp = pd.merge(sales, products, on="ASIN")
# Only "Quantity" is plotted, so only that column is summed; this also keeps
# date/text columns out of the aggregation.
temp = (
    temp.groupby(by=["week", "product_type"])["Quantity"]
    .sum()
    .reset_index()
)
# Line colours come from the hue mapping, so no fixed colour is passed.
sns.lineplot(data=temp, x="week", y="Quantity", hue="product_type", ax=ax)
ax.set_title("Sales breakdown based on product type")
ax.set_xlabel("Week of the year")


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# 2019 sales indexed by their actual invoice date. Feeding the month numbers
# 1-12 to pd.to_datetime would turn them into timestamps a few nanoseconds
# after 1970-01-01 rather than real dates.
temp = (
    sales.loc[sales["year"] == 2019, ["invoice_date", "total_sale"]]
    .set_index("invoice_date")
)
# NOTE(review): the series has one observation per sales row, so period=12
# means "12 consecutive rows", not 12 months. Consider aggregating to a
# regular frequency before decomposing — TODO confirm the intended period.
dec = seasonal_decompose(temp["total_sale"], model="additive", period=12)
dec.trend.plot()

In [None]:
# Attempt at web scraping for accurate ratings and up-to-date prices
# WIP
from bs4 import BeautifulSoup
import requests

site = "https://www.amazon.de/dp/B00000JRRD"
# Browser-like User-Agent sent with the request.
head = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763"
    )
}
# Without a timeout the cell can hang indefinitely if the server never answers.
r = requests.get(url=site, headers=head, timeout=10)
soup = BeautifulSoup(r.content, "lxml")
# NOTE: despite its name, `title` holds the first <span class="a-price-whole">
# element, i.e. the price markup rather than the product title.
title = soup.find("span", attrs={"class": "a-price-whole"})