In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
# Render floats with two decimals in displayed DataFrames.
pd.set_option("display.precision", 2)

# Load the raw tables. Both files carry an "Unnamed: 0" column that is dropped
# straight away (presumably a saved index from an earlier to_csv -- verify).
products = pd.read_csv("products.csv").drop("Unnamed: 0", axis=1)
sales = pd.read_csv("sales.csv").drop("Unnamed: 0", axis=1)

# Parse the invoice date once and derive the calendar features from it.
sales["invoice_date"] = pd.to_datetime(sales["invoice_date"])

# `DatetimeIndex.week` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` is the supported replacement. It returns a nullable
# UInt32 column, so cast back to a plain integer like the other features.
# NOTE(review): the cast raises if invoice_date contains missing values.
sales["week"] = sales["invoice_date"].dt.isocalendar().week.astype("int64")
sales["month"] = sales["invoice_date"].dt.month
sales["year"] = sales["invoice_date"].dt.year
# Monday == 0 ... Sunday == 6
sales["weekday"] = sales["invoice_date"].dt.day_of_week

In [None]:
# FUNCTIONS

def churn(x,i):
    """Return the percentage change in ``Customers`` within month ``i + 1``.

    Parameters
    ----------
    x : DataFrame
        Must provide the columns ``Customers``, ``month`` and the boolean
        flags ``month_end`` / ``month_beg``.
    i : int
        Zero-based month position; the month evaluated is ``i + 1``.

    Returns
    -------
    float or int
        ``(end - start) / end * 100`` rounded to two decimals, where ``end``
        and ``start`` are the ``Customers`` values on the rows flagged
        ``month_end`` and ``month_beg``. Returns ``0`` when the month does
        not yield exactly one comparable value (missing rows, several rows,
        or row counts that cannot be combined).
    """
    month = i + 1
    in_month = x["month"] == month
    at_end = x.loc[in_month & x["month_end"], "Customers"].to_numpy()
    at_start = x.loc[in_month & x["month_beg"], "Customers"].to_numpy()
    try:
        # The arithmetic lives inside the try block: arrays of different
        # lengths raise a broadcasting ValueError here, and `.item()` raises
        # ValueError for anything that is not exactly one element.
        change = (at_end - at_start) / at_end
        return round(change.item() * 100, 2)
    except ValueError:
        return 0

def hot_encode(x):
    """Map a quantity to a basket flag: 1 when strictly positive, else 0.

    Zero, negative values and NaN all map to 0, so the function always
    returns an int (the previous if/elif chain fell through and returned
    ``None`` for NaN).
    """
    return 1 if x > 0 else 0

def stringify(x):
    """Turn a Series of string collections into comma-separated labels.

    Each element (e.g. a frozenset of item ids) is joined with ", ". The
    members are sorted first so that the label of a multi-item set does not
    depend on set iteration order and is identical from run to run.

    Returns a Series of Python strings with the same index as ``x``.
    """
    return x.apply(lambda items: ", ".join(sorted(items))).astype(str)

In [None]:
# Per-product totals. Only the two metrics needed below are aggregated, which
# avoids summing unrelated (and non-numeric) columns of `sales`.
temp = sales.groupby(by="ASIN")[["total_sale", "Quantity"]].sum().reset_index()

# Products above the 75th percentile in terms of total revenue.
# The cut-off is computed from the data; it used to be hard-coded as 6.61e+04.
sales100 = temp.loc[
    temp["total_sale"] > temp["total_sale"].quantile(0.75),
    ["ASIN", "total_sale"],
]

# Products above the 75th percentile in terms of volume sold.
# The cut-off is computed from the data; it used to be hard-coded as 1336.
quantity100 = temp.loc[
    temp["Quantity"] > temp["Quantity"].quantile(0.75),
    ["ASIN", "Quantity"],
]

# Find the common items between the two lists
# (columns: ASIN, total_sale, Quantity)
top100 = pd.merge(sales100, quantity100, on="ASIN")

In [None]:
# NOT CORRECT, WE CAN NOT CALCULATE CHURN RATE WITH EXISTING DATA
# ("Customers" below is the number of `sales` rows per invoice date, not a
# count of distinct customers.)
# Let's calculate the customer's attrition on a monthly basis for 2019
attrition = pd.DataFrame({"month": range(1, 13)})

# One row per 2019 invoice date with the number of sales rows on that date.
# Only the counter column is aggregated instead of summing every column.
temp = (
    sales.loc[sales["year"] == 2019, ["invoice_date"]]
    .assign(Customers=1)
    .groupby("invoice_date")["Customers"]
    .sum()
    .reset_index()
)
temp["month"] = temp["invoice_date"].dt.month
temp["month_end"] = temp["invoice_date"].dt.is_month_end
temp["month_beg"] = temp["invoice_date"].dt.is_month_start

# Keep only dates that are on month's end / beginning
temp = temp.loc[temp["month_end"] | temp["month_beg"]]

# Keep only months that we know the stats for both the start and end date
temp = temp.loc[temp.duplicated(subset="month", keep=False)].reset_index(drop=True)

# churn() returns 0 for months without a usable start/end pair.
attrition["change"] = [
    float(churn(temp, month_idx)) for month_idx in range(len(attrition))
]

# Plot for better visualisation
fig, ax = plt.subplots(figsize=(12, 4))
sns.lineplot(data=attrition, x="month", y="change", color="crimson", ax=ax)
ax.axhline(0)
ax.set_title("Customer attrition rate for each month")
ax.set_xlabel("Month of the year")
ax.set_ylabel("Change in %")

In [None]:
# Rebuild the per-date table used for the attrition figure so it can be
# inspected: one row per 2019 invoice date, where "Customers" is the number
# of `sales` rows on that date (not a count of distinct customers).
attrition = pd.DataFrame({"month": range(1, 13)})

# Aggregate only the counter column instead of summing every column of `sales`.
temp = (
    sales.loc[sales["year"] == 2019, ["invoice_date"]]
    .assign(Customers=1)
    .groupby("invoice_date")["Customers"]
    .sum()
    .reset_index()
)
temp["month"] = temp["invoice_date"].dt.month
temp["month_end"] = temp["invoice_date"].dt.is_month_end
temp["month_beg"] = temp["invoice_date"].dt.is_month_start

In [None]:
# Display the per-date table (Customers, month, month_end, month_beg) built in
# the previous cell.
temp

In [None]:
# Check for customers who haven't shopped for a while
# A customer is inactive when their MOST RECENT invoice is older than the
# cut-off, so the last purchase date (max) is what matters -- the first
# purchase date (min) would flag long-standing customers who are still buying.
# NOTE(review): the "last 6 months" wording assumes the data ends around
# December 2019 -- confirm against the data.
cutoff = pd.Timestamp("2019-06-01")
last_purchase = sales.groupby("CustomerID")["invoice_date"].max()

inactive_count = int((last_purchase < cutoff).sum())
# The two pie slices must be disjoint: active = all known customers - inactive.
active_count = int(last_purchase.size) - inactive_count
lst = [inactive_count, active_count]

# Plot them to visualise better
fig, ax = plt.subplots(figsize=(5, 5), facecolor="snow")
label = ["Inactive users", "Active users"]
ax.pie(lst, labels=label, autopct="%1.0f%%")
ax.set_xlabel("Inactive users defined as no transactions "
              "in the last 6 months")

In [None]:
# Find the revenue per customer for each country
# "Customers" is the number of distinct CustomerID values seen in a country;
# counting rows instead would measure line items, not customers.
temp = (
    sales.groupby(by="Country")
    .agg({"total_sale": "sum", "CustomerID": "nunique"})
    .rename({"total_sale": "Total sales", "CustomerID": "Customers"}, axis=1)
)
# A country whose rows carry no CustomerID at all has 0 customers and
# therefore yields an infinite/NaN ratio.
temp["Revenue per customer"] = temp["Total sales"] / temp["Customers"]
temp.sort_values(by="Revenue per customer", ascending=False)

In [None]:
# Calculate average order size and revenue for each country
# Step 1: one row per (country, invoice) with its number of sales rows
# ("Items" counts rows, not the Quantity column) and its revenue.
per_invoice = sales.groupby(by=["Country", "InvoiceNo"]).agg(
    Items=("total_sale", "size"),
    total_sale=("total_sale", "sum"),
)

# Step 2: roll the invoices up to country level.
per_country = per_invoice.groupby(level="Country").agg(
    Items=("Items", "sum"),
    no_inv=("Items", "size"),
    total_sale=("total_sale", "sum"),
)

temp = pd.DataFrame({
    "Items per order": per_country["Items"] / per_country["no_inv"],
    "Revenue per order": per_country["total_sale"] / per_country["no_inv"],
    # Revenue per item is total revenue over total items; dividing the
    # per-order revenue by the country-wide item count understates it.
    "Revenue per item": per_country["total_sale"] / per_country["Items"],
})
temp.sort_values(by="Revenue per order", ascending=False)

In [196]:
# Find the most common itemsets for December 2019
temp = pd.merge(sales, products, on="ASIN")
december = temp[(temp["month"] == 12) & (temp["year"] == 2019)]

# Invoice x product matrix of quantities; absent combinations become 0.
temp = (
    december.groupby(by=["InvoiceNo", "ASIN"])["Quantity"]
    .sum()
    .unstack(fill_value=0)
)

# Basket flags: True when the invoice contains the product with a positive
# quantity. The vectorised comparison replaces an element-wise applymap
# (deprecated in recent pandas) and gives apriori a boolean frame.
temp_encoded = temp > 0

frq_items = apriori(temp_encoded, min_support=0.04, use_colnames=True)
rules = association_rules(frq_items, metric="lift", min_threshold=1.2)
rules = rules.sort_values(["confidence", "lift"], ascending=[False, False])

# Replace the frozensets with plain comma-separated strings so that the rules
# can be joined to `products` and used as graph node labels.
rules["antecedents"] = stringify(rules["antecedents"])
rules["consequents"] = stringify(rules["consequents"])
a = rules[["antecedents", "consequents", "confidence"]]



In [197]:
# Attach product details to both sides of each rule. Only rules whose
# antecedent and consequent are a single ASIN find a match in `products`.
# The intermediate result gets its own name instead of overwriting `a`, so
# re-running this cell always starts from the same three-column frame.
rules_with_antecedent = pd.merge(a, products, left_on="antecedents", right_on="ASIN")
pd.merge(rules_with_antecedent, products, left_on="consequents", right_on="ASIN")

Unnamed: 0,antecedents,consequents,confidence,ASIN_x,product_type_x,title_x,rating_x,review_count_x,ASIN_y,product_type_y,title_y,rating_y,review_count_y
0,B00U2UZ40Y,B01KZC1EAW,0.77,B00U2UZ40Y,dslr camera,Fujifilm Instax Mini 8 Instant Film Camera (Ra...,4.6,15513,B01KZC1EAW,smartphone,Syncwire Long Aux Cable 6.5Ft- Auxiliary Audio...,4.6,18048
1,B0719KWJ1B,B006BIQBMQ,0.76,B0719KWJ1B,smartphone,Rockpapa On Ear Stereo Headphones Earphones fo...,4.4,3719,B006BIQBMQ,smartphone,"Cellet PH600 Car Cup Holder Mount, Adjustable ...",4.2,5400
2,B016ATD6AU,B006BIQBMQ,0.67,B016ATD6AU,keyboard,URBAN ARMOR GEAR UAG Microsoft Surface Pro 7/P...,4.4,2351,B006BIQBMQ,smartphone,"Cellet PH600 Car Cup Holder Mount, Adjustable ...",4.2,5400
3,B00TCKRKU2,B01KUDFE20,0.76,B00TCKRKU2,dslr camera,Peacechaos Men's Canvas Camera Bag Leather DSL...,4.3,396,B01KUDFE20,smartphone,Maryland Terps Adhesive Silicone Cell Phone Wa...,4.5,461
4,B07MD3WDLS,B00004Z5RA,0.75,B07MD3WDLS,processor,Crucial Ballistix Sport LT 3000 MHz DDR4 DRAM ...,4.7,6759,B00004Z5RA,keyboard,Belkin F8E263-BLK WaveRest Gel Wrist Pad for K...,4.3,10509
5,B01KZC1EAW,B00U2UZ40Y,0.72,B01KZC1EAW,smartphone,Syncwire Long Aux Cable 6.5Ft- Auxiliary Audio...,4.6,18048,B00U2UZ40Y,dslr camera,Fujifilm Instax Mini 8 Instant Film Camera (Ra...,4.6,15513
6,B07D1XCKWW,B072LX99L9,0.64,B07D1XCKWW,processor,Corsair Vengeance RGB PRO 16GB (2x8GB) DDR4 32...,4.8,7135,B072LX99L9,processor,Corsair VENGEANCE LPX 16GB (2 x 8GB) DDR4 3200...,4.7,30131
7,B00004Z5RA,B07MD3WDLS,0.63,B00004Z5RA,keyboard,Belkin F8E263-BLK WaveRest Gel Wrist Pad for K...,4.3,10509,B07MD3WDLS,processor,Crucial Ballistix Sport LT 3000 MHz DDR4 DRAM ...,4.7,6759
8,B00E4MQODC,B012PL6K8M,0.6,B00E4MQODC,mouse,Logitech G602 Lag-Free Wireless Gaming Mouse â...,4.3,9067,B012PL6K8M,dslr camera,Lexar Professional 633x 64GB SDXC UHS-I Card (...,4.6,7035
9,B006BIQBMQ,B0719KWJ1B,0.58,B006BIQBMQ,smartphone,"Cellet PH600 Car Cup Holder Mount, Adjustable ...",4.2,5400,B0719KWJ1B,smartphone,Rockpapa On Ear Stereo Headphones Earphones fo...,4.4,3719


In [None]:
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine
from bokeh.plotting import figure
from bokeh.plotting import from_networkx


In [None]:
import networkx as nx

# The style has to be active before the figure is created to apply to it.
plt.style.use("ggplot")

# Graph of association rules: nodes are itemset labels, edges carry the rule
# confidence. NOTE(review): from_pandas_edgelist builds an undirected graph by
# default, so A->B and B->A end up as a single edge -- confirm this is intended.
g = nx.from_pandas_edgelist(rules, "antecedents", "consequents", "confidence")

# spring_layout starts from random positions; a fixed seed makes the figure
# reproducible across runs.
pos = nx.spring_layout(g, seed=42)

fig, ax = plt.subplots(figsize=(10, 10))
nodes = nx.draw_networkx_nodes(g, pos, alpha=0.8, ax=ax)
nodes.set_edgecolor("k")
nx.draw_networkx_labels(g, pos, font_size=8, ax=ax)
nx.draw_networkx_edges(g, pos, width=1.0, alpha=0.2, ax=ax)

In [None]:
# Share of each product type in the catalogue, drawn as a pie chart.
temp = products["product_type"].value_counts()

# Slice labels: the product type names with their first letter upper-cased.
label = [product_type.capitalize() for product_type in temp.index]

fig, ax = plt.subplots(figsize=(5, 5), facecolor="snow")
temp.plot.pie(labels=label, autopct="%1.0f%%")
# Clear the y-axis label that pandas adds to the pie.
ax.set_ylabel("")

In [None]:
# Let's plot the total sales for each day of the week
# Interestingly enough, there are no orders on Tuesdays.
# That can not be correct, so we assume it's due to date corruption
# Only `total_sale` is aggregated. Weekdays that are absent from the data are
# added with a total of 0; weekdays that do have sales are never overwritten.
temp = (
    sales.groupby(by="weekday")["total_sale"]
    .sum()
    .reindex(range(7), fill_value=0)
    .rename_axis("weekday")
    .reset_index()
)

day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
             "Saturday", "Sunday"]

fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=temp, x="weekday", y="total_sale", color="crimson", ax=ax)
ax.set_xticks(range(7))
ax.set_xticklabels(day_names)
ax.set_title("Sales distribution on the days of the week")
ax.set_ylabel("Sales in 100s of millions")
ax.set_xlabel("Day of the week")

In [None]:
# Do it again but with better grouping and rolling totals
# Daily revenue for the sales rows that have a matching product.
temp = (
    pd.merge(sales, products, on="ASIN")
    .groupby(by="invoice_date")["total_sale"]
    .sum()
    .reset_index()
)

# Rolling SUMS over the last 7 / 30 rows. Each row is a date that has sales,
# so a window spans more calendar days wherever dates are missing.
temp["rolling_7"] = temp["total_sale"].rolling(7).sum()
temp["rolling_30"] = temp["total_sale"].rolling(30).sum()

fig, ax = plt.subplots(2, 1, figsize=(17, 8.5), sharex=True)
sns.lineplot(ax=ax[0], x=temp["invoice_date"], y=temp["rolling_7"],
             color="crimson")
sns.lineplot(ax=ax[1], x=temp["invoice_date"], y=temp["rolling_30"],
             color="crimson")
# The titles name what is plotted: totals over the window, not averages.
ax[0].set(title="Weekly revenue (rolling 7-day total)", xlabel="",
          ylabel="Revenue in 10s of millions")
ax[1].set(title="Monthly revenue (rolling 30-day total)", xlabel="Date",
          ylabel="Revenue in 10s of millions")

In [None]:
# Daily sales throughout the year, with Black Friday denoted by vertical line
temp = sales.groupby(by="invoice_date")["total_sale"].sum().reset_index()

fig, ax = plt.subplots(figsize=(17, 8.5))
sns.lineplot(x=temp["invoice_date"], y=temp["total_sale"],
             color="crimson", ax=ax)

# Black Friday 2019 fell on Friday 29 November (the 28th was Thanksgiving).
# A scalar timestamp is passed so the marker is drawn even when that exact
# date has no sales row.
black_friday = pd.Timestamp("2019-11-29")
ax.axvline(black_friday)

ax.set_title("Daily Sales")
ax.set_ylabel("Sales in millions")
ax.set_xlabel("Date")

In [None]:
# Units sold per week of the year, one line per product type.
# Only `Quantity` is aggregated rather than summing every merged column.
temp = (
    pd.merge(sales, products, on="ASIN")
    .groupby(by=["week", "product_type"])["Quantity"]
    .sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 5))
# No `color=` here: line colours come from the hue mapping, so a single
# colour argument has no visible effect.
sns.lineplot(data=temp, x="week", y="Quantity", hue="product_type", ax=ax)
ax.set_title("Sales breakdown based on product type")
ax.set_xlabel("Week of the year")


In [None]:
%%script false
# Disabled cell: the `%%script false` magic hands the cell body to the `false`
# command, so none of the code below is executed by the kernel.
# Intended to decompose 2019 sales into trend/seasonal parts and plot the trend.
# NOTE(review): `month` holds integers derived from the invoice dates, so
# `pd.to_datetime(temp.index)` would presumably read them as tiny offsets from
# the epoch rather than as months of 2019 -- confirm before enabling.
from statsmodels.tsa.seasonal import seasonal_decompose
temp=sales.loc[sales["year"]==2019]
temp=temp[["month","total_sale"]]
temp.set_index("month", inplace=True)
temp.index=pd.to_datetime(temp.index)
dec=seasonal_decompose(temp["total_sale"], model="additive",period=12)
dec.trend.plot()

In [None]:
%%script false 
# Disabled cell: the `%%script false` magic hands the cell body to the `false`
# command, so none of the code below is executed by the kernel.
# Attempt at web scraping for accurate ratings and up to date prices
# WIP
# Fetches one product page with a browser-like User-Agent header and collects
# the <span id="productTitle"> elements.
# NOTE(review): `smtplib` is imported but not used in this cell, and `findAll`
# is the legacy spelling of BeautifulSoup's `find_all`.
from bs4 import BeautifulSoup
import smtplib
import requests
site="https://www.amazon.de/dp/B00004THD0"
head=({"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37"})
r=requests.get(url=site,headers=head)
soup=BeautifulSoup(r.content, "html.parser")
title=soup.findAll("span", {"id":"productTitle"})

# Show the matched title elements
title