In [37]:
import pandas as pd

In [38]:
# Load the pipe-delimited POS extract into a DataFrame.
df = pd.read_csv(
    "data/WMDataWeeks201601To201852.txt",
    sep='|',
    encoding="ISO-8859-1",
    dtype={"UPC": "str"},      # read UPC as text rather than letting pandas infer a numeric type
    parse_dates=["POSDate"],   # parse POSDate into datetimes so the .dt accessor works later
)

In [None]:
# Preview the first five rows of the raw POS data.
df.head(n=5)

In [40]:
# Number of rows in the raw POS data.
df.shape[0]

10441571

In [41]:
# Count of distinct UPC values (NaN is excluded by nunique's default).
df["UPC"].nunique()

1573

In [None]:
# Distinct Brand values, in order of first appearance.
df.loc[:, "Brand"].unique()

## Aggregating by Brand

In [43]:
from helper_save_load import load_from_pickle

# load_from_pickle is a project helper; its result is unpacked into three objects
# and only the third (df_v) is used by the cells below.
# NOTE(review): presumably this unpickles the file — only load pickles from a trusted source.
df_a, df_f, df_v = load_from_pickle("data/dataframes_Dollars.pickle")
# Unbind the two unused objects immediately so they do not linger in the kernel namespace.
del df_a, df_f

In [44]:
# Keep only the first occurrence of each column label in df_v.
repeated_label = df_v.columns.duplicated()
df_v = df_v.loc[:, ~repeated_label]

# UPC -> Brand lookup: one row per UPC_code, taking the first non-null Brand
# seen for that code, with the key renamed to "UPC" and converted to a number
# so it can be joined against the numeric UPC in the POS data.
item_to_brand = (
    df_v[["UPC_code", "Brand"]]
    .groupby("UPC_code", as_index=False)
    .first()
    .rename(columns={"UPC_code": "UPC"})
    .assign(UPC=lambda lookup: pd.to_numeric(lookup["UPC"]))
)

In [None]:
# Preview the UPC -> Brand lookup table.
item_to_brand.head(n=5)

In [None]:
# Sum TotalSales and TotalQty per (POSDate, UPC), then convert the UPC key
# from text to a number so it matches the numeric UPC in item_to_brand.
df_pos = (
    df[["TotalSales", "TotalQty", "POSDate", "UPC"]]
    .groupby(["POSDate", "UPC"], as_index=False)
    .sum()
    .assign(UPC=lambda daily: pd.to_numeric(daily["UPC"]))
)
df_pos.head()

In [47]:
# Attach a Brand to each (POSDate, UPC) row using the variance-data lookup.
# Inner join: rows whose UPC has no entry in item_to_brand are dropped.
# Switch to how="left" to keep them (their Brand would then be NaN).
pos_with_brands = pd.merge(df_pos, item_to_brand, on="UPC", how="inner")

In [48]:
# Share of df_pos rows whose UPC has no entry in item_to_brand.
# Computed from the data so the figure stays correct when the inputs change.
# It was previously hard-coded as 137305/952840 (~14% of items not matchable to brands).
# NOTE(review): presumably those literals were unmatched rows / total rows of df_pos — confirm.
upc_has_no_brand = ~df_pos["UPC"].isin(item_to_brand["UPC"])
upc_has_no_brand.mean()

0.14410079341757273

In [None]:
# Monthly totals per brand: POSDate is bucketed into calendar-month Periods,
# sales and quantity are summed within each (month, Brand) group, and the
# average unit price is derived as TotalSales / TotalQty.
month_of_sale = pos_with_brands["POSDate"].dt.to_period("M")
pos_grouped_brands = (
    pos_with_brands
    .groupby([month_of_sale, "Brand"])
    .agg({"TotalSales": "sum", "TotalQty": "sum"})
    .reset_index()
    .assign(MonthlyUnitPrice=lambda monthly: monthly["TotalSales"] / monthly["TotalQty"])
)
pos_grouped_brands.head()

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

# Plot monthly sales and monthly unit price for a single brand, one figure each.
# The brand filter is computed once and reused, and each figure gets a title and
# y-label so it can be read on its own.
brand_name = '1I-VIM CLEANERS Brand'
brand_monthly = pos_grouped_brands[pos_grouped_brands["Brand"] == brand_name]

ax = brand_monthly.plot(x="POSDate", y="TotalSales", title=f"{brand_name}: TotalSales by month")
ax.set_ylabel("TotalSales")

ax = brand_monthly.plot(x="POSDate", y="MonthlyUnitPrice", title=f"{brand_name}: MonthlyUnitPrice by month")
ax.set_ylabel("MonthlyUnitPrice")
plt.show()

In [57]:
# NOTE: despite the name, this frame is grouped by calendar month (Period "M")
# and Brand, not by day. The name is kept because later cells refer to it.
sum_columns = {"TotalSales": "sum", "TotalQty": "sum"}
group_by_day = (
    pos_with_brands
    .groupby([pos_with_brands["POSDate"].dt.to_period("M"), "Brand"])
    .agg(sum_columns)
    .reset_index()
)
# Average unit price per (month, Brand): total sales divided by total quantity.
group_by_day = group_by_day.assign(
    MonthlyUnitPrice=group_by_day["TotalSales"] / group_by_day["TotalQty"]
)

In [None]:
# Preview the first ten (month, Brand) rows.
group_by_day.head(n=10)

In [None]:
# Define to_plot before previewing it. It was previously only assigned in the
# next cell, so this cell raised NameError on a fresh kernel (Restart & Run All).
to_plot = group_by_day[group_by_day["Brand"] == 'LL-DOVE BAR Brand']
to_plot.head()

In [None]:
# Monthly quantity (left axis) and average unit price (right axis) for one brand,
# with star markers drawn on selected months.
to_plot = group_by_day.loc[group_by_day["Brand"] == 'LL-DOVE BAR Brand']

# Global matplotlib setting: affects every figure drawn after this point.
plt.rcParams.update({'font.size': 22})

ax = to_plot.plot(
    x="POSDate", y="TotalQty",
    figsize=(30,10), linewidth=2, label="Monthly Sales",
)
# secondary_y=True draws the price on a second y-axis sharing the same x-axis.
ax2 = to_plot.plot(
    x="POSDate", y="MonthlyUnitPrice",
    ax=ax, secondary_y=True, linewidth=2, label="Unit Price",
)

ax.set_title("Walmart POS for LL-DOVE BAR Brand", fontsize=30)
ax.set_xlabel("Month", fontsize=28)
ax.set_ylabel("Number of sales", fontsize=28, labelpad=20)
ax2.set_ylabel("Average unit price of brand ($)", fontsize=28, labelpad=40 ,rotation=-90)

# Star markers: one yellow star on 2017-06, then green stars on the promo months.
# NOTE(review): the meaning of the 2017-06 marker is not stated here — confirm.
promo_months = ["2018-03", "2018-07", "2018-08", "2018-09"]
starred = [("2017-06", 'y*')] + [(promo, 'g*') for promo in promo_months]
for month, star_style in starred:
    single_month = to_plot[to_plot["POSDate"] == month]
    single_month.plot(x="POSDate", y="TotalQty", style=star_style, ax=ax, markersize=20, legend=False)

plt.show()