## Carbo Loading Live Coding Test

In [112]:
import pandas as pd
import numpy as np

In [2]:
transactions = pd.read_csv('dh_transactions.csv')

In [3]:
store = pd.read_csv('dh_store_lookup.csv')

In [4]:
product = pd.read_csv('dh_product_lookup.csv')

In [6]:
causal = pd.read_csv('dh_causal_lookup.csv')

In [7]:
transactions.head()

Unnamed: 0,upc,dollar_sales,units,time_of_transaction,geography,week,household,store,basket,day,coupon
0,7680850106,0.8,1,1100,2,1,125434,244,1,1,0
1,3620000470,3.59,1,1100,2,1,125434,244,1,1,0
2,1800028064,2.25,1,1137,2,1,108320,244,2,1,0
3,9999985067,0.85,1,1148,2,1,162016,244,3,1,0
4,9999985131,2.19,1,1323,2,1,89437,244,4,1,0


In [8]:
store.head()

Unnamed: 0,store,store_zip_code
0,1,37865
1,2,30084
2,3,30039
3,4,31210
4,5,30044


In [9]:
product.head()

Unnamed: 0,upc,product_description,commodity,brand,product_size
0,111112360,VINCENT S ORIG MARINARA S,pasta sauce,Vincent's,25 OZ
1,566300023,PINE MOUNTAIN SYRUP,syrups,Pine Mountain,40 OZ
2,566300028,MILLER CANE SYRUP,syrups,Miller,19 OZ
3,566300029,MILLER CANE SYRUP,syrups,Miller,12 OZ
4,566300035,PINE MOUNTAIN SYRUP,syrups,Pine Mountain,19 OZ


In [10]:
causal.head()

Unnamed: 0,upc,store,week,feature_desc,display_desc,geography
0,7680850108,1,68,Wrap Interior Feature,Not on Display,1
1,5100001212,1,66,Wrap Back Feature,Not on Display,1
2,5100002792,1,72,Interior Page Feature,Not on Display,1
3,3620000300,1,55,Wrap Interior Feature,Not on Display,1
4,4112907742,1,68,Wrap Interior Feature,Not on Display,1


## Questions

#### Q1. What are the top five products in each commodity?

In [12]:
# Start by checking what the most common UPCs are in the transactions set. Mode gives us the first.
transactions.upc.mode()

0    9999985020
dtype: int64

In [79]:
# And then value counts gives us the nth most.
upcsales = transactions.upc.value_counts()[:20]

# We can make a new DataFrame of these twenty values.
upcsales = pd.DataFrame(upcsales)
upcsales = upcsales.rename(columns = {'upc':'sales'})
upcsales

Unnamed: 0,sales
9999985020,110132
9999985004,103971
9999985068,79920
3620000250,79305
9999985021,62020
9999967727,59460
5100002549,53951
3620000300,48643
9999985051,46344
9999985005,45572


In [63]:
# Let's make this a list, and then iterate over the product database. Set index lets us use loc.
# This just gives us the top ten most sold products though.
upclist = transactions.upc.value_counts()[:10].index.tolist()

for i in upclist:
    print(product.set_index('upc').loc[i])

product_description    PRIVATE LABEL THIN SPAGHETTI
commodity                                     pasta
brand                                 Private Label
product_size                                  16 OZ
Name: 9999985020, dtype: object
product_description    PRIVATE LABEL SPAGHETTI REGULAR
commodity                                        pasta
brand                                    Private Label
product_size                                     16 OZ
Name: 9999985004, dtype: object
product_description    PRIVATE LABEL ANGEL HAIR PASTA
commodity                                       pasta
brand                                   Private Label
product_size                                    16 OZ
Name: 9999985068, dtype: object
product_description    RAGU TRADITIONAL PLAIN
commodity                         pasta sauce
brand                                    Ragu
product_size                            26 OZ
Name: 3620000250, dtype: object
product_description    PRIVATE LABEL ELBO MA

In [111]:
# But it's a place to start. So let's make a full count list, a copy of products, and add a column of sales.
product2 = product.copy()
upclist = transactions.upc.value_counts()
upcsales = pd.DataFrame(upclist)
upcsales = upcsales.rename(columns = {'upc':'sales'})

i = 0

for group_key, group_value in product2.groupby('upc'):
    product2['units_sold'][0] = upcsales.loc[group_key].sales
    i += 1

KeyError: '9999985020'

In [84]:
transactions.shape()

TypeError: 'tuple' object is not callable

#### Q2. What are the top 5 brands in each commodity?

In [130]:
# This is similar to Q1, and actually we can use variables from Q1 to solve this.
# We already have the sales column, so we use that as a base in product 2.
# I didn't do Q1 correctly, so let's just generate random sales.

i = 0
for group_key, group_value in product2.groupby('upc'):
    product2['units_sold'][i] = np.random.randint(20000)
    i += 1
    
# We now need to add up the counts from each brand.

product2.set_index('brand').loc['Private Label']

brands = product2.value_counts('brand').index.tolist

# Out of time, but to do this, what would happen is that the list would be iterated over in products 2, and
# each usage of the brand name in the list would have an aggregated count, made into a new DataFrame, and the
# top brands would be pulled from there.

<bound method IndexOpsMixin.tolist of Index(['Private Label', 'Barilla', 'Ragu', 'Private Label Premium', 'Prego',
       'Ronzoni', 'DaVinci', 'San Giorgio', 'Mueller', 'Creamette',
       ...
       'Maggi Spaetzle', 'Edd Og', 'Moonlite', 'Defino', 'Cucina', 'Classique',
       'Chef Pizza', 'Orzo', 'Pastariso', 'Osem Bissli'],
      dtype='object', name='brand', length=131)>

#### Q3. How often is each commodity purchased on average by a customer?

#### Q4. How is the performance of the Pasta category? (geography, weekly trends, etc)

#### Q5. In Pasta and Pasta Sauce, what products, if any, are commonly purchased together in the same basket?