## Task 1 - Quantium

In [1]:
import re

import numpy as np
import pandas as pd

## Inspect and clean `purchase_behaviour` dataset

In [2]:
data = pd.read_csv('purchase_behaviour.csv')

In [3]:
data.head()

Unnamed: 0,LYLTY_CARD_NBR,LIFESTAGE,PREMIUM_CUSTOMER
0,1000,YOUNG SINGLES/COUPLES,Premium
1,1002,YOUNG SINGLES/COUPLES,Mainstream
2,1003,YOUNG FAMILIES,Budget
3,1004,OLDER SINGLES/COUPLES,Mainstream
4,1005,MIDAGE SINGLES/COUPLES,Mainstream


## Rename and lowercase the column names

In [4]:
data.columns = ['loyalty_card_num', 'lifestage', 'premium_customer']

In [6]:
data.lifestage = data.lifestage.str.lower()

In [8]:
data.lifestage = data.lifestage.str.replace('/' , ' / ') 

In [11]:
data.premium_customer = data.premium_customer.str.lower()

## Check nulls and unique values

In [7]:
data.lifestage.unique()

array(['young singles/couples', 'young families', 'older singles/couples',
       'midage singles/couples', 'new families', 'older families',
       'retirees'], dtype=object)

In [9]:
data.lifestage.unique()

array(['young singles / couples', 'young families',
       'older singles / couples', 'midage singles / couples',
       'new families', 'older families', 'retirees'], dtype=object)

In [10]:
data.premium_customer.unique()

array(['Premium', 'Mainstream', 'Budget'], dtype=object)

In [15]:
data.isnull().sum()

loyalty_card_num    0
lifestage           0
premium_customer    0
dtype: int64

## Check dtypes

In [12]:
data.dtypes

loyalty_card_num     int64
lifestage           object
premium_customer    object
dtype: object

## Take a sample and check the dataset values

In [17]:
data.sample(15)

Unnamed: 0,loyalty_card_num,lifestage,premium_customer
28124,103188,young singles / couples,budget
26602,96290,new families,mainstream
33003,121440,retirees,mainstream
47414,176501,new families,budget
1976,7009,retirees,premium
12645,47154,young singles / couples,budget
69002,258454,retirees,mainstream
41800,155303,young families,mainstream
42573,160120,midage singles / couples,budget
46582,174222,young families,budget


## It looks clean. Save it to a csv file

In [18]:
data.to_csv('purchase_behaviour_clean.csv', index=False)

## Import transaction data

In [123]:
raw_data = pd.read_excel('transaction_data.xlsx')

In [182]:
# Fixed seed so the 40% sample, and every output derived from it, is reproducible on re-run.
data = raw_data.sample(frac=0.4, random_state=42)

In [183]:
data.head()

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES
208928,43537,101,101167,101005,41,Doritos Salsa Mild 300g,2,5.2
119583,43513,26,26379,23922,4,Dorito Corn Chp Supreme 380g,2,13.0
88304,43563,106,106221,108067,46,Kettle Original 175g,2,10.8
73355,43496,91,91153,89769,109,Pringles Barbeque 134g,2,7.4
263140,43493,197,197433,197424,86,Cheetos Puffs 165g,1,2.8


## Rename column names and lowercase

In [184]:
# Lowercase every header first, then swap the abbreviated names for descriptive ones.
data = data.rename(columns=str.lower).rename(
    columns={
        'store_nbr': 'store_num',
        'lylty_card_nbr': 'loyalty_card_num',
        'prod_name': 'product_name',
        'prod_nbr': 'product_number',
        'prod_qty': 'product_quantity',
        'tot_sales': 'total_sales',
    }
)

In [185]:
data.sample(10)

Unnamed: 0,date,store_num,loyalty_card_num,txn_id,product_number,product_name,product_quantity,total_sales
53976,43525,255,255124,254686,82,Smith Crinkle Cut Mac N Cheese 150g,1,2.6
4019,43484,216,216036,214837,75,Cobs Popd Sea Salt Chips 110g,2,7.6
222319,43559,115,115099,118625,10,RRD SR Slow Rst Pork Belly 150g,2,5.4
215426,43506,236,236035,238751,90,Tostitos Smoked Chipotle 175g,2,8.8
217252,43603,209,209238,208858,2,Cobs Popd Sour Crm &Chives Chips 110g,2,7.6
260379,43551,73,73105,72383,21,WW Sour Cream &OnionStacked Chips 160g,2,3.8
172613,43577,6,6457,6268,75,Cobs Popd Sea Salt Chips 110g,2,7.6
192594,43494,105,105024,105366,1,Smiths Crinkle Cut Chips Barbecue 170g,2,5.8
225079,43376,184,184072,186685,33,Cobs Popd Swt/Chlli &Sr/Cream Chips 110g,2,7.6
469,43322,16,16313,14440,6,RRD Lime & Pepper 165g,1,3.0


## Check dtypes and missing values

In [186]:
data.dtypes

date                  int64
store_num             int64
loyalty_card_num      int64
txn_id                int64
product_number        int64
product_name         object
product_quantity      int64
total_sales         float64
dtype: object

`date` should have been imported as a date. I went back to check the import statement.  
The import is not the problem: the values in the `date` column are stored as integers that count the number of days elapsed since 30 December 1899. I convert them by passing the matching `unit` and `origin` parameters to `pd.to_datetime()`.

In [187]:
# Integer day counts -> timestamps, counted from the 1899-12-30 origin.
data['date'] = pd.to_datetime(data['date'], unit='d', origin='1899-12-30')

In [188]:
data.dtypes

date                datetime64[ns]
store_num                    int64
loyalty_card_num             int64
txn_id                       int64
product_number               int64
product_name                object
product_quantity             int64
total_sales                float64
dtype: object

Now the dtypes look right. Move on to missing values study

In [189]:
data.isnull().sum()

date                0
store_num           0
loyalty_card_num    0
txn_id              0
product_number      0
product_name        0
product_quantity    0
total_sales         0
dtype: int64

In [190]:
data.sample(15)

Unnamed: 0,date,store_num,loyalty_card_num,txn_id,product_number,product_name,product_quantity,total_sales
57196,2018-12-27,36,36342,33353,25,Pringles SourCream Onion 134g,2,7.4
58069,2018-10-30,56,56082,50469,60,Kettle Tortilla ChpsFeta&Garlic 150g,2,9.2
80666,2018-08-17,19,19067,16018,46,Kettle Original 175g,1,5.4
136178,2019-04-01,7,7146,6887,109,Pringles Barbeque 134g,1,3.7
30534,2018-11-30,4,4241,3749,60,Kettle Tortilla ChpsFeta&Garlic 150g,2,9.2
8439,2018-08-11,72,72214,71540,32,Kettle Sea Salt And Vinegar 175g,2,10.8
208996,2019-02-26,102,102183,102631,17,Kettle Sensations BBQ&Maple 150g,2,9.2
14729,2019-06-04,219,219239,219413,34,Pringles Slt Vingar 134g,2,7.4
93726,2018-12-16,186,186464,188828,33,Cobs Popd Swt/Chlli &Sr/Cream Chips 110g,2,7.6
193021,2018-09-08,109,109142,111029,102,Kettle Mozzarella Basil & Pesto 175g,2,10.8


## Extract weight values from `product_name`

In [191]:
data.product_name

208928                    Doritos Salsa Mild  300g
119583            Dorito Corn Chp     Supreme 380g
88304                         Kettle Original 175g
73355                     Pringles Barbeque   134g
263140                          Cheetos Puffs 165g
                            ...                   
72302      Kettle Tortilla ChpsBtroot&Ricotta 150g
195514         RRD Sweet Chilli &  Sour Cream 165g
99516      Infuzions BBQ Rib   Prawn Crackers 110g
7902      Infuzions Thai SweetChili PotatoMix 110g
32955         Kettle Tortilla ChpsFeta&Garlic 150g
Name: product_name, Length: 105934, dtype: object

In [192]:
names = data.product_name.copy(deep=True)

### Create a `product_weight` with the weight of the product

Check whether every weight is a three-digit value

In [193]:
weights = names.apply(lambda x : x[-4:])

Some weights could be invalid

In [194]:
weights.apply(lambda x : x[0].isnumeric() ).all()

False

In [195]:
invalid_weights = weights[ ~ weights.apply(lambda x : x[0].isnumeric() ) ]

In [196]:
invalid_weights.unique()

array(['Salt', ' 90g', ' 70g'], dtype=object)

Some products are lighter than 100 grams and others have no weight value at the end of the name.

In [197]:
names[names.str.endswith('Salt')]

156595    Kettle 135g Swt Pot Sea Salt
218057    Kettle 135g Swt Pot Sea Salt
170482    Kettle 135g Swt Pot Sea Salt
20036     Kettle 135g Swt Pot Sea Salt
236158    Kettle 135g Swt Pot Sea Salt
                      ...             
200808    Kettle 135g Swt Pot Sea Salt
122624    Kettle 135g Swt Pot Sea Salt
231301    Kettle 135g Swt Pot Sea Salt
200758    Kettle 135g Swt Pot Sea Salt
252523    Kettle 135g Swt Pot Sea Salt
Name: product_name, Length: 1269, dtype: object

Those products have the weight information in the middle of the name.

In [198]:
names[names.str.endswith('Salt')].unique()

array(['Kettle 135g Swt Pot Sea Salt'], dtype=object)

Create a function to extract the weight info from the product name

In [208]:
def extract_weight(name):
    """Return the pack weight in grams encoded in a product name.

    The weight is a run of digits followed (optionally after whitespace) by
    ``g``/``G``, e.g. ``'... 300g'``, ``'... 90g'`` or
    ``'Kettle 135g Swt Pot Sea Salt'``. The token may sit anywhere in the
    name and may have any number of digits. When several tokens match, the
    last one is used.

    Parameters
    ----------
    name : str
        Raw product name.

    Returns
    -------
    float
        Weight in grams, or ``np.nan`` when the name holds no weight token
        (including empty or very short names).
    """
    # \b keeps the unit from matching the first letter of a word (e.g. "5 Grain").
    tokens = re.findall(r'(\d+)\s*[gG]\b', name)
    if not tokens:
        return np.nan
    return float(tokens[-1])

In [209]:
product_weight = names.apply(extract_weight)

In [210]:
product_weight.isnull().sum()

0

In [211]:
product_weight.unique()

array([300., 380., 175., 134., 165., 150., 110., 200., 160., 330., 170.,
       190., 135., 270., 180., 210., 250.,  90.,  70., 125., 220.])

In [207]:
print(f'Size of the product name column: {names.shape[0]} vs total processed values {product_weight.notnull().sum()}.')
print(f'There is a difference of {names.shape[0] - product_weight.notnull().sum()}')

Size of the product name column: 105934 vs total processed values 105934.
There is a difference of 0


After applying the `extract_weight` function, all the weight values appear to have been extracted.  
Now I will apply a slightly modified version of the function that also removes the weight info from the name, so the weight can be stored in a new column named `product_weight`.

In [212]:
def extract_weight_from_name(name):
    """Split a product name into its weight-free text and its weight in grams.

    The weight is a run of digits followed (optionally after whitespace) by
    ``g``/``G``. It may sit at the end of the name (``'... 300g'``) or in the
    middle (``'Kettle 135g Swt Pot Sea Salt'``). When several tokens match,
    the last one is used.

    Parameters
    ----------
    name : str
        Raw product name.

    Returns
    -------
    tuple of (str, float)
        ``(clean_name, weight)``. ``clean_name`` is the name with the weight
        token removed and surrounding whitespace trimmed. When no weight
        token is found, the name is returned unchanged with ``np.nan`` as
        the weight.
    """
    # \b keeps the unit from matching the first letter of a word (e.g. "5 Grain").
    matches = list(re.finditer(r'(\d+)\s*[gG]\b', name))
    if not matches:
        return name, np.nan

    token = matches[-1]
    # Also drop the whitespace right after the token so that a mid-name
    # weight does not leave a double space behind.
    clean_name = (name[:token.start()] + name[token.end():].lstrip()).strip()
    weight = float(token.group(1))
    return clean_name, weight

In [213]:
clean_names = names.apply(extract_weight_from_name).apply(lambda x: x[0].strip())

In [214]:
clean_names.isnull().sum()

0

In [215]:
weights = names.apply(extract_weight_from_name).apply(lambda x: x[1])

In [220]:
pd.DataFrame({'name': clean_names, 'weight' : weights})

Unnamed: 0,name,weight
208928,Doritos Salsa Mild,300.0
119583,Dorito Corn Chp Supreme,380.0
88304,Kettle Original,175.0
73355,Pringles Barbeque,134.0
263140,Cheetos Puffs,165.0
...,...,...
72302,Kettle Tortilla ChpsBtroot&Ricotta,150.0
195514,RRD Sweet Chilli & Sour Cream,165.0
99516,Infuzions BBQ Rib Prawn Crackers,110.0
7902,Infuzions Thai SweetChili PotatoMix,110.0


In [219]:
pd.DataFrame({'name': clean_names, 'weight' : weights}).isnull().sum()

name      0
weight    0
dtype: int64

Since there are no missing values, there's no need to store the weights as float. I will convert to integer

In [221]:
weights = weights.astype(int)

### Add the `clean_names` and `weight` to dataset

In [222]:
data.product_name = clean_names
data['product_weight'] = weights

In [223]:
data.sample(15)

Unnamed: 0,date,store_num,loyalty_card_num,txn_id,product_number,product_name,product_quantity,total_sales,product_weight
198750,2019-04-27,191,191116,192453,75,Cobs Popd Sea Salt Chips,2,7.6,110
251458,2019-04-04,173,173168,174586,14,Smiths Crnkle Chip Orgnl Big Bag,2,11.8,380
212316,2018-10-25,168,168059,169699,108,Kettle Tortilla ChpsHny&Jlpno Chili,2,9.2,150
168629,2019-04-16,229,229157,231296,50,Tostitos Lightly Salted,2,8.8,175
92443,2019-06-27,167,167337,169160,98,NCC Sour Cream & Garden Chives,2,6.0,175
136636,2018-09-16,17,17352,15134,3,Kettle Sensations Camembert & Fig,2,9.2,150
106952,2019-05-12,105,105001,105206,111,Smiths Chip Thinly Cut Original,2,6.0,175
103109,2018-11-23,55,55067,48847,70,Tyrrells Crisps Lightly Salted,2,8.4,165
89731,2018-08-28,125,125076,128583,57,Old El Paso Salsa Dip Tomato Mild,2,10.2,300
180984,2018-09-15,190,190091,190781,57,Old El Paso Salsa Dip Tomato Mild,2,10.2,300


## Clean `product_name`

In [293]:
names = data.product_name

In [294]:
names.sample(20)

87876      Infuzions Thai SweetChili PotatoMix
86977                          Cheezels Cheese
210712                        Doritos Mexicana
208993                      RRD Salt & Vinegar
222756     Infuzions SourCream&Herbs Veg Strws
186110        Smiths Crinkle Cut  Tomato Salsa
165449     Smiths Crinkle Chips Salt & Vinegar
134201       Smiths Crnkle Chip  Orgnl Big Bag
206615        Grain Waves         Sweet Chilli
14176       Red Rock Deli Sp    Salt & Truffle
237644             Dorito Corn Chp     Supreme
60527      Infuzions SourCream&Herbs Veg Strws
98933        Kettle Mozzarella   Basil & Pesto
84967       Doritos Corn Chip Southern Chicken
121794    Sunbites Whlegrn    Crisps Frch/Onin
65937         Grain Waves         Sweet Chilli
179123                         Kettle Original
205779        Smiths Chip Thinly  Cut Original
221670           Smith Crinkle Cut   Bolognese
192950            Doritos Corn Chips  Original
Name: product_name, dtype: object

Make some replacements in the names to expand some abbreviations of ingredients

In [298]:
def replace_abbreviations(name):
    """Expand the known abbreviations that appear in a product name.

    Each abbreviation is replaced wherever it occurs as a substring; the
    replacements are applied one after another in the order listed.

    Parameters
    ----------
    name : str
        Product name to expand.

    Returns
    -------
    str
        The name with every listed abbreviation expanded.
    """
    expansions = (
        ('Chps', 'Chips'),
        ('Chp', 'Chip'),
        ('Hny', 'Honey'),
        ('Jlpno', 'Jalapeno'),
        ('Vinegr', 'Vinegar'),
        ('Tmato', 'Tomato'),
        ('Hrb&Spce', 'Herbs & Spices'),
        ('FriedChicken', 'Fried Chicken'),
        ('Frch/Onin', 'French Onion'),
    )
    # str.replace is a no-op when the abbreviation is absent, so no membership test is needed.
    for short_form, long_form in expansions:
        name = name.replace(short_form, long_form)
    return name

Function to remove whitespaces

In [295]:
def remove_whitespaces(string):
    """Collapse every run of whitespace to a single space and trim both ends.

    Parameters
    ----------
    string : str
        Text to normalise.

    Returns
    -------
    str
        The whitespace-separated words of ``string`` joined by single spaces.
    """
    return ' '.join(string.split())

Function to replace ampersand characters without whitespaces

In [302]:
def replace_ampersand(name):
    """Put a single space between each ``&`` and any text glued to it.

    Every ampersand is handled independently: a space is inserted before it
    when it directly follows a non-space character, and after it when it is
    directly followed by a non-space character. Ampersands that are already
    spaced, or that sit at the very start or end of the name, gain no extra
    space on that side.

    Parameters
    ----------
    name : str
        Product name to fix.

    Returns
    -------
    str
        The name with every ampersand separated from adjacent words.
    """
    # Lookarounds insert the missing space without consuming the neighbouring
    # character, so existing spacing is left untouched.
    name = re.sub(r'(?<=\S)&', ' &', name)
    name = re.sub(r'&(?=\S)', '& ', name)
    return name

Apply all functions to name values

In [303]:
# Same three passes, in the same order: whitespace, abbreviations, ampersands.
names = (
    names
    .apply(remove_whitespaces)
    .apply(replace_abbreviations)
    .apply(replace_ampersand)
)

Check the results and save to `data`

In [304]:
names.sample(20)

199291                  Tostitos Splash Of Lime
49816                 Pringles Sweet & Spcy BBQ
11008      Sunbites Whlegrn Crisps French Onion
101627         Old El Paso Salsa Dip Tomato Med
63667                              Burger Rings
242717       Red Rock Deli Chikn & Garlic Aioli
14099             Tyrrells Crisps Ched & Chives
144581                Pringles Sweet & Spcy BBQ
62321     Kettle Tortilla ChipsBtroot & Ricotta
191867          Red Rock Deli Sp Salt & Truffle
127245                          RRD Pc Sea Salt
86105                  Pringles Mystery Flavour
6341                              Kettle Chilli
51535        Kettle Sweet Chilli And Sour Cream
85410               Pringles Chicken Salt Crips
259735          Smiths Chip Thinly Cut Original
105195       Doritos Corn Chip Southern Chicken
25286          Smiths Crnkle Chip Orgnl Big Bag
222344           Kettle Sensations Siracha Lime
108971                  Kettle Swt Pot Sea Salt
Name: product_name, dtype: object

In [306]:
data.product_name = names

In [307]:
data.sample(10)

Unnamed: 0,date,store_num,loyalty_card_num,txn_id,product_number,product_name,product_quantity,total_sales,product_weight
206184,2019-04-26,43,43092,39218,113,Twisties Chicken,2,9.2,270
21300,2018-09-14,138,138152,141438,114,Kettle Sensations Siracha Lime,2,9.2,150
242059,2019-02-10,47,47479,43012,26,Pringles Sweet & Spcy BBQ,2,7.4,134
31036,2019-04-20,89,89440,88600,99,Pringles Sthrn Fried Chicken,2,7.4,134
230539,2019-02-12,51,51098,46801,23,Cheezels Cheese,1,5.7,330
127206,2019-03-26,145,145344,145630,62,Pringles Mystery Flavour,1,3.7,134
219193,2019-03-17,55,55100,49053,77,Doritos Corn Chips Nacho Cheese,2,8.8,170
105356,2018-09-03,81,81275,80960,59,Old El Paso Salsa Dip Tomato Med,2,10.2,300
198817,2018-08-09,194,194235,194213,42,Doritos Corn Chip Mexican Jalapeno,2,7.8,150
119318,2019-04-15,23,23072,19186,59,Old El Paso Salsa Dip Tomato Med,2,10.2,300


## After the cleaning, everything looks fine.

I will repeat this cleaning on the whole dataset to create the full clean copy; in this notebook I only used a sample 40% the size of the raw data.