# Machine Learning Capstone Project - Starbucks


Importing necessary libraries and datasets

In [59]:
import pandas as pd
import numpy as np
import math
import json


# Load the three raw datasets; each file is read as line-delimited JSON records.
portfolio, profile, transcript = (
    pd.read_json(path, orient='records', lines=True)
    for path in (
        'data/portfolio.json',
        'data/profile.json',
        'data/transcript.json',
    )
)

## Exploratory Data Analysis

In this first section we explore the three datasets at hand.
We will also use plots to get a sense of what data is available and how we might want to use it.
The goal of this phase is to gain insight into the datasets and identify potential issues to tackle.

### Portfolio Dataset 

In [43]:
# Preview the first five offers in the portfolio.
portfolio.head()

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [47]:
# (rows, columns) of the portfolio frame.
portfolio.shape

(10, 6)

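The portfolio has information about the offers Starbucks put forward. We see that 2 columns need to be encoded: `channels` (a list of channels per offer, so it needs one indicator column per channel) and `offer_type` (a single category per offer, so it can be one-hot encoded).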


In [53]:
# Which distinct channels appear across all offers?
# Each entry of `channels` is a list, so flatten the lists into a single set.
{channel for offer_channels in portfolio['channels'] for channel in offer_channels}

{'email', 'mobile', 'social', 'web'}

In [54]:
# Distinct offer types present in the portfolio.
portfolio['offer_type'].unique()

array(['bogo', 'informational', 'discount'], dtype=object)

### Profile Dataset

The profile dataset has information about our customers. 

In [65]:
# Preview the first five customer profiles.
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [66]:
# (rows, columns) of the profile frame.
profile.shape

(17000, 5)

In [68]:
# Count missing values per column. In the preview above, the rows that lack
# gender and income carry the placeholder age 118, which would explain why
# `age` itself reports no nulls.
profile.isna().sum()

gender              2175
age                    0
id                     0
became_member_on       0
income              2175
dtype: int64

In [69]:
# Inspect the rows with the placeholder age 118: in the rows displayed, gender and
# income are missing, so these records are dropped before visualizing distributions.
profile.loc[profile['age'].eq(118)]

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,
6,,118,8ec6ce2a7e7949b1bf142def7d0e0586,20170925,
7,,118,68617ca6246f4fbc85e91a2a49552598,20171002,
...,...,...,...,...,...
16980,,118,5c686d09ca4d475a8f750f2ba07e0440,20160901,
16982,,118,d9ca82f550ac4ee58b6299cf1e5c824a,20160415,
16989,,118,ca45ee1883624304bac1e4c8a114f045,20180305,
16991,,118,a9a20fa8b5504360beb4e7c8712f8306,20160116,


In [71]:
# Drop the placeholder-age rows (age == 118); in the output above these rows have
# no gender or income.
# .copy() makes the result an independent frame, so adding or transforming columns
# on `profile` later does not raise SettingWithCopyWarning.
profile = profile.loc[profile['age'] != 118].copy()
# Still to do: one-hot encode gender, and possibly turn "became_member_on" into a
# customer-tenure feature, such as the number of days since the customer joined.

## Data Pre-processing

In this section we will combine the datasets and tackle the issues we identified in our EDA, including data cleaning.

The goal at the end of this section is to have a single dataset for our Supervised Learning models.

### Portfolio Dataset 

In [60]:
# One-hot encode offer_type. drop_first=True drops the first category ('bogo'),
# so a row with both indicator columns at 0 is a bogo offer.
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# default to booleans (True/False).
portfolio = pd.get_dummies(portfolio, columns=['offer_type'], drop_first=True, dtype=int)
portfolio.head(3)

Unnamed: 0,reward,channels,difficulty,duration,id,offer_type_discount,offer_type_informational
0,10,"[email, mobile, social]",10,7,ae264e3637204a6fb9bb56bc8210ddfd,0,0
1,10,"[web, email, mobile, social]",10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,0,0
2,0,"[web, email, mobile]",0,4,3f207df678b143eea3cee63160fa8bed,0,1


In [62]:
# Encode channels: one 0/1 indicator column per channel.
# `channels` holds a list per offer, so membership is tested on that column
# directly instead of applying a function to every full row. No separate
# zero-initialisation pass is needed because each column is assigned in full.
channels = ['email', 'mobile', 'social', 'web']
for channel in channels:
    portfolio[channel] = portfolio['channels'].apply(
        lambda offer_channels: 1 if channel in offer_channels else 0
    )


In [64]:
# Remove the raw list column `channels` and preview the result.
portfolio = portfolio.drop('channels', axis=1)
portfolio.head()

Unnamed: 0,reward,difficulty,duration,id,offer_type_discount,offer_type_informational,email,mobile,social,web
0,10,10,7,ae264e3637204a6fb9bb56bc8210ddfd,0,0,1,1,1,0
1,10,10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,0,0,1,1,1,1
2,0,0,4,3f207df678b143eea3cee63160fa8bed,0,1,1,1,0,1
3,5,5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,0,1,1,0,1
4,5,20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,1,0,0,1


In [48]:
# NOTE(review): the output below still shows age-118 rows, and this cell's execution
# count (48) is lower than that of the cell that filters them out (71), so the output
# appears stale — re-run the notebook top to bottom to refresh it.
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [49]:
# NOTE(review): (17000, 5) matches the unfiltered profile, so this output appears to
# predate the removal of the age-118 rows — re-run to refresh.
profile.shape

(17000, 5)

In [45]:
# Preview the first five transcript events; in the rows shown, `value` is a dict
# keyed by 'offer id'.
transcript.head()

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


In [46]:
# (rows, columns) of the transcript frame.
transcript.shape

(306534, 4)