# Starbucks Customer Segmentation

## I. Outline

## II. Introduction

## III. Libraries

In [348]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## IV. Data Loading

In [349]:
# Read the three raw datasets. Each file is newline-delimited JSON
# (one record per line), hence orient='records' with lines=True.
portfolio, profile, transcript = (
    pd.read_json(json_path, orient='records', lines=True)
    for json_path in (
        'data/portfolio.json',
        'data/profile.json',
        'data/transcript.json',
    )
)

## V. Data Cleaning

In [380]:
# Preview the first five rows of the offer portfolio.
portfolio.head(n=5)

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [382]:
# Column dtypes and non-null counts for `portfolio`.
portfolio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reward      10 non-null     int64 
 1   channels    10 non-null     object
 2   difficulty  10 non-null     int64 
 3   duration    10 non-null     int64 
 4   offer_type  10 non-null     object
 5   id          10 non-null     object
dtypes: int64(3), object(3)
memory usage: 608.0+ bytes


In [351]:
# Preview the first five customer profiles.
# NOTE(review): in the saved preview, every row with missing gender/income
# shows age 118 -- this looks like a placeholder value; confirm before using age.
profile.head(n=5)

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [383]:
# Column dtypes and non-null counts for `profile`.
# In the saved output, `gender` and `income` are each non-null for 14825 of the
# 17000 rows; every other column is fully populated.
# NOTE(review): `became_member_on` is stored as int64 and looks like YYYYMMDD -- confirm.
profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            14825 non-null  object 
 1   age               17000 non-null  int64  
 2   id                17000 non-null  object 
 3   became_member_on  17000 non-null  int64  
 4   income            14825 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 664.2+ KB


In [381]:
# Preview the first five event records.
# NOTE(review): the saved output lists offer_id / amount / reward rather than the
# raw `value` column that the transform cell further down reads, so it appears to
# have been captured after that cell ran. On a fresh top-to-bottom run this
# preview would show `value` instead.
transcript.head(n=5)

Unnamed: 0,person,event,time,offer_id,amount,reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,0.0,0
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,0.0,0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,0.0,0
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,4d5c57ea9a6940dd891ad53e9dbe8da0,0.0,0


In [384]:
# Column dtypes and non-null counts for `transcript`.
# NOTE(review): the saved output shows the offer_id / amount / reward columns and
# no `value` column, i.e. the state after the transform cell further down -- it
# looks like this cell was executed out of order.
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   person    306534 non-null  object 
 1   event     306534 non-null  object 
 2   time      306534 non-null  int64  
 3   offer_id  306534 non-null  object 
 4   amount    306534 non-null  float64
 5   reward    306534 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 14.0+ MB


### V.I Data Transformation

In [354]:
# Gather every key that occurs in the dict-valued `value` column.
# This cell needs the raw `value` column, which the following transform cell removes.
keys = [key for payload in transcript['value'] for key in payload.keys()]

# Reduce to the distinct key names and show them.
value_keys = set(keys)
print(value_keys)

{'offer_id', 'offer id', 'amount', 'reward'}


In [355]:
# Flatten the dict-valued `value` column of `transcript` into one column per key.
# Keys missing from a record are filled with 0 (same sentinel as before).
#
# The whole step is guarded: once `value` has been expanded and dropped, re-running
# this cell is a no-op instead of raising KeyError on the missing column.
if 'value' in transcript.columns:
  transcript_flat = transcript.drop(columns=['value'])

  # `value_keys` is a set, so its iteration order may differ between kernel
  # sessions; sorting keeps the order of the new columns reproducible.
  for key in sorted(value_keys):
    transcript_flat[key] = transcript['value'].apply(lambda payload, key=key: payload.get(key, 0))

  # combine the two spellings of the offer key ('offer id' and 'offer_id'),
  # preferring 'offer id' wherever it was present (i.e. not the 0 filler)
  transcript_flat['offer_id'] = np.where(
    transcript_flat['offer id'] != 0,
    transcript_flat['offer id'],
    transcript_flat['offer_id'],
  )

  # drop the now-redundant 'offer id' column and rebind `transcript`, rather than
  # mutating the frame in place, so it is never left half-transformed
  transcript = transcript_flat.drop(columns=['offer id'])

In [363]:
# Attach each customer's profile to their events (profile.id == transcript.person).
df_customer = pd.merge(
    profile,
    transcript,
    how="inner",
    left_on='id',
    right_on='person',
)

# Attach offer metadata to each event (offer_id == portfolio.id).
# NOTE(review): because this is an inner join, events whose offer_id has no match
# in `portfolio` (e.g. rows that received the 0 filler) are dropped -- confirm
# that this is intended.
# NOTE(review): both frames carry `id` and `reward`, so the result contains
# id_x / id_y and reward_x / reward_y in addition to `person` and `offer_id`.
df = pd.merge(
    df_customer,
    portfolio,
    how="inner",
    left_on='offer_id',
    right_on='id',
)

In [364]:
# Preview the first five rows of the fully merged frame.
df.head(n=5)

Unnamed: 0,gender,age,id_x,became_member_on,income,person,event,time,offer_id,amount,reward_x,reward_y,channels,difficulty,duration,offer_type,id_y
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,,68be06ca386d4c31939f3a4f0e3dd783,offer received,168,2906b810c7d4411798c6938adc9daaa5,0.0,0,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5
1,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,,68be06ca386d4c31939f3a4f0e3dd783,offer viewed,216,2906b810c7d4411798c6938adc9daaa5,0.0,0,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5
2,M,68,e2127556f4f64592b11af22de27a7932,20180426,70000.0,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,0.0,0,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5
3,M,68,e2127556f4f64592b11af22de27a7932,20180426,70000.0,e2127556f4f64592b11af22de27a7932,offer viewed,18,2906b810c7d4411798c6938adc9daaa5,0.0,0,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5
4,,118,8ec6ce2a7e7949b1bf142def7d0e0586,20170925,,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,408,2906b810c7d4411798c6938adc9daaa5,0.0,0,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5


### V.II Handling Missing Values

## VI. Exploratory Data Analysis

## VII. Data Preprocessing

## VIII. Modeling

## IX. Evaluation

## X. Model Tuning

## XI. Conclusion