# Starbucks

The project uses the [CRISP-DM](https://en.wikipedia.org/wiki/Cross-industry_standard_process_for_data_mining) process and is divided into the following parts:

## 1. Business understanding

Body.

## 2. Data understanding

### 2.1. Importing libraries

In [56]:
import numpy as np
import pandas as pd
import json

import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use("./style/minimal.mplstyle")

import warnings
warnings.filterwarnings("ignore")

%config InlineBackend.figure_format="retina"
%matplotlib inline

### 2.2. Loading data

In [22]:
portfolio = pd.read_json("data/portfolio.json", orient="records", lines=True)
profile = pd.read_json("data/profile.json", orient="records", lines=True)
transcript = pd.read_json("data/transcript.json", orient="records", lines=True)

In [23]:
print("Portfolio shape: ", portfolio.shape)
print("Profile shape:   ", profile.shape)
print("Transcript shape:", transcript.shape)

Portfolio shape:  (10, 6)
Profile shape:    (17000, 5)
Transcript shape: (306534, 4)


In [24]:
portfolio.sort_values(by=["offer_type", "difficulty", "duration"])

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
8,5,"[web, email, mobile, social]",5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
5,3,"[web, email, mobile, social]",7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2
9,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5
6,2,"[web, email, mobile, social]",10,10,discount,fafdcd668e3743c1bb461111dcafc2a4
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7
7,0,"[email, mobile, social]",0,3,informational,5a8bc65990b245e5a138643cd4eb9837
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed


In [25]:
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [26]:
transcript.head()

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


In [64]:
transcript.describe()

Unnamed: 0,time
count,306534.0
mean,366.38294
std,200.326314
min,0.0
25%,186.0
50%,408.0
75%,528.0
max,714.0


In [55]:
time_min = transcript.describe().loc["min", :]
time_max = transcript.describe().loc["max", :]

print(f"Transcript has {int(round((time_max - time_min)/24))} days of data")

Transcript has 30 days of data


### 2.3. Adjusting data types

In [59]:
profile["became_member_on"] = pd.to_datetime(profile["became_member_on"].astype(str), format="%Y%m%d")

In [60]:
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,
1,F,55,0610b486422d4921ae7d2bf64640c50b,2017-07-15,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,2018-07-12,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,2017-08-04,


In [68]:
profile["cohort"].value_counts()

2017-10    900
2017-12    900
2017-8     886
2017-9     836
2017-11    816
          ... 
2013-12     52
2014-12     50
2014-10     49
2013-11     48
2013-7       5
Name: cohort, Length: 61, dtype: int64

In [75]:
profile["year"] = profile["became_member_on"].dt.year
profile["month"] = profile["became_member_on"].dt.month
profile["cohort"] = profile["became_member_on"].dt.strftime('%Y-%m').astype(str)

In [76]:
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income,year,month,cohort
0,,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,,2017,2,2017-02
1,F,55,0610b486422d4921ae7d2bf64640c50b,2017-07-15,112000.0,2017,7,2017-07
2,,118,38fe809add3b4fcf9315a9694bb96ff5,2018-07-12,,2018,7,2018-07
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,100000.0,2017,5,2017-05
4,,118,a03223e636434f42ac4c3df47e8bac43,2017-08-04,,2017,8,2017-08


### 2.4. Dividing the data

In [27]:
# Showing all event types
transcript["event"].value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [28]:
# Showing the first line of each event type
transcript.iloc[[0, 12650, 12654, 12658], :]

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
12650,389bc3fa690240e798340f5a15918d5c,offer viewed,{'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'},0
12654,02c083884c7d45b39cc68e1314fec56c,transaction,{'amount': 0.8300000000000001},0
12658,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,offer completed,{'offer_id': '2906b810c7d4411798c6938adc9daaa5...,0


In [29]:
# New dataframe shapes
offer = transcript[transcript["event"] != "transaction"]
transaction = transcript[transcript["event"] == "transaction"]

print("Offer shape:      ", offer.shape)
print("Transaction shape:", transaction.shape)

Offer shape:       (167581, 4)
Transaction shape: (138953, 4)


In [57]:
offer["offer_id"] = offer["value"].apply(lambda x: x.get("offer id"))
transaction["amount"] = transaction["value"].apply(lambda x: x.get("amount"))

In [31]:
offer.head(1)

Unnamed: 0,person,event,value,time,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0,9b98b8c7a33c4b65b9aebfe6a799e6d9


In [32]:
transaction.head(1)

Unnamed: 0,person,event,value,time,amount
12654,02c083884c7d45b39cc68e1314fec56c,transaction,{'amount': 0.8300000000000001},0,0.83


### 2.5. Exploratory data analysis

In [33]:
profile.describe()

Unnamed: 0,age,became_member_on,income
count,17000.0,17000.0,14825.0
mean,62.531412,20167030.0,65404.991568
std,26.73858,11677.5,21598.29941
min,18.0,20130730.0,30000.0
25%,45.0,20160530.0,49000.0
50%,58.0,20170800.0,64000.0
75%,73.0,20171230.0,80000.0
max,118.0,20180730.0,120000.0


## 3. Data preparation

## 4. Modelling

## 5. Evaluation

## 6. Deployment