# 06 Relationships between customer behaviors

In [1]:
import json
import os
from pprint import pprint

import pandas as pd
import sqlalchemy

In [2]:
# Connect to the churn database with SQLAlchemy.
# The connection URL can be overridden through the CHURN_DB_URL environment
# variable so that real credentials never have to be written into the notebook;
# the fallback is the original local development database URL.
conn_string = os.environ.get(
    "CHURN_DB_URL",
    "postgresql://postgres-db/churn?user=postgres&password=password",
)
engine = sqlalchemy.create_engine(
    conn_string,
    # Passes search_path=socialnet7,public to the server for every connection,
    # so the queries below can use unqualified table names.
    connect_args={"options": "-csearch_path={}".format("socialnet7,public")},
)
conn = engine.connect()

# Query with pandas, e.g. list all tables visible to this connection.
tables = pd.read_sql_query("SELECT * FROM information_schema.tables;", conn)
tables.head(3)

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,churn,socialnet7,active_period,BASE TABLE,,,,,,YES,NO,
1,churn,socialnet7,event_type,BASE TABLE,,,,,,YES,NO,
2,churn,socialnet7,metric_name,BASE TABLE,,,,,,YES,NO,


In [3]:
# Load the listings configuration and keep the chapter 4 and chapter 6
# sections under short names for the cells below.
with open("/app/fightchurn/listings/conf/socialnet7_listings.json") as f:
    conf = json.load(f)

chap4 = conf["chap4"]
chap6 = conf["chap6"]

## Dependent Data

- Listing 4.5
- 4.6.1 Dataset creation SQL program

In [4]:
# Load every row of the observation table, ordered by account and then by date.
obs = pd.read_sql_query(
    sql="SELECT * FROM observation ORDER BY account_id, observation_date",
    con=conn,
)
display(obs.shape)
obs.head()

(31333, 3)

Unnamed: 0,account_id,observation_date,is_churn
0,1,2020-02-23,False
1,1,2020-03-23,False
2,1,2020-04-23,False
3,2,2020-03-08,True
4,4,2020-02-24,False


In [11]:
# Draw a random sample of 10,000 metric rows for a quick look at the table.
# RANDOM() is seeded first (setseed applies to the current database session) so
# that a fresh Restart & Run All shows the same sample. The result still depends
# on the order in which PostgreSQL scans the table, so it can change if the
# table itself is modified.
conn.execute(sqlalchemy.text("SELECT setseed(0.42)"))
metric = pd.read_sql_query("SELECT * FROM metric ORDER BY RANDOM() LIMIT 10000", conn)
metric.head()

Unnamed: 0,account_id,metric_time,metric_name_id,metric_value
0,1123,2020-03-22,8,11.0
1,11044,2020-05-10,8,72.0
2,4201,2020-04-19,8,163.0
3,10712,2020-02-16,8,68.0
4,8094,2020-03-01,8,1416.0


In [5]:
# Count how many metric rows exist for each metric id.
# ORDER BY makes the row order deterministic; GROUP BY alone does not guarantee one.
tmp = pd.read_sql_query(
    "SELECT metric_name_id, COUNT(*) FROM metric "
    "GROUP BY metric_name_id ORDER BY metric_name_id",
    conn,
)
tmp

Unnamed: 0,metric_name_id,count
0,0,163450
1,1,144501
2,2,160936
3,3,160510
4,4,154133
5,5,39979
6,6,155804
7,7,124663
8,8,158237


In [6]:
# Load the metric_name table and preview its first rows.
metric_name = pd.read_sql_query(sql="SELECT * FROM metric_name", con=conn)
metric_name.head()

Unnamed: 0,metric_name_id,metric_name
0,0,like_per_month
1,1,newfriend_per_month
2,2,post_per_month
3,3,adview_per_month
4,4,dislike_per_month


### Listing 4.5

- 4.6.1 Dataset creation SQL program

In [7]:
# Pull the three values interpolated into the dataset query out of the
# chapter 4 configuration: the metric window and the observation date range.
metric_interval, from_yyyy_mm_dd, to_yyyy_mm_dd = (
    chap4["list6"]["params"]["%metric_interval"],
    chap4["defaults"]["%from_yyyy-mm-dd"],
    chap4["defaults"]["%to_yyyy-mm-dd"],
)

(metric_interval, from_yyyy_mm_dd, to_yyyy_mm_dd)

('7 day', '2020-02-09', '2020-05-10')

In [8]:
# Build the analytic dataset by joining metrics to churn observations.
# - observation_params holds the metric window length and the overall date range,
#   filled in from metric_interval, from_yyyy_mm_dd and to_yyyy_mm_dd.
# - Only metric rows with metric_time between obs_start and obs_end are used.
# - A metric row is matched to an observation of the same account when its
#   metric_time lies in (observation_date - metric_period, observation_date].
# - Each SUM(CASE ...) pivots one metric_name_id (0-8) into its own named column.
# - The result has one row per (account_id, metric_time, observation_date, is_churn).
# NOTE(review): because metric_time is part of the GROUP BY, an observation whose
# window contains several metric_time values would yield several rows — confirm
# that the configured metric_interval matches the cadence of the metric table.
# NOTE(review): the values are interpolated with an f-string; this presumably is
# acceptable because they come from the local config file — do not feed it
# untrusted input.
query = f"""\

WITH observation_params AS (
    SELECT  
        interval '{metric_interval}' AS metric_period,
        '{from_yyyy_mm_dd}'::timestamp AS obs_start,
        '{to_yyyy_mm_dd}'::timestamp AS obs_end
)

    SELECT 
        metric.account_id, 
        obs.observation_date, 
        obs.is_churn,
        SUM(CASE WHEN metric_name_id=0 THEN metric_value ELSE 0 END) AS like_per_month,
        SUM(CASE WHEN metric_name_id=1 THEN metric_value ELSE 0 END) AS newfriend_per_month,
        SUM(CASE WHEN metric_name_id=2 THEN metric_value ELSE 0 END) AS post_per_month,
        SUM(CASE WHEN metric_name_id=3 THEN metric_value ELSE 0 END) AS adview_per_month,
        SUM(CASE WHEN metric_name_id=4 THEN metric_value ELSE 0 END) AS dislike_per_month,
        SUM(CASE WHEN metric_name_id=5 THEN metric_value ELSE 0 END) AS unfriend_per_month,
        SUM(CASE WHEN metric_name_id=6 THEN metric_value ELSE 0 END) AS message_per_month,
        SUM(CASE WHEN metric_name_id=7 THEN metric_value ELSE 0 END) AS reply_per_month,
        SUM(CASE WHEN metric_name_id=8 THEN metric_value ELSE 0 END) AS account_tenure
      FROM metric
INNER JOIN observation_params AS params
        ON metric_time BETWEEN params.obs_start AND params.obs_end    
INNER JOIN observation AS obs
        ON metric.account_id = obs.account_id
       AND metric.metric_time > (obs.observation_date - params.metric_period)::timestamp    
       AND metric.metric_time <= obs.observation_date::timestamp
  GROUP BY metric.account_id, metric.metric_time, obs.observation_date, obs.is_churn    
  ORDER BY obs.observation_date, metric.account_id

"""
# The SQL orders by observation date first; re-sort by account and then date.
# sort_values keeps the original row labels, so the index is not 0..n-1 afterwards.
res = pd.read_sql_query(query, conn).sort_values(["account_id", "observation_date"])
res.head(6)

Unnamed: 0,account_id,observation_date,is_churn,like_per_month,newfriend_per_month,post_per_month,adview_per_month,dislike_per_month,unfriend_per_month,message_per_month,reply_per_month,account_tenure
4112,1,2020-02-23,False,2.0,0.0,6.0,1.0,3.0,1.0,6.0,6.0,26.0
14104,1,2020-03-23,False,2.0,1.0,6.0,2.0,2.0,2.0,7.0,4.0,54.0
24621,1,2020-04-23,False,1.0,1.0,2.0,4.0,4.0,1.0,7.0,3.0,82.0
9178,2,2020-03-08,True,67.0,7.0,49.0,124.0,120.0,0.0,0.0,0.0,55.0
4398,4,2020-02-24,False,36.0,4.0,27.0,36.0,5.0,1.0,32.0,8.0,25.0
14425,4,2020-03-24,False,37.0,8.0,39.0,40.0,9.0,0.0,39.0,3.0,53.0


- Each row is an observation (snapshot) of an account before a subscription ends.

In [9]:
# Work on an independent (deep) copy so later changes to df leave res untouched.
df = res.copy(deep=True)
df.shape

(31333, 12)

## Correlations

In [7]:
# Show the chapter 6 listing parameters (chap6 was bound to conf["chap6"] when
# the configuration was loaded).
pprint(chap6)

{'defaults': {'data_set_path': 'socialnet7/socialnet7_dataset.csv',
              'type': 'py'},
 'list1': {'name': 'metric_pair_plot',
           'params': {'metric1': 'post_per_month', 'metric2': 'like_per_month'},
           'v1': {'data_set_path': 'socialnet7/socialnet7_dataset_scores.csv'},
           'v10': {'data_set_path': 'socialnet7/socialnet7_dataset_scores.csv',
                   'metric2': 'dislike_per_month'},
           'v11': {'metric2': 'unfriend_per_month'},
           'v12': {'data_set_path': 'socialnet7/socialnet7_dataset_scores.csv',
                   'metric2': 'unfriend_per_month'},
           'v13': {'metric2': 'reply_per_month'},
           'v14': {'data_set_path': 'socialnet7/socialnet7_dataset_scores.csv',
                   'metric2': 'reply_per_month'},
           'v15': {'metric2': 'account_tenure'},
           'v16': {'data_set_path': 'socialnet7/socialnet7_dataset_scores.csv',
                   'metric2': 'account_tenure'},
           'v17': {'data_se

In [10]:
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
def metric_pair_plot(data_set_path, metric1='',metric2=''):
    """Scatter-plot two metrics of a dataset against each other.

    Reads the CSV at ``data_set_path`` (its first two columns become the
    index), draws ``metric1`` on the x axis and ``metric2`` on the y axis
    using the current pyplot figure, and shows their correlation in the title.

    Parameters
    ----------
    data_set_path : path of the dataset CSV file.
    metric1 : name of the column plotted on the x axis.
    metric2 : name of the column plotted on the y axis.
    """
    dataset = pd.read_csv(data_set_path, index_col=[0, 1])
    x_values, y_values = dataset[metric1], dataset[metric2]

    # Series.corr uses the Pearson method by default.
    correlation = x_values.corr(y_values)

    plt.scatter(x_values, y_values, marker='.')

    # Label both axes with the metric names and report the correlation on top.
    plt.xlabel(metric1)
    plt.ylabel(metric2)
    plt.title('Correlation = %.2f' % correlation)
    plt.tight_layout()
    plt.grid()