# 06 Relationships between customer behaviors

In [1]:
import json
import sqlalchemy
import pandas as pd

from pprint import pprint

In [2]:
# Open a SQL connection to the `churn` PostgreSQL database with SQLAlchemy.
# NOTE(review): the user name and password are embedded in the connection URL;
# presumably this targets a local development container — confirm before
# sharing the notebook or pointing it at a real database.
conn_string = "postgresql://postgres-db/churn?user=postgres&password=password" 
engine = sqlalchemy.create_engine(conn_string)
conn = engine.connect()  # left open: the query cells below reuse this connection

# Query with pandas, e.g. list all tables known to the catalog and preview
# the first three rows.
tables = pd.read_sql_query("SELECT * FROM information_schema.tables;", conn)
tables.head(3)

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,churn,socialnet7,active_period,BASE TABLE,,,,,,YES,NO,
1,churn,socialnet7,event_type,BASE TABLE,,,,,,YES,NO,
2,churn,socialnet7,metric_name,BASE TABLE,,,,,,YES,NO,


In [3]:
# Parse the listing configuration straight from the open file handle.
with open("/app/fightchurn/listings/conf/socialnet7_listings.json", "r") as f:
    conf = json.load(f)

# Keep the per-chapter parameter sections at hand for the cells below.
chap4 = conf["chap4"]
chap6 = conf["chap6"]

## Dependent Data

In [4]:
# Pretty-print the chapter 4 listing parameters read from the configuration
# file, so the defaults and per-listing overrides used below are visible.
pprint(chap4)

{'defaults': {'%from_yyyy-mm-dd': '2020-02-09',
              '%to_yyyy-mm-dd': '2020-05-10',
              'mode': 'run',
              'type': 'sql'},
 'list1': {'name': 'ongoing_active_periods',
           'params': {'%gap_interval': '7 day'}},
 'list2': {'name': 'churned_periods', 'params': {'%gap_interval': '7 day'}},
 'list3': {'name': 'active_event_weeks', 'params': {}},
 'list4': {'name': 'observation_dates',
           'params': {'%lead_time': '5 day', '%obs_interval': '1 month'}},
 'list5': {'name': 'dataset',
           'params': {'%metric_interval': '7 day', 'mode': 'save'}},
 'list6': {'name': 'current_customers',
           'params': {'%from_yyyy-mm-dd': '2020-05-10',
                      '%metric_interval': '7 day',
                      '%to_yyyy-mm-dd': '2020-05-10',
                      'mode': 'save'}}}


In [8]:
# Show the catalog row(s) describing the subscription table.
tmp = ["subscription"]
tables.loc[tables["table_name"].isin(tmp)]

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
8,churn,socialnet7,subscription,BASE TABLE,,,,,,YES,NO,


In [9]:
# Load every row of the socialnet7 subscription table into a DataFrame and
# preview the first rows to see its columns.
subs = pd.read_sql_query("SELECT * FROM socialnet7.subscription", conn)
subs.head()

Unnamed: 0,id,account_id,product,start_date,end_date,mrr,quantity,units,bill_period_months
0,0,1,'socialnet7',2020-01-28,2020-02-28,9.99,,,1
1,1,1,'socialnet7',2020-02-28,2020-03-28,9.99,,,1
2,2,1,'socialnet7',2020-03-28,2020-04-28,9.99,,,1
3,3,1,'socialnet7',2020-04-28,2020-05-28,9.99,,,1
4,4,1,'socialnet7',2020-05-28,2020-06-28,9.99,,,1


In [7]:
# listing_4_1_ongoing_active_periods.sql
#
# Finds, for every account with a subscription active on the calculation date,
# the earliest start date of its current unbroken run of subscriptions.
# The recursive CTE `active` is seeded with the accounts active on calc_date
# and then repeatedly pulls in older subscriptions of the same account whose
# end date falls within `allowed_gap` of the start date found so far.
# The INSERT line inside the query is commented out, so this cell only reads
# from the database; churn_date is returned as NULL for every row.
#
# Both parameters come from the chapter 4 section of the configuration file.
# NOTE(review): they are interpolated into the SQL text by the f-string, not
# bound as query parameters — fine for a trusted local config file, but do not
# feed user-supplied values through this path.
gap_interval = chap4["list1"]["params"]["%gap_interval"]
to_yyyy_mm_dd = chap4["defaults"]["%to_yyyy-mm-dd"]

query = f"""\
WITH RECURSIVE active_period_params AS (
    SELECT
        interval '{gap_interval}' AS allowed_gap,
        '{to_yyyy_mm_dd}'::date AS calc_date
),
active AS (
  -- find all accounts that are currently active
    SELECT
  DISTINCT
        account_id,
        min(start_date) AS start_date
      FROM socialnet7.subscription
INNER JOIN active_period_params AS params
        ON start_date <= params.calc_date
       AND (end_date > params.calc_date OR end_date IS null)
  GROUP BY account_id

     UNION
  -- find earlier subscriptions that overlapped with or are continuous with
  -- but older than the subscriptions currently found
    SELECT
        subs.account_id,
        subs.start_date
      FROM socialnet7.subscription AS subs
CROSS JOIN active_period_params AS params
INNER JOIN active -- self, i.e. recursive join
        ON subs.account_id = active.account_id
       AND subs.start_date < active.start_date
       AND subs.end_date >= (active.start_date - params.allowed_gap)::date
)

-- INSERT INTO active_period (account_id, start_date, churn_date)
     SELECT
        account_id,
        min(start_date) AS start_date,
        NULL::date as churn_date
       FROM active
   GROUP BY account_id, churn_date

""" 
# print(query)
# Run the query on the shared connection and preview the first rows.
result = pd.read_sql_query(query, conn)
result.head()

Unnamed: 0,account_id,start_date,churn_date
0,10688,2020-02-04,
1,12614,2020-04-27,
2,12380,2020-04-16,
3,10511,2020-02-14,
4,12819,2020-04-24,


In [2]:
import pandas as pd
import matplotlib.pyplot as plt

def metric_pair_plot(data_set_path, metric1='', metric2=''):
    """Scatter-plot two metrics of a data set and save the figure as a PNG.

    The CSV at ``data_set_path`` is read with its first two columns as the
    index. The two metric columns are plotted against each other, and their
    correlation coefficient (``Series.corr`` with its default method) is shown
    in the plot title, rounded to two decimals.

    Parameters
    ----------
    data_set_path : str
        Path of the CSV file to read. The plot is written next to it as
        ``<path without trailing .csv>_<metric1>_vs_<metric2>.png``.
    metric1 : str
        Name of the column plotted on the x axis.
    metric2 : str
        Name of the column plotted on the y axis.

    Returns
    -------
    None
        The function works through side effects: it writes the PNG file and
        prints the path it saved to.

    Raises
    ------
    KeyError
        If ``metric1`` or ``metric2`` is not a column of the data set
        (raised by the pandas column lookup).
    """
    churn_data = pd.read_csv(data_set_path, index_col=[0, 1])

    met1_series = churn_data[metric1]
    met2_series = churn_data[metric2]
    corr = met1_series.corr(met2_series)

    # Strip only a trailing '.csv'. A plain str.replace would also rewrite
    # '.csv' inside directory names, and for a path without '.csv' it would
    # return the input path unchanged, making the plot target the data file.
    csv_suffix = '.csv'
    if data_set_path.endswith(csv_suffix):
        base_name = data_set_path[:-len(csv_suffix)]
    else:
        base_name = data_set_path
    save_name = base_name + '_' + metric1 + '_vs_' + metric2 + '.png'

    # Draw on a dedicated figure so artists left on pyplot's current figure
    # by earlier cells cannot leak into this plot.
    fig, ax = plt.subplots()
    try:
        ax.scatter(met1_series, met2_series, marker='.')
        ax.set_xlabel(metric1)
        ax.set_ylabel(metric2)
        ax.set_title('Correlation = %.2f' % corr)
        ax.grid()
        fig.tight_layout()

        fig.savefig(save_name)
        print('Saving plot to %s' % save_name)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
