# 07 Segmenting customers with advanced metrics

In [1]:
import json
import os
from pprint import pprint

import pandas as pd
import sqlalchemy

In [2]:
# Make a SQL connection with SQLAlchemy.
# The connection URL can be overridden with the CHURN_DB_URL environment variable
# so that credentials do not have to live in the notebook; when the variable is
# not set, the URL this notebook originally hard-coded is used unchanged.
DEFAULT_CONN_STRING = "postgresql://postgres-db/churn?user=postgres&password=password"
conn_string = os.environ.get("CHURN_DB_URL", DEFAULT_CONN_STRING)
engine = sqlalchemy.create_engine(
    conn_string, connect_args={'options': '-csearch_path={}'.format("socialnet7,public")}
)
conn = engine.connect()

# Query with Pandas, e.g. list all tables
tables = pd.read_sql_query("SELECT * FROM information_schema.tables;", conn)
tables.head(3)

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,churn,socialnet7,active_period,BASE TABLE,,,,,,YES,NO,
1,churn,socialnet7,event_type,BASE TABLE,,,,,,YES,NO,
2,churn,socialnet7,metric_name,BASE TABLE,,,,,,YES,NO,


In [3]:
import json
from box import Box

# Parse the listings configuration and wrap it in a Box so that nested keys can
# be reached as attributes (e.g. conf.chap8) as well as by subscript.
with open("/app/fightchurn/listings/conf/socialnet7_listings.json", "r") as conf_file:
    conf = Box(json.load(conf_file))

chap8 = conf.chap8

## Dependent Data

In [11]:
# List the metric catalogue ordered by id and display its last rows.
tmp = pd.read_sql_query("SELECT * FROM metric_name ORDER BY metric_name_id", conn)
tmp.tail()

Unnamed: 0,metric_name_id,metric_name
16,28,unfriend_per_newfriend_scaled
17,30,new_friends_pcnt_change
18,31,days_since_newfriend
19,33,unfriend_28day_avg_84day_obs
20,34,unfriend_28day_avg_84day_obs_scaled


In [12]:
# Pull up to 10,000 metric rows picked with ORDER BY RANDOM(); the sample, and
# therefore the rows displayed below, differs on every run.
tmp = pd.read_sql_query("SELECT * FROM metric ORDER BY RANDOM() LIMIT 10000", conn)
tmp.head()

Unnamed: 0,account_id,metric_time,metric_name_id,metric_value
0,3583,2020-04-19,3,2.0
1,1605,2020-03-01,30,1.0
2,5384,2020-03-29,25,0.0
3,974,2020-05-03,4,42.0
4,4812,2020-03-15,21,0.195122


## Metric Queries

### Listing 8.0 Observations

In [16]:
# Observation window bounds and metric interval for listing 8.0, read from the config.
list0_params = chap8.list0.params
from_yyyy_mm_dd, to_yyyy_mm_dd, metric_interval = (
    list0_params[key]
    for key in ("%from_yyyy-mm-dd", "%to_yyyy-mm-dd", "%metric_interval")
)
from_yyyy_mm_dd, to_yyyy_mm_dd, metric_interval

('2020-03-01', '2020-05-10', '7 day')

In [17]:
# listing_8_0_dataset2.sql
# Builds the training dataset from the metric and observation tables.
# - observation_params holds the metric interval and the observation window
#   bounds, interpolated from the config values read in the previous cell.
# - Each SUM(CASE ...) pivots one metric_name_id into its own column.
# - A metric row is matched to an observation of the same account when
#   metric_time lies in (observation_date - metric_period, observation_date].
# - Rows are grouped by account, metric_time, observation_date and is_churn.

query = f"""\
WITH observation_params AS (
    SELECT  
        interval '{metric_interval}' AS metric_period,
        '{from_yyyy_mm_dd}'::timestamp AS obs_start,
        '{to_yyyy_mm_dd}'::timestamp AS obs_end
)

    SELECT 
        m.account_id, 
        o.observation_date, 
        is_churn,
        SUM(CASE WHEN metric_name_id=0 THEN metric_value ELSE 0 END) AS like_per_month,
        SUM(CASE WHEN metric_name_id=1 THEN metric_value ELSE 0 END) AS newfriend_per_month,
        SUM(CASE WHEN metric_name_id=2 THEN metric_value ELSE 0 END) AS post_per_month,
        SUM(CASE WHEN metric_name_id=3 THEN metric_value ELSE 0 END) AS adview_per_month,
        SUM(CASE WHEN metric_name_id=4 THEN metric_value ELSE 0 END) AS dislike_per_month,
        SUM(CASE WHEN metric_name_id=6 THEN metric_value ELSE 0 END) AS message_per_month,
        SUM(CASE WHEN metric_name_id=7 THEN metric_value ELSE 0 END) AS reply_per_month,
        SUM(CASE WHEN metric_name_id=21 THEN metric_value ELSE 0 END) AS adview_per_post,
        SUM(CASE WHEN metric_name_id=22 THEN metric_value ELSE 0 END) AS reply_per_message,
        SUM(CASE WHEN metric_name_id=23 THEN metric_value ELSE 0 END) AS like_per_post,
        SUM(CASE WHEN metric_name_id=24 THEN metric_value ELSE 0 END) AS post_per_message,
        SUM(CASE WHEN metric_name_id=27 THEN metric_value ELSE 0 END) AS dislike_pcnt,
        SUM(CASE WHEN metric_name_id=28 THEN metric_value ELSE 0 END) AS unfriend_per_newfriend,
        SUM(CASE WHEN metric_name_id=30 THEN metric_value ELSE 0 END) AS newfriend_pcnt_chng,
        SUM(CASE WHEN metric_name_id=31 THEN metric_value ELSE 0 END) AS days_since_newfriend,
        SUM(CASE WHEN metric_name_id=34 THEN metric_value ELSE 0 END) AS unfriend_per_month
      FROM 
        metric AS m 
INNER JOIN 
        observation_params
        ON metric_time BETWEEN obs_start AND obs_end
INNER JOIN observation AS o 
        ON m.account_id = o.account_id
       AND m.metric_time > (o.observation_date - metric_period)::timestamp
       AND m.metric_time <= o.observation_date::timestamp
  GROUP BY 
        m.account_id, metric_time, observation_date, is_churn
  ORDER BY 
        observation_date, m.account_id
"""
res = pd.read_sql_query(query, conn)
res.head()

Unnamed: 0,account_id,observation_date,is_churn,like_per_month,newfriend_per_month,post_per_month,adview_per_month,dislike_per_month,message_per_month,reply_per_month,adview_per_post,reply_per_message,like_per_post,post_per_message,dislike_pcnt,unfriend_per_newfriend,newfriend_pcnt_chng,days_since_newfriend,unfriend_per_month
0,27,2020-03-01,False,48.0,3.0,12.0,6.0,7.0,78.0,36.0,0.5,0.461538,0.0,0.153846,0.0,0.0,0.0,0.0,0.0
1,102,2020-03-01,False,40.0,5.0,50.0,16.0,7.0,2.0,1.0,0.32,0.5,0.0,25.0,0.0,0.0,0.0,0.0,1.018182
2,139,2020-03-01,False,15.0,0.0,17.0,10.0,3.0,47.0,21.0,0.588235,0.446809,0.0,0.361702,0.0,18.0,0.0,0.0,0.0
3,194,2020-03-01,False,19.0,5.0,54.0,139.0,0.0,107.0,36.0,2.574074,0.336449,0.0,0.504673,0.0,0.0,-0.444444,0.0,1.018182
4,232,2020-03-01,False,94.0,7.0,12.0,31.0,12.0,2.0,1.0,2.583333,0.5,0.0,6.0,0.0,0.0,0.4,0.0,0.0


In [18]:
# Keep a copy of the query result and report its (rows, columns) shape.
df = res.copy()
df.shape

(24450, 19)

### Listing 8.3 Current Customer Dataset

In [23]:
# Show the parameters configured for listing 8.3.
pprint(chap8.list3.params)

Box({'type': 'sql', 'mode': 'save', '%metric_interval': '7 day'})


In [24]:
# listing_8_3_dataset2_current
# Current-customer counterpart of the listing 8.0 dataset:
# - metric_date finds the most recent metric_time in the metric table.
# - account_tenures keeps accounts whose metric 8 value at that time is >= 14.
# - The main query pivots the metrics at that time into one row per account
#   that has a subscription active on that date.
# The metric ids mirror listing 8.0 column for column so that the current data
# carries the same features the model was trained on; in particular
# unfriend_per_newfriend is read from metric id 28, exactly as in listing 8.0.

query = """\
WITH metric_date AS (
    SELECT 
        MAX(metric_time) AS last_metric_time 
      FROM metric
), 
account_tenures AS (
    SELECT 
        account_id, 
        metric_value AS account_tenure
      FROM metric AS m 
INNER JOIN metric_date 
        ON metric_time = last_metric_time
    WHERE metric_name_id = 8
      AND metric_value >= 14
)

    SELECT 
        s.account_id, 
        d.last_metric_time AS observation_date,
        SUM(CASE WHEN metric_name_id=0 THEN metric_value ELSE 0 END) AS like_per_month,
        SUM(CASE WHEN metric_name_id=1 THEN metric_value ELSE 0 END) AS newfriend_per_month,
        SUM(CASE WHEN metric_name_id=2 THEN metric_value ELSE 0 END) AS post_per_month,
        SUM(CASE WHEN metric_name_id=3 THEN metric_value ELSE 0 END) AS adview_per_month,
        SUM(CASE WHEN metric_name_id=4 THEN metric_value ELSE 0 END) AS dislike_per_month,
        SUM(CASE WHEN metric_name_id=6 THEN metric_value ELSE 0 END) AS message_per_month,
        SUM(CASE WHEN metric_name_id=7 THEN metric_value ELSE 0 END) AS reply_per_month,
        SUM(CASE WHEN metric_name_id=21 THEN metric_value ELSE 0 END) AS adview_per_post,
        SUM(CASE WHEN metric_name_id=22 THEN metric_value ELSE 0 END) AS reply_per_message,
        SUM(CASE WHEN metric_name_id=23 THEN metric_value ELSE 0 END) AS like_per_post,
        SUM(CASE WHEN metric_name_id=24 THEN metric_value ELSE 0 END) AS post_per_message,
        SUM(CASE WHEN metric_name_id=27 THEN metric_value ELSE 0 END) AS dislike_pcnt,
        SUM(CASE WHEN metric_name_id=28 THEN metric_value ELSE 0 END) AS unfriend_per_newfriend,
        SUM(CASE WHEN metric_name_id=30 THEN metric_value ELSE 0 END) AS newfriend_pcnt_chng,
        SUM(CASE WHEN metric_name_id=31 THEN metric_value ELSE 0 END) AS days_since_newfriend,
        SUM(CASE WHEN metric_name_id=34 THEN metric_value ELSE 0 END) AS unfriend_per_month
      FROM 
        metric AS m 
INNER JOIN metric_date AS d 
        ON m.metric_time = d.last_metric_time
INNER JOIN account_tenures AS a 
        ON a.account_id = m.account_id
INNER JOIN subscription AS s 
        ON m.account_id = s.account_id
     WHERE s.start_date <= d.last_metric_time
       AND (s.end_date >= d.last_metric_time OR s.end_date IS null)
  GROUP BY 
        s.account_id, d.last_metric_time
  ORDER BY 
        s.account_id
"""


## Data Preparation

### Listing 8.1 Prepare Data

In [19]:
import sys
sys.path.append("/app")

from fightchurn.listings.chap5.listing_5_2_dataset_stats import dataset_stats
from fightchurn.listings.chap7.listing_7_5_fat_tail_scores import fat_tail_scores
from fightchurn.listings.chap6.listing_6_4_find_metric_groups import find_metric_groups
from fightchurn.listings.chap6.listing_6_3_apply_metric_groups import apply_metric_groups
from fightchurn.listings.chap6.listing_6_5_ordered_correlation_matrix import ordered_correlation_matrix

In [20]:
# Read the listing 8.1 group_corr_thresh setting from the config and display it.
group_corr_thresh = chap8.list1.params["group_corr_thresh"]

group_corr_thresh

0.65

In [6]:
dataset_stats??

Signature: dataset_stats(data_set_path)
Docstring: <no docstring>
Source:   
def dataset_stats(data_set_path):

    assert os.path.isfile(data_set_path),'"{}" is not a valid dataset path'.format(data_set_path)
    churn_data = pd.read_csv(data_set_path,index_col=[0,1])
    if 'is_churn' in churn_data:
        churn_data['is_churn'

In [7]:
fat_tail_scores??

Signature: fat_tail_scores(data_set_path, skew_thresh=4.0, **kwargs)
Docstring: <no docstring>
Source:   
def fat_tail_scores(data_set_path,skew_thresh=4.0,**kwargs):

    churn_data = pd.read_csv(data_set_path,index_col=[0,1])
    data_scores = churn_data.copy()
    data_scores.drop('is_churn',inplace=

In [8]:
find_metric_groups??

Signature: find_metric_groups(data_set_path, group_corr_thresh=0.5)
Docstring: <no docstring>
Source:   
def find_metric_groups(data_set_path,group_corr_thresh=0.5):

    score_save_path=data_set_path.replace('.csv','_scores.csv')
    assert os.path.isfile(score_save_path),'You must run listing 5.3 or 7.5 to save metric scores first'
    score_data = pd.read_csv(score_save_path,in

In [9]:
apply_metric_groups??

Signature: apply_metric_groups(data_set_path)
Docstring: <no docstring>
Source:   
def apply_metric_groups(data_set_path):

    score_save_path=data_set_path.replace('.csv','_scores.csv')
    assert os.path.isfile(score_save_path),'You must run listing 5.3 or 7.5 to save metric scores first'
    score_data = pd.read_csv(score_save_path,index_col=[0,1])


In [10]:
ordered_correlation_matrix??

Signature: ordered_correlation_matrix(data_set_path)
Docstring: <no docstring>
Source:   
def ordered_correlation_matrix(data_set_path):

    churn_data = pd.read_csv(data_set_path.replace('.csv','_scores.csv'),index_col=[0,1])

    load_mat_df = pd.read_csv(data_set_path.replace('.csv', '_load_mat.csv'), index_col=0)

## Modelling

In [14]:
# Dump the whole chapter 8 configuration for reference.
pprint(chap8)

{'defaults': {'data_set_path': 'socialnet7/socialnet7_dataset2.csv',
              'type': 'py'},
 'list0': {'name': 'dataset2',
           'params': {'%from_yyyy-mm-dd': '2020-03-01',
                      '%metric_interval': '7 day',
                      '%to_yyyy-mm-dd': '2020-05-10',
                      'mode': 'save',
                      'type': 'sql'}},
 'list1': {'name': 'prepare_data',
           'params': Box({'group_corr_thresh': 0.65}),
           'v2': Box({'data_set_path': 'socialnet7/socialnet7_dataset.csv'}),
           'v3': {'data_set_path': 'socialnet7/socialnet7_dataset3_nocat.csv'}},
 'list2': {'name': 'logistic_regression',
           'params': Box({}),
           'v1': Box({'as_retention': False})},
 'list3': {'name': 'dataset2_current',
           'params': {'%metric_interval': '7 day',
                      'mode': 'save',
                      'type': 'sql'}},
 'list4': Box({'name': 'rescore_metrics', 'params': {}}),
 'list5': {'name': 'churn_forecast',
  

### Listing 8.2 Logistic Regression

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from math import exp
import pickle

def logistic_regression(data_set_path,as_retention=True):
    """Fit an L1-penalised logistic regression and save its summary, model and predictions.

    Args:
        data_set_path: Path of the base dataset CSV; every input and output
            file name is derived from it by the helper functions.
        as_retention: Forwarded to prepare_data. When False, the output file
            names get a '_churn' suffix.
    """
    features, target = prepare_data(data_set_path, as_retention=as_retention)

    model = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True)
    model.fit(features, target)

    if as_retention:
        suffix = ''
    else:
        suffix = '_churn'

    save_regression_summary(data_set_path, model, suffix)
    save_regression_model(data_set_path, model, suffix)
    save_dataset_predictions(data_set_path, model, features, suffix)

def prepare_data(data_set_path,ext='_groupscore',as_retention=True):
    """Load the grouped-score dataset and split it into features and target.

    Args:
        data_set_path: Path of the base dataset CSV; the file actually read is
            this path with `ext` inserted before '.csv'.
        ext: Suffix of the score file to load.
        as_retention: When True the target is 1 - is_churn, otherwise it is
            is_churn itself (as integers).

    Returns:
        (X, y): X is every column except 'is_churn'; y is the integer target.
        Both are indexed by the first two CSV columns.

    Raises:
        AssertionError: If the score file does not exist.
    """
    score_save_path = data_set_path.replace('.csv', f'{ext}.csv')
    assert os.path.isfile(score_save_path), 'You must run listing 6.3 to save grouped metric scores first'

    grouped_data = pd.read_csv(score_save_path, index_col=[0, 1])
    churn_flag = grouped_data['is_churn'].astype(int)
    y = np.subtract(1, churn_flag) if as_retention else churn_flag
    X = grouped_data.drop(columns=['is_churn'])
    return X, y

def calculate_impacts(retain_reg):
    """Compute the baseline probability and the impact of each coefficient.

    Args:
        retain_reg: Fitted model exposing `intercept_` and `coef_` (row 0 of
            `coef_` is used), as a binary scikit-learn LogisticRegression does.

    Returns:
        (one_stdev_impact, average_retain): `average_retain` is a float,
        s_curve(-intercept). `one_stdev_impact` is a 1-D array with, for each
        coefficient c, s_curve(-intercept - c) - average_retain.
    """
    # intercept_ is an array, and s_curve relies on math.exp, which needs a
    # scalar. Pull the single value out explicitly instead of relying on the
    # implicit array-to-scalar conversion that NumPy has deprecated.
    intercept = float(np.ravel(retain_reg.intercept_)[0])
    average_retain = s_curve(-intercept)
    one_stdev_retain = np.array([s_curve(-intercept - float(c)) for c in retain_reg.coef_[0]])
    one_stdev_impact = one_stdev_retain - average_retain
    return one_stdev_impact, average_retain

def s_curve(x):
    """Return 1 - 1/(1 + e^(-x)) for a scalar x."""
    denominator = 1.0 + exp(-x)
    return 1.0 - (1.0 / denominator)

def save_regression_summary(data_set_path,retain_reg,ext=''):
    """Write a CSV with one row per metric group plus a final 'offset' row.

    The group names and their metric lists come from the '_groupmets.csv'
    file next to the dataset. Each row holds the model weight and the impact
    value from calculate_impacts; the last row holds the intercept and the
    baseline value.

    Args:
        data_set_path: Path of the base dataset CSV used to derive file names.
        retain_reg: Fitted model exposing `coef_` and `intercept_`.
        ext: Suffix appended to the output file name.
    """
    impacts, baseline = calculate_impacts(retain_reg)

    groupmets_path = data_set_path.replace('.csv', '_groupmets.csv')
    group_lists = pd.read_csv(groupmets_path, index_col=0)

    summary_columns = {
        'group_metric_offset': np.append(group_lists.index, 'offset'),
        'weight': np.append(retain_reg.coef_[0], retain_reg.intercept_),
        'retain_impact': np.append(impacts, baseline),
        'group_metrics': np.append(group_lists['metrics'], '(baseline)'),
    }
    coef_df = pd.DataFrame(summary_columns)

    save_path = data_set_path.replace('.csv', '_logreg_summary{}.csv'.format(ext))
    coef_df.to_csv(save_path, index=False)
    print('Saved coefficients to ' + save_path)

def save_regression_model(data_set_path,retain_reg,ext=''):
    """Pickle the model next to the dataset as '<dataset>_logreg_model<ext>.pkl'.

    Args:
        data_set_path: Path of the base dataset CSV used to derive the file name.
        retain_reg: Object to pickle.
        ext: Suffix appended to the output file name.
    """
    model_file_name = '_logreg_model{}.pkl'.format(ext)
    pickle_path = data_set_path.replace('.csv', model_file_name)
    with open(pickle_path, 'wb') as model_file:
        pickle.dump(retain_reg, model_file)
    print('Saved model pickle to ' + pickle_path)

def save_dataset_predictions(data_set_path, retain_reg, X,ext=''):
    """Save the model's predicted probabilities for X as a CSV.

    The two columns returned by predict_proba are labelled 'churn_prob' and
    'retain_prob', in that order, and keep the index of X.

    Args:
        data_set_path: Path of the base dataset CSV used to derive the file name.
        retain_reg: Fitted model exposing predict_proba.
        X: Feature frame to predict on.
        ext: Suffix appended to the output file name.
    """
    # NOTE(review): these labels assume the first predict_proba column is the
    # churn class. That presumably holds only when the model was fitted on
    # retention labels; confirm for the as_retention=False ('_churn') case.
    column_labels = ['churn_prob', 'retain_prob']
    predict_df = pd.DataFrame(retain_reg.predict_proba(X), index=X.index, columns=column_labels)

    predict_path = data_set_path.replace('.csv', '_predictions{}.csv'.format(ext))
    predict_df.to_csv(predict_path, header=True)
    print('Saved dataset predictions to ' + predict_path)


### Listing 8.4 Rescore Metrics

In [None]:
import pandas as pd
import numpy as np
import os
from fightchurn.listings.chap7.listing_7_5_fat_tail_scores import transform_fattail_columns, transform_skew_columns

def rescore_metrics(data_set_path):
    """Re-score the current customer data with previously saved parameters.

    Loads the loading matrix, the score parameters and the current customer
    data saved next to `data_set_path`, checks that they describe the same set
    of metrics, applies the skew and fat-tail transforms to the flagged
    columns, then scales, groups and saves the result via the helper functions.

    Args:
        data_set_path: Path of the base dataset CSV used to derive file names.

    Raises:
        AssertionError: If an input file is missing, or if the metric names in
            the score parameters or loading matrix differ from the columns of
            the current data.
    """
    load_mat_df = reload_churn_data(data_set_path,'load_mat','6.4',is_customer_data=False)
    score_df = reload_churn_data(data_set_path,'score_params','7.5',is_customer_data=False)
    current_data = reload_churn_data(data_set_path,'current','8.3',is_customer_data=True)
    assert set(score_df.index.values)==set(current_data.columns.values),"Data to re-score does not match transform params"
    assert set(load_mat_df.index.values)==set(current_data.columns.values),"Data to re-score does not match load matrix"

    # The transform helpers' return values are not used; they presumably
    # modify current_data in place (confirm against listing 7.5).
    transform_skew_columns(current_data,score_df[score_df['skew_score']].index.values)
    transform_fattail_columns(current_data,score_df[score_df['fattail_score']].index.values)
    scaled_data = score_current_data(current_data,score_df,data_set_path)
    grouped_data = group_current_data(scaled_data, load_mat_df,data_set_path)
    save_segment_data(grouped_data,current_data,load_mat_df,data_set_path)

def score_current_data(current_data,score_df, data_set_path):
    """Standardise the current data with saved means and standard deviations.

    Columns are taken in the order of `score_df`'s index; each one has its
    'mean' subtracted and is divided by its 'std'. The result is written to
    '<dataset>_current_scores.csv' and returned.

    Args:
        current_data: Frame with one column per metric.
        score_df: Frame indexed by metric name with 'mean' and 'std' columns.
        data_set_path: Path of the base dataset CSV used to derive the file name.

    Returns:
        The scaled frame.
    """
    ordered_data = current_data[score_df.index.values]
    centered = ordered_data.sub(score_df['mean'], axis='columns')
    scaled_data = centered.div(score_df['std'], axis='columns')

    score_save_path = data_set_path.replace('.csv', '_current_scores.csv')
    scaled_data.to_csv(score_save_path, header=True)
    print('Saving score results to %s' % score_save_path)
    return scaled_data

def group_current_data(scaled_data,load_mat_df,data_set_path):
    """Combine scaled metrics into group scores using the loading matrix.

    The scaled columns are ordered like the rows of `load_mat_df` and
    matrix-multiplied by it. The result keeps the index of `scaled_data`, is
    written to '<dataset>_current_groupscore.csv' and returned.

    Args:
        scaled_data: Frame with one column per metric.
        load_mat_df: Loading matrix; rows are metrics, columns are groups.
        data_set_path: Path of the base dataset CSV used to derive the file name.

    Returns:
        Frame with one column per loading-matrix column.
    """
    aligned = scaled_data[load_mat_df.index.values]
    group_scores = aligned.to_numpy() @ load_mat_df.to_numpy()
    current_data_grouped = pd.DataFrame(
        group_scores,
        columns=load_mat_df.columns.values,
        index=aligned.index,
    )

    score_save_path = data_set_path.replace('.csv', '_current_groupscore.csv')
    current_data_grouped.to_csv(score_save_path, header=True)
    print('Saving grouped results to %s' % score_save_path)
    return current_data_grouped

def save_segment_data(current_data_grouped, current_data, load_mat_df, data_set_path):
    """Save a segmenting dataset mixing group scores with individual metrics.

    Loading-matrix columns with more than one non-zero entry are taken from
    `current_data_grouped`; columns with exactly one non-zero entry are taken
    from `current_data` instead. The joined frame is written to
    '<dataset>_current_groupmets_segment.csv'.

    Args:
        current_data_grouped: Frame of group scores.
        current_data: Frame of individual metric values.
        load_mat_df: Loading matrix; rows are metrics, columns are groups.
        data_set_path: Path of the base dataset CSV used to derive the file name.
    """
    nonzero_per_column = load_mat_df.astype(bool).sum(axis=0)
    group_cols = load_mat_df.columns[nonzero_per_column > 1]
    no_group_cols = load_mat_df.columns[nonzero_per_column == 1]

    grouped_part = current_data_grouped[group_cols]
    single_part = current_data[no_group_cols]
    segment_df = grouped_part.join(single_part)

    segment_path = data_set_path.replace('.csv', '_current_groupmets_segment.csv')
    segment_df.to_csv(segment_path, header=True)

def reload_churn_data(data_set_path,suffix,listing,is_customer_data):
    """Load '<dataset>_<suffix>.csv' saved by an earlier listing.

    Args:
        data_set_path: Path of the base dataset CSV used to derive the file name.
        suffix: File-name suffix identifying which saved file to load.
        listing: Listing number quoted in the error message when the file is missing.
        is_customer_data: When True the first two CSV columns form the index,
            otherwise only the first column does.

    Returns:
        The loaded DataFrame.

    Raises:
        AssertionError: If the file does not exist.
    """
    data_path = data_set_path.replace('.csv', '_{}.csv'.format(suffix))
    assert os.path.isfile(data_path),'You must run listing {} to save {} first'.format(listing,suffix)
    if is_customer_data:
        index_columns = [0, 1]
    else:
        index_columns = 0
    return pd.read_csv(data_path, index_col=index_columns)

### Listing 8.5 Churn Prediction

In [None]:
import pandas as pd
import os
import pickle
import matplotlib.pyplot as plt
from fightchurn.listings.chap8.listing_8_4_rescore_metrics import reload_churn_data

def churn_forecast(data_set_path,model_name='logreg_model'):
    """Predict churn probabilities for the current customers with a saved model.

    Loads the pickled model '<dataset>_<model_name>.pkl' and the current group
    scores, saves the predicted probabilities to
    '<dataset>_current_predictions.csv', then plots their histogram.

    Args:
        data_set_path: Path of the base dataset CSV used to derive file names.
        model_name: Suffix of the pickled model file to load.

    Raises:
        AssertionError: If the model file does not exist.
    """
    model_file = data_set_path.replace('.csv', '_{}.pkl'.format(model_name))
    assert os.path.isfile(model_file), 'You must run listing 8.2 to save a logistic regression model first'
    # NOTE(review): unpickling can execute arbitrary code; only load model
    # files produced by your own run of listing 8.2.
    with open(model_file, 'rb') as model_handle:
        fitted_model = pickle.load(model_handle)

    group_scores = reload_churn_data(data_set_path,'current_groupscore','8.4',is_customer_data=True)
    probabilities = fitted_model.predict_proba(group_scores.to_numpy())
    predict_df = pd.DataFrame(
        probabilities,
        index=group_scores.index,
        columns=['churn_prob', 'retain_prob'],
    )

    forecast_save_path = data_set_path.replace('.csv', '_current_predictions.csv')
    print('Saving results to %s' % forecast_save_path)
    predict_df.to_csv(forecast_save_path, header=True)

    forecast_histogram(data_set_path, predict_df)

def forecast_histogram(data_set_path,predict_df,ext='reg'):
    """Plot and save a 20-bin histogram of the predicted churn probabilities.

    The figure goes to '<dataset>_<ext>_churnhist.png'. The bin counts and
    each bin's upper edge go to '<dataset>_current_churnhist.csv'.

    Args:
        data_set_path: Path of the base dataset CSV used to derive file names.
        predict_df: Frame with a 'churn_prob' column.
        ext: Label used in the plot title and the PNG file name.
    """
    figure_path = data_set_path.replace('.csv', '_{}_churnhist.png'.format(ext))
    table_path = data_set_path.replace('.csv', '_current_churnhist.csv')

    plt.figure(figsize=[6,4])
    counts, edges, _patches = plt.hist(predict_df['churn_prob'].values, bins=20)
    plt.xlabel('Churn Probability')
    plt.ylabel('# of Accounts')
    plt.title('Histogram of Active Customer Churn Probability ({})'.format(ext))
    plt.grid()
    plt.savefig(figure_path, format='png')
    plt.close()

    # edges has one more entry than counts; drop the first so that each count
    # is paired with the upper edge of its bin.
    hist_df = pd.DataFrame({'n': counts, 'bins': edges[1:]})
    hist_df.to_csv(table_path)


### Listing 8.6 Rescore Metrics (with outlier clipping)

In [None]:
import pandas as pd
import numpy as np
from fightchurn.listings.chap7.listing_7_5_fat_tail_scores import transform_fattail_columns, transform_skew_columns
from fightchurn.listings.chap8.listing_8_4_rescore_metrics import reload_churn_data

def clip_hi_cols(data, hi_vals):
    """Cap each listed column of `data` at its upper limit, in place.

    Args:
        data: Frame to modify.
        hi_vals: Series mapping column name to the maximum allowed value.
    """
    for col in hi_vals.index.values:
        ceiling = hi_vals[col]
        too_high = data[col] > ceiling
        data.loc[too_high, col] = ceiling

def clip_lo_cols(data, lo_vals):
    """Raise each listed column of `data` to its lower limit, in place.

    Args:
        data: Frame to modify.
        lo_vals: Series mapping column name to the minimum allowed value.
    """
    for col in lo_vals.index.values:
        floor = lo_vals[col]
        too_low = data[col] < floor
        data.loc[too_low, col] = floor

def rescore_metrics(data_set_path):
    """Re-score the current customer data, clipping extremes before transforming.

    Loads the current data, loading matrix, score parameters and summary
    statistics saved next to `data_set_path`, checks that they all describe the
    same set of metrics, clips each metric to its 1pct/99pct values, applies
    the skew and fat-tail transforms to the flagged columns, standardises with
    the saved mean/std, multiplies by the loading matrix and writes the group
    scores to '<dataset>_current_groupscore.csv'.

    Args:
        data_set_path: Path of the base dataset CSV used to derive file names.

    Raises:
        AssertionError: If an input file is missing, or if the metric names in
            any parameter file differ from the columns of the current data.
    """
    current_data = reload_churn_data(data_set_path,'current','8.3',is_customer_data=True)
    load_mat_df = reload_churn_data(data_set_path,'load_mat','6.4',is_customer_data=False)
    score_df = reload_churn_data(data_set_path,'score_params','7.5',is_customer_data=False)
    stats = reload_churn_data(data_set_path,'summarystats','5.2',is_customer_data=False)
    # The summary stats include a row for the churn flag, which is not a metric.
    stats.drop('is_churn',inplace=True)
    assert set(score_df.index.values)==set(current_data.columns.values),"Data to re-score does not match transform params"
    assert set(load_mat_df.index.values)==set(current_data.columns.values),"Data to re-score does not match load matrix"
    assert set(stats.index.values)==set(current_data.columns.values),"Data to re-score does not match summary stats"

    clip_hi_cols(current_data, stats['99pct'])
    clip_lo_cols(current_data, stats['1pct'])

    # Each transform is applied to the columns flagged for it: skewed columns
    # get the skew transform, fat-tailed columns get the fat-tail transform.
    # The helpers' return values are not used; they presumably modify
    # current_data in place (confirm against listing 7.5).
    transform_skew_columns(current_data, score_df[score_df['skew_score']].index.values)
    transform_fattail_columns(current_data, score_df[score_df['fattail_score']].index.values)

    current_data=current_data[score_df.index.values]
    scaled_data=(current_data-score_df['mean'])/score_df['std']

    # Order the columns like the loading-matrix rows before multiplying.
    scaled_data = scaled_data[load_mat_df.index.values]
    grouped_ndarray = np.matmul(scaled_data.to_numpy(), load_mat_df.to_numpy())

    current_data_grouped = pd.DataFrame(grouped_ndarray,columns=load_mat_df.columns.values, index=current_data.index)

    score_save_path=data_set_path.replace('.csv','_current_groupscore.csv')
    current_data_grouped.to_csv(score_save_path,header=True)
    print('Saving results to %s' % score_save_path)


