# 08 Forecasting Churn

In [3]:
import json
import sqlalchemy
import numpy as np
import numpy.typing as npt
import pandas as pd
import matplotlib.pyplot as plt

from pprint import pprint
from typing import Tuple, List, Dict

In [4]:
import os

# Open a SQLAlchemy connection to the churn database.
# Credentials should not live in the notebook: set the CHURN_DB_URL environment
# variable to supply the connection URL. The literal below is only the fallback
# used when that variable is not set.
conn_string = os.environ.get(
    "CHURN_DB_URL",
    "postgresql://postgres-db/churn?user=postgres&password=password",
)
engine = sqlalchemy.create_engine(
    conn_string, connect_args={'options': '-csearch_path={}'.format("socialnet7,public")}
)
# The connection is left open on purpose: later cells pass `conn` to pandas.
conn = engine.connect()

# Sanity check the connection by listing the tables visible to this session.
tables = pd.read_sql_query("SELECT * FROM information_schema.tables;", conn)
tables.head(3)

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,churn,socialnet7,active_period,BASE TABLE,,,,,,YES,NO,
1,churn,socialnet7,event_type,BASE TABLE,,,,,,YES,NO,
2,churn,socialnet7,metric_name,BASE TABLE,,,,,,YES,NO,


In [5]:
import json 

# Read the listings configuration file and keep the chapter 8 section handy.
with open("/app/fightchurn/listings/conf/socialnet7_listings.json", "r") as f:
    conf = json.load(f)

chap8 = conf["chap8"]

In [6]:
import sys

# Make modules under /app importable from this notebook. The membership check
# keeps the cell idempotent: re-running it does not add duplicate entries.
if "/app" not in sys.path:
    sys.path.append("/app")

## Dependent Data

In [4]:
# Show the tail of the metric_name table, ordered by metric_name_id.
tmp = pd.read_sql_query(
    sql="SELECT * FROM metric_name ORDER BY metric_name_id",
    con=conn,
)
tmp.tail()

Unnamed: 0,metric_name_id,metric_name
16,28,unfriend_per_newfriend_scaled
17,30,new_friends_pcnt_change
18,31,days_since_newfriend
19,33,unfriend_28day_avg_84day_obs
20,34,unfriend_28day_avg_84day_obs_scaled


In [5]:
# Pull up to 10,000 randomly ordered rows from the metric table and preview them.
# No seed is set here, so the rows shown will vary between runs.
tmp = pd.read_sql_query(
    sql="SELECT * FROM metric ORDER BY RANDOM() LIMIT 10000",
    con=conn,
)
tmp.head()

Unnamed: 0,account_id,metric_time,metric_name_id,metric_value
0,3698,2020-03-08,1,2.0
1,5203,2020-04-26,7,6.0
2,12014,2020-05-10,22,0.25
3,5338,2020-03-22,26,117.0
4,260,2020-04-12,7,2.0


## Model Validations

### Listing 9.1 AUC

In [None]:
import os
import pickle
from sklearn.metrics import roc_auc_score
from fightchurn.listings.chap8.listing_8_2_logistic_regression import prepare_data

def reload_regression(data_set_path):
    """Load the pickled logistic regression model that belongs to a data set.

    The pickle location is derived from ``data_set_path`` by replacing
    ``'.csv'`` with ``'_logreg_model.pkl'``.

    Args:
        data_set_path: Path of the data set the model was saved for.

    Returns:
        The object stored in the pickle file.

    Raises:
        AssertionError: If the derived pickle file does not exist.
    """
    model_file = data_set_path.replace('.csv', '_logreg_model.pkl')
    assert os.path.isfile(model_file), 'You must run listing 8.2 to save a logistic regression model first'
    # NOTE(review): unpickling runs arbitrary code from the file, so only
    # point this at model files you produced yourself.
    with open(model_file, 'rb') as pickled:
        return pickle.load(pickled)

def regression_auc(data_set_path):
    """Print the AUC of the saved logistic regression model on a data set.

    Reloads the model with ``reload_regression``, builds the inputs with
    ``prepare_data`` and scores the second ``predict_proba`` column against
    the outcome with ``roc_auc_score``.

    Args:
        data_set_path: Path of the data set, passed unchanged to
            ``reload_regression`` and ``prepare_data``.
    """
    model = reload_regression(data_set_path)
    features, outcome = prepare_data(data_set_path)
    class_probabilities = model.predict_proba(features)
    second_column = class_probabilities[:, 1]
    auc = roc_auc_score(outcome, second_column)
    print('Regression AUC score={:.3f}'.format(auc))


### Listing 9.2 Top Decile Lift

In [None]:
from fightchurn.listings.chap8.listing_8_2_logistic_regression import prepare_data
from fightchurn.listings.chap9.listing_9_1_regression_auc  import reload_regression
import numpy

def calc_lift(y_true, y_pred):
    """Compute the top decile lift of a set of predictions.

    Observations are ranked by ``y_pred`` (ties are ordered by their label,
    because the pairs are sorted as tuples). The lift is the rate of positive
    labels among the highest-ranked tenth divided by the overall rate.

    Args:
        y_true: Sequence of outcome labels (0/1 or booleans).
        y_pred: Sequence of predicted scores, aligned with ``y_true``.

    Returns:
        The lift as a float. ``1.0`` (no lift) is returned when there are
        fewer than 10 distinct predictions, and also when ``y_true`` has no
        positive labels at all, where the ratio would otherwise divide by zero.
    """
    # Too few distinct scores to rank a meaningful top decile.
    if numpy.unique(y_pred).size < 10:
        return 1.0

    n_obs = len(y_true)
    overall_churn = sum(y_true) / n_obs
    # No positives anywhere: the lift ratio is undefined, report "no lift"
    # instead of raising ZeroDivisionError or returning nan/inf.
    if overall_churn == 0:
        return 1.0

    ranked = sorted(zip(y_pred, y_true))
    i90 = int(round(n_obs * 0.9))
    top_decile_count = sum(label for _, label in ranked[i90:])
    top_decile_churn = top_decile_count / (n_obs - i90)
    return top_decile_churn / overall_churn

def top_decile_lift(data_set_path):
    """Print the top decile lift of the saved logistic regression model.

    Reloads the model with ``reload_regression``, prepares the data with
    ``as_retention=False`` and passes the first ``predict_proba`` column to
    ``calc_lift`` together with the outcome.

    Args:
        data_set_path: Path of the data set, passed unchanged to
            ``reload_regression`` and ``prepare_data``.
    """
    model = reload_regression(data_set_path)
    features, outcome = prepare_data(data_set_path, as_retention=False)
    first_column = model.predict_proba(features)[:, 0]
    lift_value = calc_lift(outcome, first_column)
    print('Regression Lift score={:.3f}'.format(lift_value))


### Listing 9.3 Backtesting

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression

from fightchurn.listings.chap8.listing_8_2_logistic_regression import prepare_data
from fightchurn.listings.chap9.listing_9_2_top_decile_lift import calc_lift


def backtest(data_set_path,n_test_split):
    """Cross-validate a logistic regression over time-ordered splits and save the scores.

    Fits an L1-penalised ``LogisticRegression`` (``C=1``) inside a
    ``GridSearchCV`` that uses ``TimeSeriesSplit`` folds, scores each fold
    with both the lift from ``calc_lift`` and ROC AUC, and writes
    ``cv_results_`` to a CSV file next to the data set.

    Args:
        data_set_path: Path of the data set; passed to ``prepare_data`` and
            used to derive the output path (``'.csv'`` is replaced with
            ``'_backtest.csv'``).
        n_test_split: Number of splits handed to ``TimeSeriesSplit``.
    """
    import inspect

    X,y = prepare_data(data_set_path,as_retention=False)

    tscv = TimeSeriesSplit(n_splits=n_test_split)

    # scikit-learn 1.4 added `response_method` and deprecated `needs_proba`,
    # which was removed in 1.6. On releases without `needs_proba` the keyword
    # would be forwarded to calc_lift and every lift score would fail, so pick
    # whichever spelling the installed version supports.
    if 'response_method' in inspect.signature(make_scorer).parameters:
        lift_scorer = make_scorer(calc_lift, response_method='predict_proba')
    else:
        lift_scorer = make_scorer(calc_lift, needs_proba=True)
    score_models = {'lift': lift_scorer, 'AUC': 'roc_auc'}

    retain_reg = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True)

    # A single-value grid: the search is used only to run the cross-validation.
    gsearch = GridSearchCV(estimator=retain_reg,scoring=score_models, cv=tscv, verbose=1,
                           return_train_score=False,  param_grid={'C' : [1]}, refit='AUC')

    gsearch.fit(X,y)
    result_df = pd.DataFrame(gsearch.cv_results_)

    save_path = data_set_path.replace('.csv', '_backtest.csv')
    if save_path == data_set_path:
        # No '.csv' in the path: append the suffix rather than overwrite the
        # input data set with the results.
        save_path = data_set_path + '_backtest.csv'
    result_df.to_csv(save_path, index=False)
    print('Saved test scores to ' + save_path)


ModuleNotFoundError: No module named 'fightchurn'