In [3]:
import boto3
import io
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import awswrangler as wr

import warnings
warnings.filterwarnings("ignore")

plt.style.use('seaborn-colorblind')

### set-up

In [4]:
session = boto3.Session()
s3 = session.client('s3')

In [5]:
bucket_name = "cdo-ililapse-364524684987-bucket"
file_path = "x266754/lapse/"

### data intake

In [6]:
file_name = "x266754/lapse/lapse_curated_jan_jun.parquet"

In [7]:
%%time
obj = s3.get_object(Bucket = bucket_name, Key = file_name)
df = pd.read_parquet(io.BytesIO(obj['Body'].read())) 

CPU times: user 46.8 s, sys: 16 s, total: 1min 2s
Wall time: 50.6 s


In [8]:
df =df.sort_values(['policy_id', 'pfmc_cur_month'])

In [9]:
df[df['policy_id']=='V2680808'][['policy_id', 'month', 'lapse_ind', 'surrender_ind', 'reinstate_ind']]

Unnamed: 0,policy_id,month,lapse_ind,surrender_ind,reinstate_ind
2472748,V2680808,1,,,
1455046,V2680808,2,,,
4290871,V2680808,3,1.0,0.0,0.0
728509,V2680808,4,,,
3383649,V2680808,5,,,


### Create 3month ahead status

In [11]:
df['lapse_ind']= df['lapse_ind'].fillna(0)
df['reinstate_ind']= df['reinstate_ind'].fillna(0)
df['surrender_ind']= df['surrender_ind'].fillna(0)

In [13]:
df['cum_lapse_ind'] = df.groupby(['policy_id'])['lapse_ind'].cumsum()
df['cum_reinstate_ind'] = df.groupby(['policy_id'])['reinstate_ind'].cumsum()
df['cum_surrender_ind'] = df.groupby(['policy_id'])['surrender_ind'].cumsum()

In [14]:
df['3mo_ahead_status']= np.where(df['lapse_ind']==1, 'lapse', 
                           (np.where(df['surrender_ind']==1, 'surrender', 
                             (np.where(df['reinstate_ind']==1, 'reinstated',  "current")))))

### Surrender: terminal event, remove rows after surrender

In [16]:
# before
df[df['policy_id']=='V9063643'][['policy_id', 'month', 
                                 'lapse_ind', 
                                 'surrender_ind', 
                                 'reinstate_ind', 
                                 '3mo_ahead_status']]

Unnamed: 0,policy_id,month,lapse_ind,surrender_ind,reinstate_ind,3mo_ahead_status
2863346,V9063643,1,0.0,0.0,0.0,current
1420642,V9063643,2,0.0,0.0,0.0,current
4346879,V9063643,3,0.0,0.0,0.0,current
474344,V9063643,4,0.0,1.0,0.0,surrender
3640830,V9063643,5,0.0,0.0,0.0,current
2194608,V9063643,6,0.0,0.0,0.0,current


In [17]:
# remove rows after first surrender occurence 
df['cum_surrender_ind2'] = df.groupby(['policy_id'])['cum_surrender_ind'].cumsum()
df= df[df['cum_surrender_ind2']<2]
del df['cum_surrender_ind2']

In [24]:
# after
df[df['policy_id']=='V9063643'][['policy_id', 'month', 
                                 'lapse_ind', 
                                 'surrender_ind', 
                                 'reinstate_ind', 
                                 '3mo_ahead_status']]

Unnamed: 0,policy_id,month,lapse_ind,surrender_ind,reinstate_ind,3mo_ahead_status
2863346,V9063643,1,0.0,0.0,0.0,current
1420642,V9063643,2,0.0,0.0,0.0,current
4346879,V9063643,3,0.0,0.0,0.0,current
474344,V9063643,4,0.0,1.0,0.0,surrender


### Lapse: forward fill until reinstated

In [19]:
# before
df[df['policy_id']=='V9051243'][['policy_id', 'month', 
                                 'lapse_ind', 
                                 'surrender_ind', 
                                 'reinstate_ind', 
                                 '3mo_ahead_status']]

Unnamed: 0,policy_id,month,lapse_ind,surrender_ind,reinstate_ind,3mo_ahead_status
2732403,V9051243,1,0.0,0.0,0.0,current
1266135,V9051243,2,0.0,0.0,0.0,current
4544942,V9051243,3,1.0,0.0,0.0,lapse
576127,V9051243,4,0.0,0.0,0.0,current
3616291,V9051243,5,0.0,0.0,1.0,reinstated


In [20]:
df['cum_lapse_reinstate'] = df['cum_lapse_ind'] + df['cum_reinstate_ind']

In [21]:
df['3mo_ahead_status'] = np.where((df['cum_lapse_reinstate']==1), "lapse", df['3mo_ahead_status'])

In [23]:
df.drop(['cum_lapse_ind', 'cum_reinstate_ind', 'cum_surrender_ind'], axis=1, inplace=True)

Unnamed: 0,year,month,agmt_pkge_id,vul,ul,term,aos_status_code,val_inforce_cls,kind_code,sub_kind,...,num_sr_catg,max_sr_time,SR_TYPE_CATG,SR_CREATE_DT,SR_CLOSE_DT,closed,time_to_close,clarify_cur_month,3mo_ahead_status,cum_lapse_reinstate
3295130,2022,1,344785920000,YES,,,B,PP,01479,B,...,,,,NaT,NaT,,,,current,0.0
1035951,2022,2,344785920000,YES,,,B,PP,01479,B,...,4.0,92.0,Disbursements,2022-02-03,2022-05-06,1.0,92.0,2022-02,current,0.0
4809172,2022,3,344785920000,YES,,,B,PP,01479,B,...,1.0,1.0,SettlementOptions,2022-03-03,2022-03-04,1.0,1.0,2022-03,current,0.0
106950,2022,4,344785920000,YES,,,B,PP,01479,B,...,,,,NaT,NaT,,,,current,0.0
4065087,2022,5,344785920000,YES,,,B,PP,01479,B,...,,,,NaT,NaT,,,,current,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472158,2022,2,V90645440000,YES,,,A,PP,CVL07,B,...,,,,NaT,NaT,,,,current,0.0
4321131,2022,3,V90645440000,YES,,,A,PP,CVL07,B,...,,,,NaT,NaT,,,,current,0.0
783080,2022,4,V90645440000,YES,,,A,PP,CVL07,B,...,,,,NaT,NaT,,,,current,0.0
3409598,2022,5,V90645440000,YES,,,A,PP,CVL07,B,...,,,,NaT,NaT,,,,current,0.0


In [22]:
# after
df[df['policy_id']=='V9051243'][['policy_id', 'month', 
                                 'lapse_ind', 
                                 'surrender_ind', 
                                 'reinstate_ind', 
                                 '3mo_ahead_status']]

Unnamed: 0,policy_id,month,lapse_ind,surrender_ind,reinstate_ind,3mo_ahead_status
2732403,V9051243,1,0.0,0.0,0.0,current
1266135,V9051243,2,0.0,0.0,0.0,current
4544942,V9051243,3,1.0,0.0,0.0,lapse
576127,V9051243,4,0.0,0.0,0.0,lapse
3616291,V9051243,5,0.0,0.0,1.0,reinstated


### Create current month status

In [25]:
df['current_status'] = df.groupby('agmt_pkge_id')['3mo_ahead_status'].shift(3)
df['current_status']=df['current_status'].fillna('current')

In [28]:
# after
df[df['policy_id']=='V9063643'][['policy_id', 'month', 
                                 'lapse_ind', 
                                 'surrender_ind', 
                                 'reinstate_ind', 
                                 'current_status',
                                 '3mo_ahead_status']]

Unnamed: 0,policy_id,month,lapse_ind,surrender_ind,reinstate_ind,current_status,3mo_ahead_status
2863346,V9063643,1,0.0,0.0,0.0,current,current
1420642,V9063643,2,0.0,0.0,0.0,current,current
4346879,V9063643,3,0.0,0.0,0.0,current,current
474344,V9063643,4,0.0,1.0,0.0,current,surrender


### sample transition matrix

In [38]:
sample = df[df['pfmc_cur_month']=='2022-03']
tmatrix1 = pd.crosstab(index=sample['current_status'], columns=[sample['3mo_ahead_status']])
display(tmatrix1)

tmatrix2 = pd.crosstab(index=sample['current_status'], columns=[sample['3mo_ahead_status']], normalize='index').round(4)*100
display(tmatrix2)

3mo_ahead_status,current,lapse,reinstated,surrender
current_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
current,824524,688,9,1395
lapse,6,3,0,0


3mo_ahead_status,current,lapse,reinstated,surrender
current_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
current,99.75,0.08,0.0,0.17
lapse,66.67,33.33,0.0,0.0


### export

In [41]:
#!pip install awswrangler --q

In [43]:
%%time
import awswrangler as wr

wr.s3.to_parquet(
    df=df,
    path='s3://cdo-ililapse-364524684987-bucket/x266754/lapse/curated_test.parquet'
)

CPU times: user 1min 38s, sys: 11.9 s, total: 1min 49s
Wall time: 1min 38s


{'paths': ['s3://cdo-ililapse-364524684987-bucket/x266754/lapse/curated_test.parquet'],
 'partitions_values': {}}