In [2]:
!pip install awswrangler --q

Keyring is skipped due to an exception: 'keyring.backends'
[0m

In [3]:
import sagemaker
import io
import os

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput

import pandas as pd
import numpy as np
import boto3
import warnings
warnings.filterwarnings("ignore")

### set-up

In [4]:
# Build a default boto3 session, then derive the S3 client that the listing
# and download cells call.
session = boto3.Session()
s3 = session.client(service_name='s3')

### bucket location

In [5]:
# Bucket and key prefix under which the monthly report CSVs are stored.
bucket_name = "cdo-ililapse-364524684987-bucket"
file_path = "nfrm_data/clarify/2022/"
#s3_path = "s3://"+bucket_name+file_path

# List the objects whose key starts with the SR-notes report prefix.
# NOTE(review): this is a single, unpaginated listing call — confirm the
# prefix can never match more objects than one response carries.
response = s3.list_objects(
    Bucket=bucket_name,
    Prefix=f"{file_path}clarify_report_sr_notes",
)

In [6]:
%%time

# Download every listed CSV from S3 and stack them into one DataFrame (`df`).
#
# The listing response carries no 'Contents' key when nothing matched the
# prefix, so fail early with a readable message instead of an opaque KeyError.
listed_objects = response.get('Contents', [])
if not listed_objects:
    raise ValueError(
        "S3 listing returned no objects; check bucket_name and file_path"
    )

# Collect the per-file frames in their own list so that `df` only ever names
# the final concatenated DataFrame.
monthly_frames = []
for i, listed_object in enumerate(listed_objects):
    key = listed_object['Key']
    obj = s3.get_object(Bucket=bucket_name, Key=key)
    # The first row of each file is skipped and no header is inferred, so the
    # columns arrive positionally numbered; a single space is read as missing.
    df_temp = pd.read_csv(
        io.BytesIO(obj['Body'].read()),
        encoding="ISO-8859-1",
        header=None,
        skiprows=1,
        na_values=" ",
    )
    monthly_frames.append(df_temp)
    print(i,' = ',key)
df = pd.concat(monthly_frames, ignore_index=True)

0  =  nfrm_data/clarify/2022/clarify_report_sr_notes_Apr2022_OLTP.csv
1  =  nfrm_data/clarify/2022/clarify_report_sr_notes_Aug2022_OLTP.csv
2  =  nfrm_data/clarify/2022/clarify_report_sr_notes_Feb2022_OLTP.csv
3  =  nfrm_data/clarify/2022/clarify_report_sr_notes_Jan2022_OLTP.csv
4  =  nfrm_data/clarify/2022/clarify_report_sr_notes_Jul2022_OLTP.csv
5  =  nfrm_data/clarify/2022/clarify_report_sr_notes_Jun2022_OLTP.csv
6  =  nfrm_data/clarify/2022/clarify_report_sr_notes_Mar2022_OLTP.csv
7  =  nfrm_data/clarify/2022/clarify_report_sr_notes_May2022_OLTP.csv
8  =  nfrm_data/clarify/2022/clarify_report_sr_notes_Sep2022_OLTP.csv
CPU times: user 1min 46s, sys: 35.7 s, total: 2min 22s
Wall time: 7min 17s


In [7]:
# The CSVs were read with header=None, so the frame has positional column
# labels; give the 14 columns their names, in file order.
clarify_columns = [
    'POLICY_NBR', 'CASE_ID', 'CASE_TITLE', 'CASE_CREATE_DATE',
    'CLIENT_ID', 'SR_ID', 'SR_TITLE', 'SR_TYPE_CATG',
    'SR_TYPE_DESC', 'SUB_TYPE', 'SR_CREATE_DT', 'SR_CLOSE_DT',
    'ACTION_TYPE', 'SR_NOTES',
]
df.columns = clarify_columns

In [8]:
# Order the rows by policy number, then by the raw SR_CREATE_DT value.
df = df.sort_values(by=['POLICY_NBR', 'SR_CREATE_DT'], ascending=True)

In [9]:
# Let long text cells display up to 1000 characters before being truncated.
pd.set_option('display.max_colwidth', 1000)

# Pull every row of one policy and save it locally as a reference sample.
sample_mask = df['POLICY_NBR'] == 'V3077010'
sample = df.loc[sample_mask]
sample.to_csv("clarify_sample.csv")

### processing

In [10]:
# Keep only the columns needed for the time-to-close features.
# Take an explicit copy: the following cells assign into df_clarify column by
# column, and writing into a selection of `df` is chained assignment
# (SettingWithCopyWarning, currently hidden by warnings.filterwarnings).
df_clarify = df[['POLICY_NBR', 'SR_CREATE_DT', 'SR_CLOSE_DT', 'SR_TYPE_CATG']].copy()

In [11]:
# Trim leading/trailing whitespace from every column that supports the
# string accessor; columns that do not are left exactly as they are.
for column_name in df_clarify.columns:
    column = df_clarify[column_name]
    try:
        stripped = column.str.strip()
    except AttributeError:
        # `.str` is unavailable for this column's values - nothing to trim.
        continue
    df_clarify[column_name] = stripped

In [12]:
# Drop every space character inside the SR category label (literal match).
sr_category = df_clarify["SR_TYPE_CATG"]
df_clarify["SR_TYPE_CATG"] = sr_category.str.replace(" ", "", regex=False)

In [13]:
# Parse the SR create/close timestamps and truncate them to calendar days.
#
# The source strings look like "YYYY-MM-DD-HH.MM.SS.ffffff". `.dt.normalize()`
# sets the time-of-day to midnight while keeping the datetime64 dtype, which
# replaces the previous format-to-'%m/%d/%y'-and-reparse round trip: that one
# squeezed the year into two digits (ambiguous on re-parse) and did two extra
# string conversions per column. Missing values stay NaT.
SR_TIMESTAMP_FORMAT = "%Y-%m-%d-%H.%M.%S.%f"

for date_col in ('SR_CREATE_DT', 'SR_CLOSE_DT'):
    parsed = pd.to_datetime(df_clarify[date_col], format=SR_TIMESTAMP_FORMAT)
    df_clarify[date_col] = parsed.dt.normalize()

#### Maximum time to process

In [14]:
# For each (policy, SR category) pair: earliest create date and latest close
# date, joined back together into one row per pair.
group_keys = ['POLICY_NBR', 'SR_TYPE_CATG']
sr_groups = df_clarify.groupby(group_keys, as_index=False)
min_create_dt = sr_groups['SR_CREATE_DT'].min()
max_close_dt = sr_groups['SR_CLOSE_DT'].max()
min_max_df = pd.merge(min_create_dt, max_close_dt, how='inner', on=group_keys)

# Calendar month (as a Period) in which the pair's first SR was created.
min_max_df['sr_create_month'] = min_max_df['SR_CREATE_DT'].dt.to_period(freq='M')

# closed = 1 when a close date is present, 0 when it is missing.
close_dates = min_max_df['SR_CLOSE_DT']
min_max_df['closed'] = np.where(close_dates.notna(), 1, 0)

# Whole days between first creation and last close (NaN when never closed).
days_open = (close_dates - min_max_df['SR_CREATE_DT']) / np.timedelta64(1, 'D')
min_max_df['time_to_close'] = np.round(days_open, 0)

In [15]:
# Derive the closed flag and the day count from the create/close dates.
# These are the same two formulas that end the previous cell, so running
# this cell leaves min_max_df unchanged.
close_dates = min_max_df['SR_CLOSE_DT']
min_max_df['closed'] = np.where(close_dates.notna(), 1, 0)

days_open = (close_dates - min_max_df['SR_CREATE_DT']) / np.timedelta64(1, 'D')
min_max_df['time_to_close'] = np.round(days_open, 0)

#### Number of cases

In [16]:
# Number of SR categories per policy and creation month.
df_count = (
    min_max_df
    .groupby(['POLICY_NBR', 'sr_create_month'])['SR_TYPE_CATG']
    .count()
    .rename('num_sr_catg')
    .reset_index()
)

#### Longest-running SR category

In [17]:
# Longest time-to-close (in days) among a policy's SR categories, per
# creation month; used afterwards to look up which category it belongs to.
df_sr_time = (
    min_max_df
    .groupby(['POLICY_NBR', 'sr_create_month'])['time_to_close']
    .max()
    .rename('max_sr_time')
    .reset_index()
)

In [18]:
# Put the category count and the longest close time side by side, one row
# per (policy, creation month).
df_out = pd.merge(
    df_count,
    df_sr_time,
    how='inner',
    on=['POLICY_NBR', 'sr_create_month'],
)

In [19]:
# Drop the two intermediate frames now that they are merged into df_out.
del df_count
del df_sr_time

In [20]:
# Attach the details of the SR category whose time_to_close equals the
# month's maximum, matching on policy, creation month and that duration.
# NOTE(review): categories tied on the maximum each produce their own row,
# and this cell reassigns df_out from itself, so running it a second time
# merges the already-merged frame again - confirm both are acceptable.
df_out = pd.merge(
    df_out,
    min_max_df,
    how='inner',
    left_on=['POLICY_NBR', 'sr_create_month', 'max_sr_time'],
    right_on=['POLICY_NBR', 'sr_create_month', 'time_to_close'],
)

In [21]:
# Spot-check the result for the same policy that was exported as a sample.
df_out.loc[df_out['POLICY_NBR'] == 'V3077010']

Unnamed: 0,POLICY_NBR,sr_create_month,num_sr_catg,max_sr_time,SR_TYPE_CATG,SR_CREATE_DT,SR_CLOSE_DT,closed,time_to_close
2074312,V3077010,2022-02,2,6.0,CVC-Illustration,2022-02-24,2022-03-02,1,6.0
2074313,V3077010,2022-03,1,78.0,Corres,2022-03-16,2022-06-02,1,78.0
2074314,V3077010,2022-04,1,0.0,CustomerValueCenter,2022-04-07,2022-04-07,1,0.0
2074315,V3077010,2022-05,3,32.0,CVC-Correspondence,2022-05-05,2022-06-06,1,32.0
2074316,V3077010,2022-06,1,1.0,EFT,2022-06-02,2022-06-03,1,1.0


#### export to S3

In [22]:
# %%time
# df_out =pd.read_csv("clarify_sample.csv")


# import awswrangler as wr
# wr.s3.to_csv(
#     df=df_out,
#     path='s3://cdo-ililapse-364524684987-bucket/x266754/temp/clarify_sample.csv'
# )

In [23]:
%%time
import awswrangler as wr

# Write df_out to S3 as a single parquet object (not a partitioned dataset);
# the call's return value is left as the cell output.
parquet_path = 's3://cdo-ililapse-364524684987-bucket/x266754/lapse/clarify_jan_jun.parquet'
wr.s3.to_parquet(df=df_out, path=parquet_path)

CPU times: user 9.89 s, sys: 325 ms, total: 10.2 s
Wall time: 10.3 s


{'paths': ['s3://cdo-ililapse-364524684987-bucket/x266754/lapse/clarify_jan_jun.parquet'],
 'partitions_values': {}}