<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [26]</a>'.</span>

<img src="../images/AzPTravel_PPM.png">

## GPM Data Transformation Script

#### This script transforms the single consolidated raw file "{Data Collection}.csv" into it's final GPM input versions "{Data Collection Code}localcur.csv" and "{Data Collection Code}euroconv.csv"
#### Current transformations

-    make headers lowercase and replace spaces with hyphens
-    Remove any rows with null BUs
-    validate columns in validcols
-    output documented here: "Global_Attribute_Catalog.xlsx" you can also edit dataframe entries using the instructions in the file
-    replace various null or placeholder values with "Not Provided"


###

In [1]:
commit = ""
# give a reason for the run

percode = "20XX.QX"
# Data Collection Code, this controls file paths and output names

run_type = 1
#run_type =  0 - lite run with no reporting, not recommended.
#run_type =  1 - lite run with normal reporting, default setting.
#run_type =  2 - Heavy run with full reporting, available for audits and troubleshooting.

specialchars = "-GTHtest"
# optional - add up to a 12 character code in order to mark your instance record .ipynb

from datetime import datetime
inst_datetime = datetime.now().strftime("%m%d%Y%H%M%S")
# a single datetime stamp for the full instance run


In [2]:
# Parameters
run_control = 1
percode = "2021.Q1"
commit_message = "Add PL and CZ, added standard reporting to pipeline, Travel LOB report is not knitting."
inst_datetime = "05262021080602"


### Set Run Control

##### 0 - lite run with no reporting, not recommended.
##### 1 - lite run with normal reporting, default setting.
##### 2 - Heavy run with full reporting, available for audits and troubleshooting.

In [3]:
run_control = 0  # in development mode

In [4]:
#### Packages used
import sys
import os
import pandas as pd
from pandas import ExcelWriter
from numpy import nan

In [5]:
default_dc = "2021.Q1"

try:
    if sys.argv[1] == "-f":
        percode = default_dc
    else:
        percode = sys.argv[1]

except IndexError:
    percode = default_dc
except NameError:
    percode = default_dc

#### style settings

In [6]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### Convert paths and files to variables.
#### Read data into pd DataFrames.
#### Make paths for the live sources.

In [7]:
rt_path = f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production/{str(percode)}'

infile = os.path.join(rt_path, f'{str(percode)}.parquet')
gppm_file = os.path.join(rt_path, f'GPPM_Input_{str(percode)}.xlsx')
atcat = f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production/{str(percode)}\\{str(percode)}_Attribute_Catalog.xlsx'


In [8]:
#### Read input file

#### This reads the consolidated file that will be transformed or a stored dataframe based on the execution method.

In [9]:
try:
    %store -r read_dc_df
    input = read_dc_df
        
except:
    input = pd.read_parquet(infile,engine = "pyarrow")
    os.remove(infile)

print(input)

                       Submission File Business Unit Country Currency  \
0     us_dat_au_02172021174434.parquet            AU      AU      AUD   
1     us_dat_au_02172021174434.parquet            AU      AU      AUD   
2     us_dat_au_02172021174434.parquet            AU      AU      AUD   
3     us_dat_au_02172021174434.parquet            AU      AU      AUD   
4     us_dat_au_02172021174434.parquet            AU      AU      AUD   
...                                ...           ...     ...      ...   
3500  us_dat_pt_05202021121056.parquet            PT      PT      EUR   
3501  us_dat_pt_05202021121056.parquet            PT      PT      EUR   
3502  us_dat_pt_05202021121056.parquet            PT      PT      EUR   
3503  us_dat_pt_05202021121056.parquet            PT      PT      EUR   
3504  us_dat_pt_05202021121056.parquet            PT      PT      EUR   

                           Region Reporting Date From Reporting Date To  \
0                            APAC          2020-

#### Make a copy for debug purposes

In [10]:
trandata = input.copy()

#### Format Headers

In [11]:
# save the column names to variable, format them, replace headers
cols = trandata.columns.values

"""format headers"""
fixedcols = []

for f in cols:
    fixedhead = f.replace(' ', '_').lower()
    fixedcols.append(fixedhead)

trandata.columns = fixedcols

'format headers'

#### Update Verification 1

In [12]:
if run_control > -10:
    disp = {"Original": input.columns, "Transformed": trandata.columns}

    disp = pd.DataFrame(disp)

    print(disp.to_markdown())
else:
    print("Skipped Transformation Report 1")

|    | Original                                                        | Transformed                                                     |
|---:|:----------------------------------------------------------------|:----------------------------------------------------------------|
|  0 | Submission File                                                 | submission_file                                                 |
|  1 | Business Unit                                                   | business_unit                                                   |
|  2 | Country                                                         | country                                                         |
|  3 | Currency                                                        | currency                                                        |
|  4 | Region                                                          | region                                                          |
|  5 | Reporting Date From 

#### Remove rows with null business units


In [13]:
# todo, remove this from read file
trandata = trandata[trandata.business_unit.notnull()]

#### Update Verification 2

In [14]:
if run_control > -10:

    ibus = input.groupby(['Business Unit']).count()
    ibus = ibus.reset_index()
    tbus = trandata.groupby(['business_unit']).count()
    tbus = tbus.reset_index()

    compdata = pd.merge(ibus, tbus, left_on='Business Unit', right_on='business_unit', how='left')

    disp = {"Business Unit": compdata['Business Unit'], "Original": compdata['Earned Revenues net of Taxes'],
            "Transformed": compdata['earned_revenues_net_of_taxes']}

    disp = pd.DataFrame(disp)

    print("Row Counts")
    print(disp.to_markdown())
    
else:
    
    print("Skipped Transformation Report 2")


Row Counts
|    | Business Unit   |   Original |   Transformed |
|---:|:----------------|-----------:|--------------:|
|  0 | AU              |         85 |            85 |
|  1 | CA              |         17 |            17 |
|  2 | CH              |       3187 |          3187 |
|  3 | CZ              |          7 |             7 |
|  4 | ES              |         28 |            28 |
|  5 | GR              |         15 |            15 |
|  6 | IT              |        100 |           100 |
|  7 | PL              |         27 |            27 |
|  8 | PT              |         39 |            39 |


#### Swap  nulls for "Not Provided" in 'Sub Lob' and 'Distribution Channel'

In [15]:
mults = trandata['sub_lob'][trandata.sub_lob.replace(nan, 'Not Provided').str.contains(',', case=False)]
mults = mults.unique()

trandata['sub_lob'] = trandata['sub_lob'].fillna('Not Provided')
trandata['sub_lob'] = trandata['sub_lob'].replace('', 'Not Provided')
trandata['distribution_channel'] = trandata['distribution_channel'].replace('', 'Not Provided').fillna('Not Provided')

#### Replace multiple entries in "Sub Lob" to "Multiple"

In [16]:
for i in mults:
    trandata['sub_lob'] = trandata['sub_lob'].replace(i, 'Multiple')

# todo automate figuring out which fields fx rates should be applied to somehow

#### Update Verification 3

In [17]:
if run_control > -10:

    ibus = input.groupby(['Sub LOB']).sum()
    ibus = ibus.reset_index()
    tbus = trandata.groupby(['sub_lob']).sum()
    tbus = tbus.reset_index()

    compdatasublob = pd.merge(ibus, tbus, left_on='Sub LOB', right_on='sub_lob', how='outer')

    disp1 = pd.DataFrame(
        {'Original Sub LOB': compdatasublob['Sub LOB'], 'Transformed Sub LOB': compdatasublob['sub_lob'],
         "Original": compdatasublob['Earned Revenues net of Taxes'] / 1000,
         "Transformed": compdatasublob['earned_revenues_net_of_taxes'] / 1000})



    tots1 = pd.DataFrame({'Original Total': (input['Earned Revenues net of Taxes']).sum() / 10000,
                          "Transformed Total": (compdatasublob['earned_revenues_net_of_taxes']).sum() / 10000},
                         index=[0])
else:
    print("Skipped Transformation Report 2")

#### Make cflds, a list of currency fields, force to float, coercion is null or string to 0

In [18]:
cflds = ['written_revenues_net_of_taxes', 'written_revenues', 'earned_revenues_net_of_taxes',
         'earned_revenues', 'earned_base_commissions', 'earned_over-commissions', 'upfront_cash_payments',
         'total_compensation', 'paid_claims', 'ocr_+_ibnr', 'actual_incurred_losses_(paid_+_ocr_+_ibnr)',
         'internal_variable_costs_(excl._az_tech_fee)', 'az_tech_fee', 'internal_fixed_costs_(excl._hq_fees)',
         'hq_fees', 'total_expenses', 'risk_premium', 'profit_or_loss', 'contribution_margin_-_hq_view',
         'contribution_margin_-_bu_view', ]

for i in cflds:
    pd.to_numeric(trandata[i], errors='coerce')

0        1772.4400
1       73384.9866
2        4193.4300
3         874.8900
4        3388.4715
           ...    
3500     1509.1600
3501     2164.1100
3502      309.7200
3503        0.0000
3504     1260.0000
Name: written_revenues_net_of_taxes, Length: 3505, dtype: float64

0          0.0000
1          0.0000
2          0.0000
3          0.0000
4          0.0000
          ...    
3500    1644.9844
3501    2358.8799
3502     337.5948
3503       0.0000
3504    1373.4000
Name: written_revenues, Length: 3505, dtype: float64

0         5524.7718
1       517217.3011
2        12578.4759
3          874.8900
4        11928.4636
           ...     
3500      1576.8600
3501      2085.8000
3502        16.5700
3503        68.0000
3504      1260.0000
Name: earned_revenues_net_of_taxes, Length: 3505, dtype: float64

0          0.0000
1          0.0000
2          0.0000
3          0.0000
4          0.0000
          ...    
3500    1718.7774
3501    2273.5220
3502      18.0613
3503      74.1200
3504    1373.4000
Name: earned_revenues, Length: 3505, dtype: float64

0          411.4924
1       186987.7598
2         1798.8306
3          171.8100
4         4812.8102
           ...     
3500         0.0000
3501         0.0000
3502         0.0000
3503         0.0000
3504         0.0000
Name: earned_base_commissions, Length: 3505, dtype: float64

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3500    0.0
3501    0.0
3502    0.0
3503    0.0
3504    0.0
Name: earned_over-commissions, Length: 3505, dtype: float64

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3500    0.0
3501    0.0
3502    0.0
3503    0.0
3504    0.0
Name: upfront_cash_payments, Length: 3505, dtype: float64

0          411.4924
1       186987.7598
2         1798.8306
3          171.8100
4         4812.8102
           ...     
3500         0.0000
3501         0.0000
3502         0.0000
3503         0.0000
3504         0.0000
Name: total_compensation, Length: 3505, dtype: float64

0          841.6000
1       179952.5261
2            0.0000
3            0.0000
4          650.3800
           ...     
3500        69.4500
3501         0.0000
3502       630.1200
3503         0.0000
3504         0.0000
Name: paid_claims, Length: 3505, dtype: float64

0         158.949826
1       30986.379636
2         284.268956
3          16.847558
4         317.166200
            ...     
3500     1472.110000
3501      608.720000
3502        0.000000
3503        0.000000
3504        0.000000
Name: ocr_+_ibnr, Length: 3505, dtype: float64

0         1000.549826
1       210938.905736
2          284.268956
3           16.847558
4          967.546200
            ...      
3500      1541.560000
3501       608.720000
3502       630.120000
3503         0.000000
3504         0.000000
Name: actual_incurred_losses_(paid_+_ocr_+_ibnr), Length: 3505, dtype: float64

0         930.696166
1       91986.087535
2        1076.945043
3          55.083911
4        1236.770428
            ...     
3500      433.790006
3501      341.095979
3502      305.054137
3503       33.114511
3504      835.265953
Name: internal_variable_costs_(excl._az_tech_fee), Length: 3505, dtype: float64

0         315.488111
1       29535.321094
2         718.284798
3          49.959963
4         681.166314
            ...     
3500       45.413568
3501       60.071040
3502        0.477216
3503        1.958400
3504       36.288000
Name: az_tech_fee, Length: 3505, dtype: float64

0         1474.395339
1       145930.890587
2         3144.451670
3          151.527925
4         2633.394389
            ...      
3500       272.796780
3501       360.843400
3502         2.866610
3503        19.176000
3504       355.320000
Name: internal_fixed_costs_(excl._hq_fees), Length: 3505, dtype: float64

0         546.657015
1       54435.025836
2        1298.674892
3          51.138343
4        1329.630483
            ...     
3500       96.976890
3501      128.276700
3502        1.019055
3503        4.182000
3504       63.000000
Name: hq_fees, Length: 3505, dtype: float64

0         3267.236631
1       321887.325052
2         6238.356404
3          307.710141
4         5880.961615
            ...      
3500       848.977244
3501       890.287119
3502       309.417018
3503        58.430911
3504      1289.873953
Name: total_expenses, Length: 3505, dtype: float64

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3500    0.0
3501    0.0
3502    0.0
3503    0.0
3504    0.0
Name: risk_premium, Length: 3505, dtype: float64

0          845.492943
1      -202596.689488
2         4257.019941
3          378.522301
4          267.145586
            ...      
3500      -813.677244
3501       586.792881
3502      -922.967018
3503         9.569089
3504       -29.873953
Name: profit_or_loss, Length: 3505, dtype: float64

0       2866.545298
1      -2230.773065
2       8700.146503
3        581.188569
4       4230.170458
           ...     
3500    -443.903574
3501    1075.912981
3502    -919.081353
3503      32.927089
3504     388.446047
Name: contribution_margin_-_hq_view, Length: 3505, dtype: float64

0        2319.888282
1      -56665.798901
2        7401.471611
3         530.050226
4        2900.539975
            ...     
3500     -540.880464
3501      947.636281
3502     -920.100408
3503       28.745089
3504      325.446047
Name: contribution_margin_-_bu_view, Length: 3505, dtype: float64

#### Turn selected columns values uppercase

In [19]:
validcols = ['business_unit', 'currency', 'region', 'type_of_analysis', 'type_of_business', 'type_of_account', 'lob',
             'distribution_type', 'distribution_channel', ]

for c in validcols:
    trandata[c] = trandata[c].astype(str)
    trandata[c] = trandata[c].apply(lambda x: x.upper())

#### Read the values from the Global Attribute Catalog, one field per loop iterance and xlsx sheet.

In [20]:
checktabs = []



for s in validcols:
    t = pd.read_excel(atcat, sheet_name = s )
    checktabs.append([[s], [t]])

gacout = []


#### Compare the lists from the previous step and each field. Find those that do not match, make lists of unique values
#### replace any that have replacements in GPPM inoout already, make a list of values without any matches.


In [21]:
# todo rewrite this beginner code to use dicts and simplify

for u in enumerate(validcols):
    trandata.loc[0:len(trandata[u[1]]), u[1]] = trandata[u[1]].replace(
        [checktabs[u[0]][1][0]['Upper_Vers']][0].to_numpy(), [checktabs[u[0]][1][0][u[1]]][0].to_numpy())
    d = list(checktabs[u[0]][1][0]['Non Matches'].drop_duplicates().dropna().append(
        pd.Series(trandata[u[1]][~trandata[u[1]].isin(checktabs[u[0]][1][0][u[1]])].drop_duplicates().dropna()),
        ignore_index=True))
    g = [checktabs[u[0]][1][0][str(u[1])], checktabs[u[0]][1][0]['Upper_Vers'], checktabs[u[0]][1][0]['Unnamed: 2'],
         pd.Series(d, dtype='object').drop_duplicates().dropna(), checktabs[u[0]][1][0]['User Defined Corrections']]
    gacout.append(list([g, u[1]]))

#### Rebuild the GPPM input file, with the replaced values in the bad values column of each sheet.
#### Step one, make and xlsx with a notes page.

In [22]:
w = ExcelWriter(atcat)
notes = pd.DataFrame([
    "This page is script generated during the source creation process. Do not edit these notes directly in the file as they will be overwritten",
    "", ""
    , " Purpose :     to manage attribute entries in the data collection process, this workbook documents and organizes all entries and also allows a user to swap those that do \
                        not conform to validation rules  with an entry of their choice",
    " Each attribute field that requires validation has its own sheet tab", ""
    , "Column A:    of each sheet tab contains all unique acceptable responses",
    "Column B:    an upper case version, to wrangle case mismatches",
    "Column D:   is generated by the process, this is a list of an uppercase version of each unique unacceptable response, this builds over time with each collection",
    "Column E:   you can enter accpetable response here (sase sensitive) to be swapped out in the data, save and exit this file, run the process again and they will be replaced"
    , "", "", "Gavin Harmon 9 - July -2020"])

notes.columns = ['Notes']

notes.to_excel(w, index=False, sheet_name="Notes")

#### Step two, build the new sheets for each validcols field

In [23]:
for v in enumerate(validcols):
    df = pd.DataFrame(gacout[v[0]][0], index=[f"{v[1]}", 'Upper_Vers', '', 'Non Matches', 'User Defined Corrections']).T
    df.to_excel(w, index=False, sheet_name=gacout[v[0]][1])
w.save()

#### Step three, read these lists back in, make the necessary replacements in the DataFrame

In [24]:
for s in validcols:
    t = pd.read_excel(f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production/{str(percode)}\\{str(percode)}_Attribute_Catalog.xlsx', sheet_name = s )
    checktabs.append([[s],[t]])

In [25]:
gacout = []

for u in enumerate(validcols):
    trandata.loc[0:len(trandata[u[1]]), u[1]] = trandata[u[1]].replace(
        [checktabs[u[0]][1][0]['Non Matches']][0].to_numpy(),
        [checktabs[u[0]][1][0]['User Defined Corrections']][0].to_numpy())
repper = trandata[['business_unit', 'reporting_date_to']]
repper = pd.DataFrame({"business_unit": (repper['business_unit']), "YearMo": (repper['reporting_date_to'])})
minrep = repper.groupby(['business_unit']).max()
a = trandata['business_unit'].replace(list(minrep.axes[0]), minrep.get("YearMo"))
trandata['rep_date'] = a
trandata.loc[0:len(trandata['business_partner_id_number']), 'business_partner_id_number'] = trandata[
    'business_partner_id_number'].replace('0', 'Not Provided').fillna('Not Provided')
trandata.loc[0:len(trandata['product_id_number']), 'product_id_number'] = trandata['product_id_number'].replace('0',
                                                                                                                'Not Provided').replace(
    '-', 'Not Provided').fillna('Not Provided')
trandata.loc[0:len(trandata['sub_lob']), 'sub_lob'] = trandata['sub_lob'].replace('0', 'Not Provided').replace('-',
                                                                                                               'Not Provided').replace(
    'Other', 'Not Provided').fillna('Not Provided')

#### Evaluate the current state, alert the user if input is needed.
- At the end of this process, alert the user with instructions if they need to adjust bad entries for validated fields.



<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [26]:
#see above note about converting to dictionaries

#Make lists of all the attribute catalog entries that do not have a correction as a list of errors.
err_messages = []

for idx, i in enumerate(checktabs):
    subset_no_nans =  i[1][0][['Non Matches', 'User Defined Corrections']][~i[1][0]['Non Matches'].isna()]
    entries = subset_no_nans['Non Matches'][subset_no_nans['User Defined Corrections'].isna()]. tolist()
    if len(entries) > 0: 
        err_messages.append(f'Please deal with bad entries {entries} on tab {i[0][0]} of the Attribute Catalog.')

#raise an error to stop the process and give instructions

# define Python user-defined exceptions


# define Python user-defined exceptions
class Error(Exception):
    """Base class for other exceptions"""
    pass

class BadEntriesError(Error):
    """Raised when the input value is too large"""
    for i in err_messages : print(i)
        

if len(err_messages) > 0:
    raise BadEntriesError
else:
    pass
        

Please deal with bad entries ['CARRIERS - AIRLINES'] on tab type_of_account of the Attribute Catalog.


BadEntriesError: 

#### Remove empty rows, if there is no claims experience and no revenue for a 12 month period, it should not be included

In [None]:
trandata = trandata.loc[(trandata['units_of_risk_(written)'].fillna(0).replace('', 0)
                         + trandata['written_revenues_net_of_taxes'].fillna(0).replace('', 0)
                         + trandata['written_revenues'].fillna(0).replace('', 0)
                         + trandata['number_of_policies_(earned)'].fillna(0).replace('', 0)
                         + trandata['units_of_risk_(earned)'].fillna(0).replace('', 0)
                         + trandata['earned_revenues_net_of_taxes'].fillna(0).replace('', 0)
                         + trandata['earned_revenues'].fillna(0).replace('', 0)
                         + trandata['earned_base_commissions'].fillna(0).replace('', 0)
                         + trandata['upfront_cash_payments'].fillna(0).replace('', 0)
                         + trandata['earned_over-commissions'].fillna(0).replace('', 0)
                         + trandata['total_compensation'].fillna(0).replace('', 0)
                         + trandata['number_of_claims_(paid_+_ocr_+_ibnr)'].fillna(0).replace('', 0)
                         + trandata['number_of_open_claims'].fillna(0).replace('', 0)
                         + trandata['open_claims_%'].fillna(0).replace('', 0)
                         + trandata['number_of_persons_involved_in_claims_(paid_+_ocr_+_ibnr)'].fillna(0).replace('', 0)
                         + trandata['paid_claims'].fillna(0).replace('', 0)
                         + trandata['ocr_+_ibnr'].fillna(0).replace('', 0)
                         + trandata['actual_incurred_losses_(paid_+_ocr_+_ibnr)'].fillna(0).replace('', 0)
                         + trandata['internal_variable_costs_(excl._az_tech_fee)'].fillna(0).replace('', 0)
                         + trandata['az_tech_fee'].fillna(0).replace('', 0)
                         + trandata['internal_fixed_costs_(excl._hq_fees)'].fillna(0).replace('', 0)
                         + trandata['hq_fees'].fillna(0).replace('', 0)
                         + trandata['total_expenses'].fillna(0).replace('', 0)
                         + trandata['frequency_(earned)'].fillna(0).replace('', 0)
                         + trandata['severity'].fillna(0).replace('', 0))
                        != 0]

#### Output temp file

In [None]:
trandata.to_parquet(
    f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production/{str(percode)}\\{str(percode)}.localcur.parquet', engine = "pyarrow")



#### Store the DataFrame for other noteboks to use

In [None]:
py_t_df = trandata

%store py_t_df
