<img src="AzPTravel_PPM.png">

## GPM Data Transformation Script

#### This script transforms the single consolidated raw file "{Data Collection}.csv" into its final GPM input versions "{Data Collection Code}localcur.csv" and "{Data Collection Code}euroconv.csv"
#### Current transformations

-    make headers lowercase and replace spaces with underscores
-    Remove any rows with null BUs
-    validate columns in validcols
-    output documented here: "Global_Attribute_Catalog.xlsx" you can also edit dataframe entries using the instructions in the file
-    replace various null or placeholder values with "Not Provided"


###

In [74]:
from datetime import datetime

# Reason for this run (free text).
commit = ""

# Data Collection Code; this controls file paths and output names.
# NOTE(review): a later cell reassigns percode from sys.argv / default_dc,
# so the placeholder below does not survive a full top-to-bottom run.
percode = "20XX.QX"

# run_type = 0 - lite run with no reporting, not recommended.
# run_type = 1 - lite run with normal reporting, default setting.
# run_type = 2 - heavy run with full reporting, available for audits and troubleshooting.
# NOTE(review): the verification cells visible in this notebook test
# `run_control`, not `run_type` - confirm which of the two is meant to drive reporting.
run_type = 1

# Optional - up to a 12 character code used to mark your instance record .ipynb.
specialchars = "-GTHtest"

# A single datetime stamp (MMDDYYYYHHMMSS) shared by the full instance run.
inst_datetime = datetime.now().strftime("%m%d%Y%H%M%S")


### Set Run Control

##### 0 - lite run with no reporting, not recommended.
##### 1 - lite run with normal reporting, default setting.
##### 2 - Heavy run with full reporting, available for audits and troubleshooting.

In [75]:
# Reporting level checked by the "Update Verification" cells, which run their
# report when run_control > -10 and print a "Skipped ..." message otherwise.
run_control = 0  # in development mode

In [76]:
#### Packages used
import sys
import os
import pandas as pd
from pandas import ExcelWriter
from numpy import nan

In [77]:
# Data Collection Code used when none is supplied on the command line.
default_dc = "2021.Q1"

# Take the collection code from the first command-line argument. A missing
# argument, an unavailable `sys`, or the literal "-f" all fall back to default_dc.
try:
    cli_arg = sys.argv[1]
except (IndexError, NameError):
    percode = default_dc
else:
    percode = default_dc if cli_arg == "-f" else cli_arg

#### Convert paths and files to variables.
#### Read data into pd DataFrames.
#### Make paths for the live sources.

In [78]:
# Root folder of the current data collection; every input and output of this
# notebook lives underneath it.
rt_path = f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production/{percode}'

# Consolidated raw file that gets transformed.
infile = os.path.join(rt_path, f'{percode}.csv')
# GPPM input workbook for this collection.
gppm_file = os.path.join(rt_path, f'GPPM_Input_{percode}.xlsx')
# Attribute catalog workbook (one sheet per validated field). Built from rt_path
# with os.path.join like the other paths, instead of repeating the root folder
# with a hard-coded '\\' separator.
atcat = os.path.join(rt_path, f'{percode}_Attribute_Catalog.xlsx')


#### Read input files

In [79]:
"""this reads the consolidated file that will be transformed"""
# Load the consolidated CSV in one pass (low_memory=False).
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept
# because later cells read the raw frame under this name.
input = pd.read_csv(infile, low_memory=False)


#### Make a copy for debug purposes

In [80]:
# Work on a copy so the raw frame in `input` stays untouched; the verification
# cells compare `input` against `trandata`.
trandata = input.copy()

#### Format Headers

In [81]:
# Normalise the headers: every space becomes an underscore and the whole
# name is lower-cased, then the new names replace the originals.
cols = trandata.columns.values
fixedcols = [header.replace(' ', '_').lower() for header in cols]
trandata.columns = fixedcols

#### Update Verification 1

In [82]:
# Report 1: original header next to its transformed counterpart.
if run_control <= -10:
    print("Skipped Transformation Report 1")
else:
    disp = pd.DataFrame({"Original": input.columns, "Transformed": trandata.columns})
    print(disp.to_markdown())

|    | Original                                                                   | Transformed                                                                |
|---:|:---------------------------------------------------------------------------|:---------------------------------------------------------------------------|
|  0 | Business Unit                                                              | business_unit                                                              |
|  1 | Country                                                                    | country                                                                    |
|  2 | Currency                                                                   | currency                                                                   |
|  3 | Region                                                                     | region                                                                     |
|  4 | Reporting Date From        

#### Remove rows with null business units


In [83]:
# todo, remove this from read file
# Keep only the rows that carry a business unit. The explicit .copy() makes the
# result an independent frame, so the column assignments in later cells do not
# operate on a flagged slice of the previous frame (SettingWithCopyWarning).
# The original index labels are kept on purpose (no reset_index).
trandata = trandata[trandata.business_unit.notnull()].copy()

#### Update Verification 2

In [84]:
# Report 2: per business unit, the non-null count of the revenue column
# before and after the transformation (left join on the original units).
if run_control <= -10:
    print("Skipped Transformation Report 2")
else:
    ibus = input.groupby(['Business Unit']).count().reset_index()
    tbus = trandata.groupby(['business_unit']).count().reset_index()

    compdata = ibus.merge(tbus, left_on='Business Unit', right_on='business_unit', how='left')

    disp = pd.DataFrame({
        "Business Unit": compdata['Business Unit'],
        "Original": compdata['Earned Revenues net of Taxes'],
        "Transformed": compdata['earned_revenues_net_of_taxes'],
    })

    print("Row Counts")
    print(disp.to_markdown())


Row Counts
|    | Business Unit   |   Original |   Transformed |
|---:|:----------------|-----------:|--------------:|
|  0 | AT              |        583 |           583 |
|  1 | AU              |         85 |            85 |
|  2 | CA              |         44 |            44 |
|  3 | CH              |       1810 |          1810 |
|  4 | CN              |         49 |            49 |
|  5 | DE              |        150 |           150 |
|  6 | ES              |         55 |            55 |
|  7 | FOS             |         63 |            63 |
|  8 | FR              |        442 |           442 |
|  9 | GR              |         17 |            17 |
| 10 | IT              |        100 |           100 |
| 11 | NB              |         62 |            62 |
| 12 | NL              |        173 |           173 |
| 13 | NZ              |         41 |            41 |
| 14 | PL              |         31 |            31 |
| 15 | PT              |         67 |            67 |
| 16 | UK        

#### Swap  nulls for "Not Provided" in 'Sub Lob' and 'Distribution Channel'

In [85]:
# Collect the distinct sub_lob entries that contain a comma (several values in
# one cell) before the placeholders are filled in; a later cell maps them to "Multiple".
has_comma = trandata.sub_lob.replace(nan, 'Not Provided').str.contains(',', case=False)
mults = trandata['sub_lob'][has_comma].unique()

# Missing or empty entries become the literal "Not Provided".
trandata['sub_lob'] = trandata['sub_lob'].fillna('Not Provided').replace('', 'Not Provided')
trandata['distribution_channel'] = (
    trandata['distribution_channel'].replace('', 'Not Provided').fillna('Not Provided')
)

#### Replace multiple entries in "Sub Lob" to "Multiple"

In [86]:
# Every sub_lob value collected in `mults` is replaced by "Multiple" in one pass.
trandata['sub_lob'] = trandata['sub_lob'].replace(list(mults), 'Multiple')

# todo automate figuring out which fields fx rates should be applied to somehow

#### Update Verification 3

In [87]:
# Report 3: earned revenue per Sub LOB before and after the transformation.
# Only the revenue column is aggregated; summing every column of both frames
# is unnecessary work and also sums non-numeric columns.
if run_control > -10:
    revenue_in = 'Earned Revenues net of Taxes'
    revenue_out = 'earned_revenues_net_of_taxes'

    ibus = input.groupby(['Sub LOB'])[revenue_in].sum().reset_index()
    tbus = trandata.groupby(['sub_lob'])[revenue_out].sum().reset_index()

    # Outer join so that values present on only one side remain visible.
    compdatasublob = pd.merge(ibus, tbus, left_on='Sub LOB', right_on='sub_lob', how='outer')

    disp1 = pd.DataFrame(
        {'Original Sub LOB': compdatasublob['Sub LOB'], 'Transformed Sub LOB': compdatasublob['sub_lob'],
         "Original": compdatasublob[revenue_in] / 1000,
         "Transformed": compdatasublob[revenue_out] / 1000})

    # NOTE(review): the per-Sub-LOB figures are divided by 1000 but the totals by
    # 10000 - confirm whether the differing scale is intended.
    # NOTE(review): disp1 and tots1 are built but never printed in this cell.
    tots1 = pd.DataFrame({'Original Total': (input[revenue_in]).sum() / 10000,
                          "Transformed Total": (compdatasublob[revenue_out]).sum() / 10000},
                         index=[0])
else:
    # This is the third report; the message previously said "Report 2".
    print("Skipped Transformation Report 3")

#### Make cflds, a list of currency fields, force to float, coercion is null or string to 0

In [88]:
# Currency fields that must be numeric.
cflds = ['written_revenues_net_of_taxes', 'written_revenues', 'earned_revenues_net_of_taxes',
         'earned_revenues', 'earned_base_commissions', 'earned_over-commissions', 'upfront_cash_payments',
         'total_compensation', 'paid_claims', 'ocr_+_ibnr', 'actual_incurred_losses_(paid_+_ocr_+_ibnr)',
         'internal_variable_costs_(excl._az_tech_fee)', 'az_tech_fee', 'internal_fixed_costs_(excl._hq_fees)',
         'hq_fees', 'total_expenses', 'risk_premium', 'profit_or_loss', 'contribution_margin_-_hq_view',
         'contribution_margin_-_bu_view', ]

# pd.to_numeric returns a new Series, so the result has to be assigned back;
# without the assignment the conversion was silently discarded.
# With errors='coerce', values that cannot be parsed become NaN (not 0).
for i in cflds:
    trandata[i] = pd.to_numeric(trandata[i], errors='coerce')

#### Turn selected columns values uppercase

In [89]:
# Attribute fields that are validated against the attribute catalog.
validcols = ['business_unit', 'currency', 'region', 'type_of_analysis', 'type_of_business', 'type_of_account', 'lob',
             'distribution_type', 'distribution_channel', ]

# Cast each field to text and upper-case it so the catalog comparison is
# case-insensitive.
# NOTE(review): astype(str) stringifies missing values too - confirm that is wanted.
for field in validcols:
    trandata[field] = trandata[field].astype(str).str.upper()

#### Read the values from the Global Attribute Catalog, one field (and one xlsx sheet) per loop iteration.

In [90]:
# One entry per validated field, in validcols order: [[field name], [catalog sheet]].
# The sheet for a field is therefore checktabs[position][1][0].
checktabs = [[[sheet], [pd.read_excel(atcat, sheet_name=sheet)]] for sheet in validcols]

# Collects the rebuilt sheet columns that are written back to the catalog.
gacout = []


#### Compare the lists from the previous step and each field. Find those that do not match, make lists of unique values
#### replace any that have replacements in GPPM input already, make a list of values without any matches.


In [91]:
# For every validated field:
#   1. swap the upper-cased catalog spellings for their accepted form,
#   2. collect the values that still match nothing in the catalog,
#   3. store the rebuilt sheet columns in gacout for the catalog rewrite.
for pos, col in enumerate(validcols):
    catalog = checktabs[pos][1][0]

    # Assign the whole column. The previous `.loc[0:len(...), col]` was a label
    # slice, so after rows were dropped (index not reset) any row whose label
    # exceeded the new length silently missed the replacement.
    trandata[col] = trandata[col].replace(catalog['Upper_Vers'].to_numpy(), catalog[col].to_numpy())

    # Non-matches already recorded in the catalog, followed by the ones found now.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    known_bad = catalog['Non Matches'].drop_duplicates().dropna()
    new_bad = trandata[col][~trandata[col].isin(catalog[col])].drop_duplicates().dropna()
    d = list(pd.concat([known_bad, new_bad], ignore_index=True))

    # Column order matches the sheet layout written back later: accepted values,
    # upper-case versions, the unnamed spacer column, non-matches, user corrections.
    g = [catalog[str(col)], catalog['Upper_Vers'], catalog['Unnamed: 2'],
         pd.Series(d, dtype='object').drop_duplicates().dropna(), catalog['User Defined Corrections']]
    gacout.append([g, col])

#### Rebuild the GPPM input file, with the replaced values in the bad values column of each sheet.
#### Step one, make an xlsx with a notes page.

In [92]:
# Open a writer on the attribute catalog path; the sheets added to `w` are
# written out when the writer is saved in the next cell.
w = ExcelWriter(atcat)
# Static help text for the "Notes" sheet, one list entry per row.
notes = pd.DataFrame([
    "This page is script generated during the source creation process. Do not edit these notes directly in the file as they will be overwritten",
    "", ""
    , " Purpose :     to manage attribute entries in the data collection process, this workbook documents and organizes all entries and also allows a user to swap those that do \
                        not conform to validation rules  with an entry of their choice",
    " Each attribute field that requires validation has its own sheet tab", ""
    , "Column A:    of each sheet tab contains all unique acceptable responses",
    "Column B:    an upper case version, to wrangle case mismatches",
    "Column D:   is generated by the process, this is a list of an uppercase version of each unique unacceptable response, this builds over time with each collection",
    "Column E:   you can enter accpetable response here (sase sensitive) to be swapped out in the data, save and exit this file, run the process again and they will be replaced"
    , "", "", "Gavin Harmon 9 - July -2020"])

# Single column, headed "Notes".
notes.columns = ['Notes']

# Queue the notes page as the sheet "Notes" of the workbook.
notes.to_excel(w, index=False, sheet_name="Notes")

#### Step two, build the new sheets for each validcols field

In [93]:
# Rebuild one sheet per validated field from the columns collected in gacout:
# accepted values, upper-case versions, an empty spacer column, non-matches and
# user defined corrections. gacout[n][1] holds the field name used as sheet name.
for sheet_pos, field in enumerate(validcols):
    df = pd.DataFrame(gacout[sheet_pos][0],
                      index=[f"{field}", 'Upper_Vers', '', 'Non Matches', 'User Defined Corrections']).T
    df.to_excel(w, index=False, sheet_name=gacout[sheet_pos][1])

# close() writes the workbook to disk and releases the file handle.
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
w.close()

#### Step three, read these lists back in, make the necessary replacements in the DataFrame

In [94]:
# Read the rewritten catalog back in, one sheet per validated field.
# NOTE(review): the sheets are appended to the existing checktabs list, so
# checktabs[0 .. len(validcols)-1] still hold the sheets from the first read;
# confirm which set the replacement step is meant to use.
catalog_path = f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production/{str(percode)}\\{str(percode)}_Attribute_Catalog.xlsx'
for sheet in validcols:
    checktabs.append([[sheet], [pd.read_excel(catalog_path, sheet_name=sheet)]])

In [95]:
gacout = []

# Swap every catalogued non-match for its user defined correction.
# Whole-column assignment is used throughout this cell: the previous
# `.loc[0:len(...), col]` was a label slice, so rows whose index label exceeded
# the current row count (index is not reset after rows are dropped) were skipped.
# NOTE(review): checktabs[pos] is the entry at list position pos - confirm that is
# the sheet set intended here.
# NOTE(review): presumably a non-match with an empty correction cell is replaced
# by that empty (NaN) value - verify.
for pos, col in enumerate(validcols):
    catalog = checktabs[pos][1][0]
    trandata[col] = trandata[col].replace(
        catalog['Non Matches'].to_numpy(),
        catalog['User Defined Corrections'].to_numpy())

# rep_date: the latest reporting_date_to found for the row's business unit.
repper = trandata[['business_unit', 'reporting_date_to']]
repper = pd.DataFrame({"business_unit": (repper['business_unit']), "YearMo": (repper['reporting_date_to'])})
minrep = repper.groupby(['business_unit']).max()
a = trandata['business_unit'].replace(list(minrep.axes[0]), minrep.get("YearMo"))
trandata['rep_date'] = a

# Placeholder entries and missing values become "Not Provided".
trandata['business_partner_id_number'] = (
    trandata['business_partner_id_number'].replace('0', 'Not Provided').fillna('Not Provided')
)
trandata['product_id_number'] = (
    trandata['product_id_number']
    .replace('0', 'Not Provided')
    .replace('-', 'Not Provided')
    .fillna('Not Provided')
)
trandata['sub_lob'] = (
    trandata['sub_lob']
    .replace('0', 'Not Provided')
    .replace('-', 'Not Provided')
    .replace('Other', 'Not Provided')
    .fillna('Not Provided')
)

#### Remove empty rows, if there is no claims experience and no revenue for a 12 month period, it should not be included

In [96]:
# Measures whose combined value decides whether a row carries any experience.
activity_cols = [
    'units_of_risk_(written)',
    'written_revenues_net_of_taxes',
    'written_revenues',
    'number_of_policies_(earned)',
    'units_of_risk_(earned)',
    'earned_revenues_net_of_taxes',
    'earned_revenues',
    'earned_base_commissions',
    'upfront_cash_payments',
    'earned_over-commissions',
    'total_compensation',
    'number_of_claims_(paid_+_ocr_+_ibnr)',
    'number_of_open_claims',
    'open_claims_%',
    'number_of_persons_involved_in_claims_(paid_+_ocr_+_ibnr)',
    'paid_claims',
    'ocr_+_ibnr',
    'actual_incurred_losses_(paid_+_ocr_+_ibnr)',
    'internal_variable_costs_(excl._az_tech_fee)',
    'az_tech_fee',
    'internal_fixed_costs_(excl._hq_fees)',
    'hq_fees',
    'total_expenses',
    'frequency_(earned)',
    'severity',
]

# Add the measures up left to right, treating missing and empty values as 0.
activity_total = trandata[activity_cols[0]].fillna(0).replace('', 0)
for measure in activity_cols[1:]:
    activity_total = activity_total + trandata[measure].fillna(0).replace('', 0)

# Keep a row only when the combined total is not zero.
# NOTE(review): measures that cancel each other out also give a zero total.
trandata = trandata.loc[activity_total != 0]

#### Output temp file

In [98]:
# Write the transformed local-currency data to a parquet file in the
# collection folder.
localcur_path = f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production/{str(percode)}\\{str(percode)}.localcur.parquet'
trandata.to_parquet(localcur_path)




