<img src="../images/AzPTravel_PPM.png">

# Minimal R file for Proof of Concept
### To be built into all trasformations from the Travel Global Report
### Adapted from Travel_Global_Report_Pipeline_Version.Rmd with all Python code removed

##### Parameters section

In [1]:

commit <- "New German Data."
# give a reason for the run

percode <- "2021.Q1"
# Data Collection Code, this controls file paths and output names

run_type <- 1
#run_type =  0 - lite run with no reporting, not recommended.
#run_type =  1 - lite run with normal reporting, default setting.
#run_type =  2 - Heavy run with full reporting, available for audits and troubleshooting.

specialchars <- "-GTHtest"
# optional - add up to a 12 character code in order to mark your instance record .ipynb

#inst_datetime <- datetime.now().strftime("%m%d%Y%H%M%S")
# a single datetime stamp for the full instance run



In [2]:
# Parameters
run_control = 1
percode = "2021.Q1"
commit_message = "Demo Test"
inst_datetime = "05262021182323"


##### Packages

In [3]:

# Optional ultimately maybe parameterize if people need libs installed.

# install.packages("arrow")
# install.packages("openxlsx") # excel
# install.packages("data.table") # data manipulation
# install.packages("dplyr") # data manipulation used in some of the viz
# install.packages("magrittr") # chaining
# install.packages("knitr") # html table output with kable function
# install.packages("kableExtra") # addtl styling to kable tables
# install.packages("ggplot2") # visualization
# install.packages("patchwork") # viz, combine plots in one image
# install.packages("DT") # html table output (javascript)
# install.packages("scales") # plot scales
# install.packages("stringr") # string replace
# install.packages("arsenal") # comparison functionality


#----- Libraries ------

library(arrow) # read/write parquet
library(openxlsx) # excel
library(data.table) # data manipulation
library(dplyr) # data manipulation used in some of the viz
library(magrittr) # chaining
library(knitr) # html table output with kable function
library(kableExtra) # addtl styling to kable tables
library(ggplot2) # visualization
library(patchwork) # viz, combine plots in one image
library(DT) # html table output (javascript)
library(scales) # plot scales
library(stringr) # string replace
library(arsenal) # comparison functionality



"package 'arrow' was built under R version 3.6.3"

See arrow_info() for available features



Attaching package: 'arrow'



The following object is masked from 'package:utils':

    timestamp



"package 'openxlsx' was built under R version 3.6.3"

"package 'data.table' was built under R version 3.6.3"

"package 'dplyr' was built under R version 3.6.3"

Registered S3 methods overwritten by 'tibble':
  method     from  
  format.tbl pillar
  print.tbl  pillar



Attaching package: 'dplyr'



The following objects are masked from 'package:data.table':

    between, first, last



The following objects are masked from 'package:stats':

    filter, lag



The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



"package 'magrittr' was built under R version 3.6.3"


Attaching package: 'magrittr'



The following object is masked from 'package:arrow':

    is_in



"package 'knitr' was built under R version 3.6.3"

"package 'kableExtra' was built under R version 3.6.3"


Attaching package: 'kableExtra'



The following object is masked from 'package:dplyr':

    group_rows



"package 'ggplot2' was built under R version 3.6.3"

"package 'patchwork' was built under R version 3.6.3"

"package 'DT' was built under R version 3.6.3"

"package 'scales' was built under R version 3.6.3"

"package 'stringr' was built under R version 3.6.3"

"package 'arsenal' was built under R version 3.6.3"


Attaching package: 'arsenal'



The following object is masked from 'package:scales':

    ordinal



The following object is masked from 'package:magrittr':

    set_attr



##### Read parquet file 

In [4]:
rt_path <- file.path('//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production', percode )

tempfile <- file.path(rt_path,'2021.Q1.localcur.parquet' )

pq_input <-  read_parquet( tempfile,  col_select = NULL,
  as_data_frame = TRUE,  props = ParquetArrowReaderProperties$create())

##### Original data input section, some adaptations for i_bu

In [5]:

# source('ColorPalette.R')

#----- Data ------

# define data directory
data_dir <- rt_path



# read in BU datasets
# data has been pre-processed and already converted to one currency (euro)
#d_input <- fread(file.path(data_dir, '2020.Q4.euroconv.csv'))




# read in dataset for reference items
# file for inputs
inputs <- paste("GPPM_Input_",percode,".xlsx",sep="")

# read in Euro exchange rates
# !!! use rates as of the end of the reporting period
# From Az Connect, https://connect.allianz.com/docs/DOC-220762
i_rates <- as.data.table(
  read.xlsx(
    xlsxFile = file.path(data_dir, inputs), 
    sheet = 'Euro_Rates'
    )
  )

i_def <- as.data.table(read.xlsx(
  xlsxFile = file.path(data_dir, inputs), 
  sheet = 'Definitions', 
  startRow = 1
  )
)
names(i_def)[names(i_def) == 'Data.Type'] <- 'Data Type' # quick way (base R) to replace the "." in Data Type header

i_bu <- as.data.table(
  read.xlsx(
  xlsxFile = file.path(data_dir, inputs), 
  sheet = 'BU_Descr', 
  startRow = 1
  )
)

#create R object from py variable

d_input <- as.data.table(pq_input)

print(d_input)

                       submission_file business_unit country currency
   1: us_dat_au_05142021143612.parquet            AU      AU      AUD
   2: us_dat_au_05142021143612.parquet            AU      AU      AUD
   3: us_dat_au_05142021143612.parquet            AU      AU      AUD
   4: us_dat_au_05142021143612.parquet            AU      AU      AUD
   5: us_dat_au_05142021143612.parquet            AU      AU      AUD
  ---                                                                
1964: us_dat_pt_05202021121056.parquet            PT      PT      EUR
1965: us_dat_pt_05202021121056.parquet            PT      PT      EUR
1966: us_dat_pt_05202021121056.parquet            PT      PT      EUR
1967: us_dat_pt_05202021121056.parquet            PT      PT      EUR
1968: us_dat_pt_05202021121056.parquet            PT      PT      EUR
                           region reporting_date_from   reporting_date_to
   1:                        APAC 2020-12-31 19:00:00 2021-03-30 20:00:00
   2:       

##### Data manipulation functions

In [6]:
# replace all NAs with 0
# stolen from https://stackoverflow.com/questions/7235657/fastest-way-to-replace-nas-in-a-large-data-table
replaceNA <- function(dt, replace = 0) {
  for (j in seq_len(ncol(dt)))
    set(dt, which(is.na(dt[[j]])), j , replace)
}

# Rename columns to make easier to work with in a function
# Replace "."s with "_"s & make lower case
# replace other patterns not needed in description
fixNames <- function(columns) {
  columns <- columns %>%
    tolower() %>% 
    gsub(pattern = '.', replacement = '_', fixed = TRUE) %>% 
    gsub(pattern = '_(paid_+_ocr_+_ibnr)', replacement = '', fixed = TRUE) %>%
    gsub(pattern = '_(excl__az_tech_fee)', replacement = '', fixed = TRUE) %>%
    gsub(pattern = '_(excl__hq_fees)', replacement = '', fixed = TRUE) %>%
    gsub(pattern = 'persons_involved_in_claims', replacement = 'claimants', fixed = TRUE) %>%
    gsub(pattern = 'units_of_risk', replacement = 'insureds', fixed = TRUE) %>%
    gsub(pattern = 'contribution_margin', replacement = 'cm', fixed = TRUE) %>%
    gsub(pattern = 'number_of', replacement = 'num', fixed = TRUE) %>%
    gsub(pattern = '%', replacement = 'pct', fixed = TRUE) %>%
    gsub(pattern = '(', replacement = '', fixed = TRUE) %>% 
    gsub(pattern = ')', replacement = '', fixed = TRUE) %>% 
    gsub(pattern = '-_', replacement = '', fixed = TRUE) %>%
    gsub(pattern = '+', replacement = '', fixed = TRUE) %>% 
    gsub(pattern = '-', replacement = '_', fixed = TRUE)

  return(columns)
}



##### Sample Data Manipulation Applied

In [7]:

# create a copy of the data.table to make check transformations in next step
d_input_in <- d_input

# set new column names
setnames(x = d_input, old = names(d_input), new = fixNames(names(d_input))) 

#----- Data Cleaning (Specific to BU) ------
# !!! temporary step
# pre-processing removes "Global" entry under the "type_of_account" field since it isn't an option in the template
# Currently, only Global-FoS and GLobal-FoE are options, but this is specific to Europe
# this step replaces these blanks with "Global"


d_input[business_unit == 'US' & type_of_account != 'Local', type_of_account := 'Global']
d_input[business_partner_name == 'Delta Vacations, LLC', business_partner_name := 'Delta Airlines'] #US had a change in main office; manual correction to the name

# fix channel for "Car Trawler" for all BUs
d_input[business_partner_name == 'Car Trawler', distribution_channel := 'Online Travel Agencies (OTAs)']

# change "Other" in Switzerland's channel and sublob to "Not Provided"
d_input[business_unit == 'CH' & distribution_channel %in% c('', 'Other'), distribution_channel := 'Not Provided']
d_input[business_unit == 'CH' & sub_lob == 'Other', sub_lob := 'Not Provided']

# Fix some of the characters for b-partners that got distorted in pre-processing
d_input[business_partner_name == 'CornÃ¨r Banca', business_partner_name := 'Cornèr Banca']
d_input[business_partner_name == 'ESL SÃ©jours Linguistiques', business_partner_name := 'ESL Séjours Linguistiques']
d_input[business_partner_name == 'ReisebÃ¼ro Mittelthurgau', business_partner_name := 'Reisebüro Mittelthurgau']
d_input[business_partner_name == 'Ã–KK', business_partner_name := 'ÖKK']
d_input[business_partner_name == 'HK fÃ¼r GÃ¤ste', business_partner_name := 'HK für Gäste']
d_input[business_partner_name == 'Twerenbold â€“ Busreisen Assistance Europa', business_partner_name := 'Twerenbold Busreisen Assistance Europa']
d_input[business_partner_name == 'VÃ¶gele Reisen', business_partner_name := 'Vögele Reisen']
d_input[business_partner_name == 'Last Minute Tours RestplatzbÃ¶rse', business_partner_name := 'Last Minute Tours Restplatzbörse']
d_input[business_partner_name == 'InterdiÃ¶zesane Lourdeswallfahrt', business_partner_name := 'Interdiözesane Lourdeswallfahrt']
d_input[business_partner_name == 'StÃ¶cklin Reisen', business_partner_name := 'Stöcklin Reisen']
d_input[business_partner_name == 'Heilungskosten fÃ¼r GÃ¤ste', business_partner_name := 'Heilungskosten für Gäste']
d_input[business_partner_name == 'Tourasia RÃ¶mer', business_partner_name := 'Tourasia Römer']
d_input[business_partner_name == 'Sumiswalder GrundgebÃ¼hr', business_partner_name := 'Sumiswalder Grundgebühr']



#----- Add currency conversion to raw data ------
# define columns that will need a conversion
curr_cols <- c(
 'written_revenues_net_of_taxes',
 'written_revenues',
 'earned_revenues_net_of_taxes',
'earned_revenues',
 'earned_base_commissions',
 'earned_over_commissions',
 'upfront_cash_payments',
 'total_compensation',
 'paid_claims',
 'actual_incurred_losses',
 'internal_variable_costs',
 'az_tech_fee',
 'internal_fixed_costs',
 'hq_fees',
 'total_expenses',
 'severity',
 'risk_premium',
 'cm_bu_view',
 'cm_hq_view',
 'profit_or_loss'
)

# replace existing China in main dataset
d <- rbindlist(list(d_input), use.names = TRUE, fill = TRUE)

#----- addtl TEMP data manipulation ------


#----- Data Cleaning ------


# save copy of data to manipulate
d <- d_input

# replace all NAs with 0s
replaceNA(d)

# additional manipulations
# change unknown channels and sublobs to "Other"
# ! now completed in pre-processing
#d[distribution_channel == '0', distribution_channel := 'Other']
#d[sub_lob == '0', sub_lob := 'Other']

# add bu names to dataset
# rename "Scandinavia" to "Scandinavia/Baltics" (more inclusive of the dataset)
# changed upstream in process
#i_bu[BU_CODE == 'SC', BU_DESCR := 'Scandinavia/Baltics']

## test
d <- merge(d, i_bu[, .(BU_CODE, bu_descr = BU_DESCR)], by.x = 'business_unit', by.y = 'BU_CODE', all.x = TRUE)


# Fix text in fields

d[product_name %in% c('0', '', '-'), product_name := 'Not Provided']
d[business_partner_name %in% c('-', '0', ''), business_partner_name := 'Not Provided']


# Add a reporting period and combined date of analysis
# F version: changed from "min" to "max"
# !!! special case for CN with different format
#d[business_unit == 'CN', 
#  month_of_analysis_max := format(as.Date(max(as.numeric(date_of_analysis)), 
#                                            origin = '1899-12-30'), '%Y %b'), 
#  by = business_unit]

d[,  month_of_analysis_max := format(as.Date(max(date_of_analysis)), '%Y %b'),  by = business_unit]

#d[,  month_of_analysis_max := max(date_of_analysis),  by = business_unit]

#d[business_unit == 'CN',
#  reporting_period := paste(
#  format(as.Date(as.numeric(reporting_date_from), origin = '1899-12-30'), '%Y %b'),
#  'to',
#  format(as.Date(as.numeric(reporting_date_to), origin = '1899-12-30'), '%Y %b')
#  )]

d[, 
  reporting_period := paste(
  format(as.Date(reporting_date_from), '%Y %b'),
  'to',
  format(as.Date(reporting_date_to), '%Y %b')
  )]

# Replace LoB column with "Travel" since some show up blank or NA
d[, lob := 'Travel']

print(d)


      business_unit                  submission_file country currency
   1:            AU us_dat_au_05142021143612.parquet      AU      AUD
   2:            AU us_dat_au_05142021143612.parquet      AU      AUD
   3:            AU us_dat_au_05142021143612.parquet      AU      AUD
   4:            AU us_dat_au_05142021143612.parquet      AU      AUD
   5:            AU us_dat_au_05142021143612.parquet      AU      AUD
  ---                                                                
1964:            PT us_dat_pt_05202021121056.parquet      PT      EUR
1965:            PT us_dat_pt_05202021121056.parquet      PT      EUR
1966:            PT us_dat_pt_05202021121056.parquet      PT      EUR
1967:            PT us_dat_pt_05202021121056.parquet      PT      EUR
1968:            PT us_dat_pt_05202021121056.parquet      PT      EUR
                           region reporting_date_from   reporting_date_to
   1:                        APAC 2020-12-31 19:00:00 2021-03-30 20:00:00
   2:       

In [8]:
##### Compare the original data.table to the transformed version, example to be built upon.

In [9]:
print("Compare d_input_in to d_input, should be no transformations.")

comparedf(d_input_in, d_input)

print("Compare d_input to d, should be all transformations.")
comparedf(d, d_input)

[1] "Compare d_input_in to d_input, should be no transformations."


Compare Object

Function Call: 
comparedf(x = d_input_in, y = d_input)

Shared: 78 non-by variables and 1968 observations.
Not shared: 0 variables and 0 observations.

Differences found in 0/78 variables compared.
0 variables compared have non-identical attributes.

[1] "Compare d_input to d, should be all transformations."


Compare Object

Function Call: 
comparedf(x = d, y = d_input)

Shared: 78 non-by variables and 1968 observations.
Not shared: 3 variables and 0 observations.

Differences found in 1/78 variables compared.
0 variables compared have non-identical attributes.

In [10]:
##### Output a tempfile to be turned into final outputs

In [11]:
write_parquet(d, tempfile)

#### todo Figure out how to store r objects for runner notebook
