<img src="../images/AzPTravel_PPM.png">
<img src='../images/papermill.png'>

In [1]:
# GPM Data Collection Pipeline Run Controller

### This file is both a user interface and an executable callable from other processes.

### To use:
   - Set the required variable values in the "Set Instance Parameters" section.
   - You can then run the entire pipeline by running all cells, or run the stage notebooks one at a time as you wish.

IndentationError: unexpected indent (<ipython-input-1-b1ba14fb1032>, line 6)

In [None]:

import os
from datetime import datetime
import time
import papermill as pm
from IPython.lib.pretty import pprint
import pandas as pd
import matplotlib.pyplot as plt

## Set Instance Parameters

In [2]:
# Reason for this run; passed to every stage notebook as `commit_message`.
commit = "New German Data."

# Data collection code; this controls file paths and output names,
# and selects the source file format in the lookup cell below.
percode = "2021.Q1"

# Reporting level; passed to every stage notebook as `run_control`.
#   0 - lite run with no reporting, not recommended.
#   1 - lite run with normal reporting, default setting.
#   2 - heavy run with full reporting, available for audits and troubleshooting.
run_type = 1

# Optional - up to a 12 character code appended to the file name of each
# archived instance record (.ipynb) so this run can be identified.
specialchars = "-J9-Demo"

## Source file format lookup

#### This is a list of file formats for each data collection.
- x : denotes that the source files are MS Excel documents.
- j : denotes the source files are json documents.
- p : denotes the source files are parquet documents.



In [3]:
# Source file format per data collection code:
#   "x" = MS Excel, "j" = json, "p" = parquet.
SOURCE_FORMATS = {
    "2019.Q4": "x",
    "2020.Q1": "x",
    "2020.Q3": "j",
    "2020.Q4": "j",
    "2021.Q1": "p",
}

# Fail here with a clear message for an unknown code. Previously an unlisted
# percode left s_format undefined (or stale from an earlier run in the same
# kernel) and the problem only surfaced when prep.ipynb was launched.
try:
    s_format = SOURCE_FORMATS[percode]
except KeyError:
    raise ValueError(
        f"No source file format is registered for percode {percode!r}; "
        f"known codes: {', '.join(SOURCE_FORMATS)}"
    ) from None

## Record the datetimestamp for the instance

In [4]:
# One timestamp (MMDDYYYYHHMMSS) taken once and shared by the full instance run.
inst_datetime = f"{datetime.now():%m%d%Y%H%M%S}"

NameError: name 'datetime' is not defined

## set the instance archive folder

In [None]:
# Folder that receives the executed copy of every stage notebook for this percode.
instances = (
    "//hecate/Insurance_US/Product Development/Product Management/"
    f"Global PPM/Reporting/Data Collection/Production/{percode}/instances"
)

## style settings

In [None]:
# Show the value of every top-level expression in a cell, not only the last one.
# The output-summary cells contain several bare expressions (e.g. file_info_df
# followed by dfp) and rely on this setting to display all of them.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Read the source files, append to create initial .csv.

In [None]:
# Run prep.ipynb through papermill; the executed copy is written to the
# instance archive folder, tagged with the run timestamp and special code.
nb = pm.execute_notebook(
    input_path='prep.ipynb',
    output_path=os.path.join(instances, f'prep{inst_datetime}-{specialchars}.ipynb'),
    parameters=dict(
        run_control=run_type,
        percode=percode,
        commit_message=commit,
        inst_datetime=inst_datetime,
        source_type=s_format,
    ),
)

### Output Summary

In [None]:
# Summarise the prep stage output: file metadata for {percode}prep.csv and a
# revenue pivot by country / submission file for each reporting period.
# rt_path is also read by the later output-summary cells.
rt_path = f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production/{percode}'

prepfile = f"{percode}prep.csv"
prepcsv = os.path.join(rt_path, prepfile)

cols = ["Filename", "Size", "Last Modified"]

name = prepfile
size = round(os.path.getsize(prepcsv) / 1000000, 1)  # size in MB
# getmtime is the last content modification. getctime, used previously, is the
# creation time on Windows and the metadata-change time on Unix, which does not
# match the "Last Modified" column heading.
moddate = datetime.fromtimestamp(os.path.getmtime(prepcsv)).strftime('%Y-%m-%d %H:%M:%S')

# Build the one-row frame directly; DataFrame.append was removed in pandas 2.0.
file_info_df = pd.DataFrame([[name, size, moddate]], columns=cols)

# Read every column as text, then convert only the column that is aggregated.
df = pd.read_csv(prepcsv, index_col=False, dtype=str)
df["Earned Revenues net of Taxes"] = df["Earned Revenues net of Taxes"].astype(float)

file_info_df

# pivot_table aggregates with its default function (mean); combinations with no
# data are shown as 0 and the result is truncated to whole numbers for display.
dfp = (
    df.pivot_table(
        index=("Country", "Submission File"),
        columns=("Reporting Date From", "Reporting Date To"),
        values="Earned Revenues net of Taxes",
    )
    .fillna(0)
    .astype(int)
)

dfp


## Read the prep file into the pipeline.

In [None]:
# Run read_dc.ipynb through papermill; the executed copy is written to the
# instance archive folder, tagged with the run timestamp and special code.
nb = pm.execute_notebook(
    input_path='read_dc.ipynb',
    output_path=os.path.join(instances, f'read_dc{inst_datetime}-{specialchars}.ipynb'),
    parameters=dict(
        run_control=run_type,
        percode=percode,
        commit_message=commit,
        inst_datetime=inst_datetime,
    ),
)

### Output Summary

In [None]:
# Chart the read_dc stage output ({percode}.parquet) by business unit.
# Requires rt_path from the prep "Output Summary" cell.
tempfile = f"{percode}.parquet"

# NOTE: the name is historical; here it holds the parquet path, not a csv path.
prepcsv = os.path.join(rt_path, tempfile)

df = pd.read_parquet(prepcsv, engine="pyarrow")

if run_type > 0:

    # Aggregate only the plotted column. Summing or counting the whole frame is
    # wasted work and, with pandas >= 2.0 (which no longer drops non-numeric
    # columns from groupby sums automatically), can fail or concatenate text.
    revenue_by_bu = df.groupby(['Business Unit'])['Earned Revenues net of Taxes']

    sumdata = revenue_by_bu.sum().reset_index()
    plt.bar(sumdata['Business Unit'], sumdata['Earned Revenues net of Taxes']/1000)
    plt.title('Earned Revenue net of Taxes by BU in thousands LC')
    plt.show()

    # count() gives the number of non-null revenue values per business unit.
    countdata = revenue_by_bu.count().reset_index()
    plt.bar(countdata['Business Unit'], countdata['Earned Revenues net of Taxes'])
    plt.title('Row Counts by BU')
    plt.show()

else:
    print("Charts Skipped")

## Read initial .csv, make transformations in python, output to {percode}.localcur.parquet

In [None]:
# Run py_transform.ipynb through papermill; the executed copy is written to the
# instance archive folder, tagged with the run timestamp and special code.
nb = pm.execute_notebook(
    input_path='py_transform.ipynb',
    output_path=os.path.join(instances, f'py_transform{inst_datetime}-{specialchars}.ipynb'),
    parameters=dict(
        run_control=run_type,
        percode=percode,
        commit_message=commit,
        inst_datetime=inst_datetime,
    ),
)

### Output Summary

In [None]:
# Chart the py_transform stage output ({percode}.localcur.parquet) by business unit.
# Requires rt_path from the prep "Output Summary" cell.
tempfile = f"{percode}.localcur.parquet"

# NOTE: the name is historical; here it holds the parquet path, not a csv path.
prepcsv = os.path.join(rt_path, tempfile)

df = pd.read_parquet(prepcsv, engine="pyarrow")

if run_type > 0:

    # Aggregate only the plotted column. Summing or counting the whole frame is
    # wasted work and, with pandas >= 2.0 (which no longer drops non-numeric
    # columns from groupby sums automatically), can fail or concatenate text.
    revenue_by_bu = df.groupby(['business_unit'])['earned_revenues_net_of_taxes']

    sumdata = revenue_by_bu.sum().reset_index()
    plt.bar(sumdata['business_unit'], sumdata['earned_revenues_net_of_taxes']/1000)
    plt.title('Earned Revenue net of Taxes by BU in thousands LC')
    plt.show()

    # count() gives the number of non-null revenue values per business unit.
    countdata = revenue_by_bu.count().reset_index()
    plt.bar(countdata['business_unit'], countdata['earned_revenues_net_of_taxes'])
    plt.title('Row Counts by BU')
    plt.show()

else:
    print("Charts Skipped")

## Read {percode}.localcur.parquet, make transformations in R, output to {percode}.localcur.parquet

In [None]:
# Run r_transform.ipynb through papermill; the executed copy is written to the
# instance archive folder, tagged with the run timestamp and special code.
nb = pm.execute_notebook(
    input_path='r_transform.ipynb',
    output_path=os.path.join(instances, f'r_transform{inst_datetime}-{specialchars}.ipynb'),
    parameters=dict(
        run_control=run_type,
        percode=percode,
        commit_message=commit,
        inst_datetime=inst_datetime,
    ),
)

## Read {percode}.localcur.parquet, output to {percode}.localcur.csv, make Euro fx conversions, output to {percode}.euroconv.csv

In [None]:
# Run make_output.ipynb through papermill; the executed copy is written to the
# instance archive folder, tagged with the run timestamp and special code.
nb = pm.execute_notebook(
    input_path='make_output.ipynb',
    output_path=os.path.join(instances, f'make_output{inst_datetime}-{specialchars}.ipynb'),
    parameters=dict(
        run_control=run_type,
        percode=percode,
        commit_message=commit,
        inst_datetime=inst_datetime,
    ),
)


In [None]:
# Report name, size and last-modified time of the two final output files.
out_dir = f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/Production/{percode}'

# os.path.join supplies the platform separator instead of a hard-coded backslash.
lfile = os.path.join(out_dir, f'{percode}.localcur.csv')
efile = os.path.join(out_dir, f'{percode}.euroconv.csv')

cols = ["Filename", "Size", "Last Modified"]


def _file_info(path):
    """Return a one-row DataFrame describing the file at *path*.

    Columns are those listed in ``cols``: the file's base name, its size in MB
    rounded to one decimal place, and its last-modification time formatted as
    'YYYY-MM-DD HH:MM:SS'.

    Raises:
        OSError: if *path* does not exist or cannot be accessed.
    """
    size_mb = round(os.path.getsize(path) / 1000000, 1)
    # getmtime is the last content modification; getctime, used previously, is
    # the creation time on Windows and does not match the column heading.
    modified = datetime.fromtimestamp(os.path.getmtime(path)).strftime('%Y-%m-%d %H:%M:%S')
    # The displayed name is taken from the path itself so it cannot drift from
    # the real file name (the old labels were missing the "." after percode).
    # The frame is built directly because DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame([[os.path.basename(path), size_mb, modified]], columns=cols)


l_file_info_df = _file_info(lfile)
e_file_info_df = _file_info(efile)

print("Pipeline has run successfully.")
l_file_info_df
e_file_info_df
