<img src="../images/AzPTravel_PPM.png">

## Validation Report Creation Script

#### This script reads the latest "us_orig*", "us_dat*", "us_vcomments*" and "us_survey*" data files for each BU and summarizes them into a report: Survey and Comments_{BU}.xlsx

#### When finished, it will be able to be run on its own or as part of the pipeline wherever called (i.e. from a papermill script or other user interface).


### User Variables
- These are overwritten if inherited from run_control.ipynb.
- Feel free to reset them for a manual run if you like.
- Do not save without percode = "-f"

In [1]:
commit_message = "Development and testing."
# Give a brief reason for the run.

run_control = 1
# run_control = 0 - Lite run with no reporting, not recommended.
# run_control = 1 - Lite run with normal reporting, default setting.
# run_control = 2 - Heavy run with full reporting, available for audits and troubleshooting.
# run_control = 5 - A default setting. Indicates the script is being run by an outside process without an inherited value
# (the run-context cell further down replaces 5 with default_rc)

percode = "2021.Q1"
# Data Collection Code, this controls file paths and output names
# "-f" is the value indicating a bad inheritance from run with arg

s_format = "p"
# denotes the source data format x == Excel; j == json, p == parquet
# NOTE(review): s_format is not referenced in the cells shown here - confirm it is still needed.

#----------
# do not edit - this either inherits the full instance timestamp from the papermill book or captures the run time of this script.
from datetime import datetime  # datetime options
# Local run time formatted as MMDDYYYYHHMMSS (no separators).
inst_datetime = datetime.now().strftime("%m%d%Y%H%M%S")

In [2]:
# Parameters
# These assignments rebind names already set in the user-variables cell
# (run_control, percode, commit_message, inst_datetime), so the values here
# are the ones the rest of the script sees.
# NOTE(review): presumably this cell is injected by the calling papermill run - confirm.
run_control = 1
percode = "2021.Q1"
commit_message = "Live run, added AT, bad file name, rerun."
inst_datetime = "06012021173656"


#### Notebook display options

In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Class-level setting, so it applies to the running shell as a whole.
# NOTE(review): per the IPython docs, "all" displays the value of every
# expression statement in a cell rather than only the last one - confirm
# against the installed IPython version.
InteractiveShell.ast_node_interactivity = "all"

#### import packages

In [4]:
#### Packages used

import os  # Paths, directory listing, opening the report folder
import sys  # Command-line arguments (sys.argv)

import warnings  # Custom warnings options

import glob  # File-name pattern matching in the live sources directory
import getpass  # User / password prompts (stdlib; not parquet-related, not referenced in the cells shown here)
import json  # Work with json

import matplotlib.pyplot as plt  # Plots and graphs
import numpy as np  # Series and math
import pandas as pd  # DataFrame and math; reads the json / parquet sources

# Excel operations
import re  # Regular expressions
import nicexcel as nl  # Excel operations (multi-sheet workbook writer used for the reports)
import xlsxwriter  # Excel operations
import openpyxl  # Excel operations


#### Default Variables, these govern logic, do not edit.

In [5]:
# Placeholder data-collection code; the run-context cell assigns it to percode
# when no usable command-line value is available.
default_dc = "20XX.QX"
# Replaces run_control when it arrives as the sentinel value 5.
default_rc = 0 #extra lite mode
# NOTE(review): dummy_perc is not referenced in the cells shown here - confirm it is still needed.
dummy_perc = "33Q3" # bad inheritance

#### Script determining run context, i.e. manual, run_control.ipynb, or other.

In [6]:
# run_control == 5 is the "no inherited value" sentinel: swap in the default.
# Every other value is kept exactly as it arrived.
run_control = default_rc if run_control == 5 else run_control

# percode is taken from the first command-line argument unless that argument
# is "-f", in which case the value already assigned upstream is kept.
# A missing argument (IndexError) or an undefined percode (NameError) both
# fall back to the placeholder default_dc.
try:
    percode = percode if sys.argv[1] == "-f" else sys.argv[1]
except (IndexError, NameError):
    percode = default_dc


#### style settings

In [7]:
# Repeats the import and the setting from the earlier "Notebook display options"
# cell; it assigns the same value to the same attribute again.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## Begin Input Read
- Make paths used in script
- all paths should be modular and easily replaced ahead of operations.

In [8]:
# Root directory for the current data-collection period.
# Written as a raw string: the previous literal mixed escaped "\\" separators
# with a bare "\D", which is an invalid escape sequence (it only worked because
# Python currently leaves unknown escapes untouched, with a warning). The
# resulting path text is unchanged.
rt_path = os.path.join(
    r"\\hecate\Insurance_US\Product Development\Product Management\Global PPM\Reporting\Data Collection",
    percode,
)

# Live sources directory: <Data Collection>\Production\<percode>\live_sources
ls_path = os.path.abspath(
    os.path.join(rt_path, "..", "Production", percode, "live_sources")
)

# Report output directory: <Data Collection>\<percode>\Surveys and Comments
rep_path = os.path.abspath(
    os.path.join(rt_path, "..", "..", "Data Collection", percode, "Surveys and Comments")
)

# Report output archive directory: <Data Collection>\<percode>\Archive
rep_arch_path = os.path.abspath(
    os.path.join(rep_path, "..", "..", percode, "Archive")
)


#### Make sources used in script
- all sources should be modular and easily replaced ahead of operations.

In [9]:
# Path of a workbook named "Surveys and Comments_xx.xlsx" inside the report directory.
# NOTE(review): rep_xls is not referenced in the cells shown here, and the report
# loop writes "Survey and Comments_{BU}.xlsx" (singular "Survey") - confirm which name is intended.
rep_xls = os.path.join(rep_path ,"Surveys and Comments_xx.xlsx"   )

#### Get a list of only source files in the path that start with "us_dat".
#### Logic determines the source file types.

#### User instructions:
- Make sure that you have 1 file per source in this folder.
    -  For instance, do not have two files for Portugal. If there is an update, archive the old one.
- Do not overwrite files in the archive.
    - Rename newly archived files, no strict convention, we keep track of these by the modified date.
- It is ok to have multiple sources in one file.

In [10]:
# Lower-cased names of every entry in the live sources directory.
files = [entry.lower() for entry in os.listdir(ls_path)]

# Keep only the JSON / parquet entries, expressed as full paths under ls_path.
files_sources = [
    os.path.join(ls_path, name)
    for name in files
    if name.endswith(('.json', '.parquet'))
]


#### Make a list of BUs that have comments available

In [11]:
# Comment files present in the live sources directory, one list per format.
prelist_jfiles = glob.glob(os.path.join(ls_path, 'us_vcomments_*.json'))
prelist_pfiles = glob.glob(os.path.join(ls_path, 'us_vcomments_*.parquet'))

# Full paths of every comment file found (JSON first, then parquet).
prelist_files = prelist_jfiles + prelist_pfiles


def _bu_code(path):
    # Return the BU token of a "us_vcomments_<BU>_<suffix>.<ext>" path, or ''
    # when the name has no third "_"-separated token.
    # The token is parsed from the file name instead of sliced at fixed
    # negative offsets, which were only correct when <suffix> was exactly
    # 14 characters long (a full timestamp) and returned unrelated characters
    # for any other file name.
    stem = os.path.splitext(os.path.basename(path))[0]
    tokens = stem.split('_')
    return tokens[2] if len(tokens) > 2 else ''


# As before, these two names end up holding BU codes rather than paths.
prelist_jfiles = [_bu_code(p) for p in prelist_jfiles]
prelist_pfiles = [_bu_code(p) for p in prelist_pfiles]

# Every BU code found (duplicates included); empty tokens are dropped.
list_bus = [bu for bu in prelist_jfiles + prelist_pfiles if bu]

# Unique BU codes; sorted so reports are produced in the same order on every run.
BU_set = set(list_bus)
BU_list = sorted(BU_set)

files_sources


['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_at_+te_q1_2021_at.parquet',
 '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_au_05142021143612.parquet',
 '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_ca_05202021134242.parquet',
 '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_ch_05182021192455.parquet',
 '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_cz_05262021090243.parquet',
 '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collecti

In [12]:

# File-name prefixes of the four data sets that make up one report, in sheet order.
# The match is a substring test, so "us_vcomment" also picks up "us_vcomments" files.
datafiles = [ "us_orig", "us_dat", "us_vcomment", "us_survey", ]

for b in BU_list:

    latest_data_sets = []
    # Prefixes for which this BU has no file; a non-empty list skips the BU.
    missing_sets = []

    for i in datafiles:

        print(f"_{b.lower()}")
        # Candidate files: the path contains both the data-set prefix and "_<bu>_".
        list_files = [x for x in files_sources if f"{i}" in x and f"_{b.lower()}_" in x]
        print(list_files)

        # max() on an empty list raises ValueError and used to abort the whole
        # run; record the gap and keep going so the other BUs are still built.
        if not list_files:
            missing_sets.append(i)
            continue

        # Newest version of this data set according to os.path.getctime.
        recent_vers = max(list_files, key=os.path.getctime)

        # Choose the reader from the extension instead of attempting read_json
        # first and relying on a UnicodeDecodeError to detect parquet input.
        if recent_vers.endswith(".parquet"):
            frame = pd.read_parquet(recent_vers, engine="pyarrow")
        else:
            frame = pd.read_json(recent_vers, orient="table")

        latest_data_sets.append([f'{i}', frame])

    if missing_sets:
        warnings.warn(
            f"No report written for BU '{b}': no source file found for {missing_sets}"
        )
        continue

    # Sheet name -> DataFrame; positions follow the order of datafiles.
    data_set_dict = {}
    data_set_dict["Original Data Set"] = latest_data_sets[0][1]
    data_set_dict["Final Data Set"] = latest_data_sets[1][1]
    data_set_dict["Validations"] = latest_data_sets[2][1]
    # The survey frame is transposed so its original columns become rows, then relabelled.
    data_set_dict["Survey"] = latest_data_sets[3][1].T.reset_index().rename(columns={'index':'Question Name', 0 :'Selection', 1:'Optional Response',2:'timestamp'})

    # One workbook per BU, written into the report directory.
    nl.to_excel_ms(dfs=data_set_dict, filename=os.path.join(rep_path, f'Survey and Comments_{b}.xlsx'))


_au
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_au_05142021143611.json']


_au
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_au_05142021143612.parquet']


_au
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_au_02172021174434.parquet', '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_au_02172021174440.parquet', '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_au_05142021143612.parquet', '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_au_05142021143617.parquet', '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_au_mnual_creation.parquet']


_au
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_au_02172021174808.parquet', '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_au_05142021143750.parquet']


Original Data Set


Final Data Set


Validations


Survey


_gr
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_gr_05252021115404.parquet']


_gr
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_gr_05252021115404.parquet']


_gr
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_gr_05252021115404.parquet']


_gr
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_gr_05252021115827.parquet']


Original Data Set


Final Data Set


Validations


Survey


_pl
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_pl_05262021084450.parquet']


_pl
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_pl_05262021084450.parquet']


_pl
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_pl_05262021084450.parquet']


_pl
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_pl_05262021084708.parquet']


Original Data Set


Final Data Set


Validations


Survey


_ch
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_ch_05182021192455.parquet']


_ch
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_ch_05182021192455.parquet']


_ch
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_ch_05182021192455.parquet', '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_ch_05182021192456.parquet']


_ch
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_ch_05182021192905.parquet']


Original Data Set


Final Data Set


Validations


Survey


_ca
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_ca_05202021134242.parquet']


_ca
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_ca_05202021134242.parquet']


_ca
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_ca_05202021134242.parquet']


_ca
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_ca_05202021135610.parquet']


Original Data Set


Final Data Set


Validations


Survey


_pt
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_pt_05202021121055.parquet']


_pt
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_pt_05202021121056.parquet']


_pt
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_pt_05202021121056.parquet']


_pt
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_pt_05202021121304.parquet']


Original Data Set


Final Data Set


Validations


Survey


_it
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_it_05242021111520.parquet']


_it
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_it_05242021111520.parquet']


_it
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_it_05242021111521.parquet']


_it
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_it_05242021111759.parquet']


Original Data Set


Final Data Set


Validations


Survey


_cz
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_cz_05242021103837.parquet', '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_cz_05262021090243.parquet']


_cz
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_cz_05262021090243.parquet']


_cz
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_cz_05242021103837.parquet', '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_cz_05262021090243.parquet']


_cz
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_cz_05262021090440.parquet']


Original Data Set


Final Data Set


Validations


Survey


_es
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_es_05202021105141.parquet']


_es
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_es_05202021105141.parquet']


_es
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_es_05202021105141.parquet', '\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_es_05202021105142.parquet']


_es
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_es_05202021105431.parquet']


Original Data Set


Final Data Set


Validations


Survey


_nl
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_orig_nl_05272021163119.parquet']


_nl
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_dat_nl_05272021163120.parquet']


_nl
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_vcomments_nl_05272021163120.parquet']


_nl
['\\\\hecate\\Insurance_US\\Product Development\\Product Management\\Global PPM\\Reporting\\Data Collection\\Production\\2021.Q1\\live_sources\\us_survey_nl_05272021163630.parquet']


Original Data Set


Final Data Set


Validations


Survey


In [13]:
### Open the folder with the documents saved

In [14]:
# Open the report directory with the operating system's default handler.
# NOTE(review): os.startfile is documented as available on Windows only - confirm
# this cell should also run when the notebook is executed unattended.
os.startfile(rep_path)