<img src="../images/AzPTravel_PPM.png">

## Convert json to parquet

- Check if there is anything in "H:/Reporting/Data Collection/percode/Manual Updates/json_to_parquet/parquet_version".
- If so, move contents to "H:/Reporting/Data Collection/percode/Manual Updates/json_to_parquet/Archive".
- Read json files to pandas dfs dict.
- Output as parquet.

### User Variables
- These are overwritten if inherited from run_control.ipynb.
- Feel free to reset them for a manual run if you like.
- Do not save the notebook unless percode = "-f".

In [67]:
# Data Collection Code: this controls file paths and output names.
# "-f" is the value indicating a bad inheritance from a run with an argument.
percode = "2021.Q1"


#### Packages used


In [68]:
import os # System commands
import sys # System commands
import shutil # System utility commands 


import warnings # custom warnigns options

import glob # Directory operations
import getpass # Work with parquet
import json # Work with json

import matplotlib.pyplot as plt #Plots and Graphs
import numpy as np # Series and math
import pandas as pd #DataFrame and math


#### Begin Input Read
- Make paths used in script
- all paths should be modular and easily replaced ahead of operations.

In [69]:
# Root directory of the json -> parquet conversion for the selected percode.
rt_path = f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/{str(percode)}/Manual Updates/json_to_parquet'

# Working sub-folders, all derived from the root so that only rt_path has to
# be replaced to redirect the whole run:
#   json_path    - json input directory
#   output_path  - parquet output directory
#   archive_path - archive
#   csv_path     - csv folder for comparison
json_path, output_path, archive_path, csv_path = (
    os.path.join(rt_path, folder)
    for folder in ('json_submission', 'parquet_version', 'Archive', 'csv_comparisons')
)



#### Move contents of parquet_version to Archive

In [70]:
# Archive whatever a previous run left in the parquet output folder.
file_names = os.listdir(output_path)

# shutil.move() only moves *into* its destination when that destination is an
# existing directory; otherwise the source is renamed to the destination path.
# Without the Archive folder the first file would therefore be renamed to a
# file called "Archive". Create the folder first -- leaf only (os.mkdir, not
# os.makedirs), so a wrong root path still fails loudly instead of silently
# building a new directory tree.
if not os.path.isdir(archive_path):
    os.mkdir(archive_path)

for file_name in file_names:
    # If a file with the same name already sits in Archive, shutil.move raises
    # shutil.Error instead of overwriting the archived copy.
    shutil.move(os.path.join(output_path, file_name), archive_path)

In [71]:
# Lower-cased names of everything found in the json input directory.
files = [name.lower() for name in os.listdir(json_path)]

# Keep only the entries whose name ends in '.json'.
files_input = [name for name in files if name.endswith('.json')]

# Bare expression: displays the selected files as the cell output.
files_input

['us_dat_au_02172021174434.json',
 'us_dat_au_05142021143612.json',
 'us_orig_au_02172021174433.json',
 'us_orig_au_05142021143611.json',
 'us_survey_au_02172021174808.json',
 'us_survey_au_05142021143750.json',
 'us_vcomments_au_02172021174434.json',
 'us_vcomments_au_02172021174440.json',
 'us_vcomments_au_05142021143612.json',
 'us_vcomments_au_05142021143617.json']

In [72]:
# Read every selected json file into a DataFrame, keyed by its file name.
# The files are parsed with orient="table".
json_dict = {
    f'{name}': pd.read_json(os.path.join(json_path, name), orient="table")
    for name in files_input
}

In [73]:
# Float columns: add any float columns that give a mixed type error to this list.
v_flo = [
    'Open Claims %',
    'Contribution Margin % on Earned Revenues net of Taxes - BU View',
    'Contribution Margin % on Earned Revenues net of Taxes - HQ View',
    'Loss Ratio',
    'Commission Ratio',
    'Expense Ratio',
    'comsub',
    'expsub',
    '% of IBNR on (OCR + IBNR)',
    'Contribution Margin % on Fixed Costs - BU View',
    'Contribution Margin % on Fixed Costs - HQ View',
]

# Placeholders for the other dtypes (not defined yet). Add any columns that
# give a mixed type error to the matching list:
#   v_str - string columns
#   v_int - integer columns
#   v_dat - datetime columns
#
# Matching casts, currently disabled:
#     for i in v_str: v[i] = v[i].astype('str'  )
#     for i in v_int: v[i] = v[i].astype( 'int' )
#     for i in v_dat: v[i] = v[i].astype('datetime64[ns]')

In [74]:
# Write each loaded json table to the parquet output folder under the same
# file name, with ".json" replaced by ".parquet".
for json_name, frame in json_dict.items():
    target = os.path.join(output_path, json_name.replace(".json", ".parquet"))

    # Missing values become 0 before any casting.
    frame = frame.fillna(0)

    for column in v_flo:
        try:
            # Empty / whitespace-only cells become NaN so the float cast works.
            cleaned = frame[column].replace(r'^\s*$', np.nan, regex=True)
        except KeyError:
            # This table does not have the column; nothing to cast.
            continue
        frame[column] = cleaned.astype('float')

    frame.to_parquet(target, engine="pyarrow")

#### Commented-out cells create docs to prove that the json and parquet outputs are the same.

In [75]:
# for k, v in json_dict.items():
#     csv_name = k.replace(".json",".csv")
#     out_file = os.path.join(csv_path,csv_name)
#     v.to_csv(out_file)
    

In [76]:
# files = os.listdir(output_path)
# files = [files.lower() for files in files]

# files_output = [f for f in files if f[-8:]  == '.parquet']

# files_output

['us_dat_au_02172021174434.parquet',
 'us_dat_au_05142021143612.parquet',
 'us_orig_au_02172021174433.parquet',
 'us_orig_au_05142021143611.parquet',
 'us_survey_au_02172021174808.parquet',
 'us_survey_au_05142021143750.parquet',
 'us_vcomments_au_02172021174434.parquet',
 'us_vcomments_au_02172021174440.parquet',
 'us_vcomments_au_05142021143612.parquet',
 'us_vcomments_au_05142021143617.parquet']

In [83]:
# outname = []
# parq_dict = {}

# for idx, f in enumerate(files_output):
#     outname.append(f.replace(".parquet","_p.csv"))
#     parq_dict.update({f'{outname[idx]}': pd.read_parquet(os.path.join(output_path, f), engine="pyarrow")})
    
# parq_dict

{'us_dat_au_02172021174434_p.csv':    Business Unit Country Currency Region Reporting Date From  \
 0             AU      AU      AUD   APAC          2020-01-01   
 1             AU      AU      AUD   APAC          2020-01-01   
 2             AU      AU      AUD   APAC          2020-01-01   
 3             AU      AU      AUD   APAC          2020-01-01   
 4             AU      AU      AUD   APAC          2020-01-01   
 ..           ...     ...      ...    ...                 ...   
 80            AU      AU      AUD   APAC          2020-01-01   
 81            AU      AU      AUD   APAC          2020-01-01   
 82            AU      AU      AUD   APAC          2020-01-01   
 83            AU      AU      AUD   APAC          2020-01-01   
 84            AU      AU      AUD   APAC          2020-01-01   
 
    Reporting Date To Date of Analysis         Type of Analysis  \
 0         2020-12-31       2021-02-08  Most Recently 12 Months   
 1         2020-12-31       2021-02-08  Most Recen

In [84]:
# for k, v in parq_dict.items():
#     csv_name = k
#     out_file = os.path.join(csv_path,csv_name)
#     v.to_csv(out_file)