<img src="../images/AzPTravel_PPM.png">

## Convert xlsx to parquet

- Check if there is anything in the `parquet_version` folder under ".../Reporting/Data Collection/<percode>/Manual Updates/xlsx_to_parquet".
- If so, move its contents to the `Archive` folder under the same root.
- Read the xlsx files into a dict of pandas DataFrames.
- Output each DataFrame as parquet.

### User Variables

In [23]:
# Data Collection Code: controls the file paths and output names used below.
# Note: "-f" is the value indicating a bad inheritance from a run with an argument.
percode = "2021.Q1"


#### Packages used


In [24]:
import os # System commands
import sys # System commands
import shutil # System utility commands 


import warnings # custom warnigns options

import glob # Directory operations
import getpass # Work with parquet
import json # Work with json

import matplotlib.pyplot as plt #Plots and Graphs
import numpy as np # Series and math
import pandas as pd #DataFrame and math


#### Begin Input Read
- Make paths used in script
- all paths should be modular and easily replaced ahead of operations.

In [25]:
# Root directory for this period's manual xlsx -> parquet updates
rt_path = f'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/{str(percode)}/Manual Updates/xlsx_to_parquet'

# Working folders under the root, unpacked in this order:
#   xlsx_path    - xlsx input directory
#   output_path  - parquet output directory
#   archive_path - archive for previous parquet output
#   csv_path     - csv folder for comparison
xlsx_path, output_path, archive_path, csv_path = (
    os.path.join(rt_path, folder)
    for folder in ('xlsx_submission', 'parquet_version', 'Archive', 'csv_comparisons')
)



#### Move contents of parquet_version to Archive

In [26]:
# Move everything currently in the parquet output folder into the archive.
#
# shutil.move() into a directory raises shutil.Error when a file of the same
# name is already there, so a second run of this notebook used to stop here.
# If the archive folder was missing, the first file was renamed to "Archive".
# Both cases are handled: the folder is created when absent, and a numeric
# suffix is added to the archived name when that name is already taken.
os.makedirs(archive_path, exist_ok=True)

file_names = os.listdir(output_path)

for file_name in file_names:
    stem, ext = os.path.splitext(file_name)
    target = os.path.join(archive_path, file_name)
    suffix = 1
    while os.path.exists(target):
        # e.g. german_allocations_1.parquet, german_allocations_2.parquet, ...
        target = os.path.join(archive_path, f'{stem}_{suffix}{ext}')
        suffix += 1
    shutil.move(os.path.join(output_path, file_name), target)

#### Get list of files in the xlsx folder

In [27]:
# Lower-cased names of every entry in the xlsx input folder
files = [entry.lower() for entry in os.listdir(xlsx_path)]

# Keep the names whose last five characters contain '.xls'
# (this admits both .xls and .xlsx workbooks)
files_input = list(filter(lambda entry: '.xls' in entry[-5:], files))

files_input

['german_allocations.xlsx']

In [28]:
# file name -> DataFrame for every workbook found in the input folder
xlsx_dict = {}

# The loop variable stays `f`: the preview cell that follows reads it after the loop.
for f in files_input:
    # Read 'Sheet1' of the workbook.
    # (Options the author left disabled: skiprows = 3,
    #  sheet_name = 'Ptf_Monitoring_GROSS_Reins'.)
    sheet = pd.read_excel(os.path.join(xlsx_path, f), sheet_name='Sheet1')

    # Keep only the rows that have a 'Business Unit' value and renumber them from 0.
    has_business_unit = sheet['Business Unit'].notna()
    xlsx_dict[f] = sheet[has_business_unit].reset_index(drop=True)

In [29]:
# Print every loaded workbook under its file name.
# Iterating over the dict avoids relying on the loop variable `f` left behind
# by the loading loop: that name is undefined when no input files were found,
# and otherwise only refers to the last file that was read.
for name, frame in xlsx_dict.items():
    print(name)
    print(frame)

   Business Unit Country Currency                           Region  \
0             DE      DE      EUR  North, Central & Eastern Europe   
1             DE      DE      EUR  North, Central & Eastern Europe   
2             DE      DE      EUR  North, Central & Eastern Europe   
3             DE      DE      EUR  North, Central & Eastern Europe   
4             DE      DE      EUR  North, Central & Eastern Europe   
..           ...     ...      ...                              ...   
68            DE      DE      EUR  North, Central & Eastern Europe   
69            DE      DE      EUR  North, Central & Eastern Europe   
70            DE      DE      EUR  North, Central & Eastern Europe   
71            DE      DE      EUR  North, Central & Eastern Europe   
72            DE      DE      EUR  North, Central & Eastern Europe   

   Reporting Date From Reporting Date To Date of Analysis Type of Analysis  \
0           2021-01-01        2021-03-31       2021-06-07     Year To Date   
1  

In [30]:
# Columns that are cast to float before the parquet export.
# Add any float columns that give a mixed type error to this list.
v_flo = [
    'Open Claims %',
    'Contribution Margin % on Earned Revenues net of Taxes - BU View',
    'Contribution Margin % on Earned Revenues net of Taxes - HQ View',
    'Loss Ratio',
    'Commission Ratio',
    'Expense Ratio',
    'comsub',
    'expsub',
    '% of IBNR on (OCR + IBNR)',
    'Contribution Margin % on Fixed Costs - BU View',
    'Contribution Margin % on Fixed Costs - HQ View',
]

# Columns that are cast to str before the parquet export.
# Add any string columns that give a mixed type error to this list.
v_str = [
    'Notes',
    'Type of Business',
    'Type of Account',
    'Distribution Type',
    'LOB',
    'Distribution Channel',
    'Business Partner ID Number',
]

# v_int / v_dat: placeholders only. No integer or datetime column lists are
# defined yet; add them here if such columns give a mixed type error.


In [31]:
# Write each loaded workbook to the parquet output folder.
for k, v in xlsx_dict.items():

    # Swap the real file extension for ".parquet".
    # The previous k.replace(k[-5:], ".parquet") assumed a five-character
    # extension: a ".xls" name lost the last character of its stem
    # ("foo.xls" -> "fo.parquet"), and every occurrence of the suffix inside
    # the name was replaced. os.path.splitext only touches the final extension.
    parquet_name = os.path.splitext(k)[0] + ".parquet"
    out_file = os.path.join(output_path, parquet_name)

    # Missing values become 0 before the per-column casts below.
    v = v.fillna(0)

    # Float columns first, then string columns. In each listed column,
    # empty / whitespace-only cells are turned into NaN before the cast.
    for columns, dtype in ((v_flo, 'float'), (v_str, 'str')):
        for i in columns:
            if i not in v.columns:
                # Listed columns that this workbook does not have are skipped.
                continue
            v[i] = v[i].replace(r'^\s*$', np.nan, regex=True).astype(dtype)

    v.to_parquet(out_file, engine="pyarrow")

#### The cells below write CSV copies of the xlsx input and of the parquet output so the two can be compared.

In [32]:
# Write a CSV copy of every loaded workbook into the comparison folder.
for k, v in xlsx_dict.items():
    # Swap the real extension for ".csv". The previous k.replace(k[-5:], ".csv")
    # cut a character off ".xls" names and replaced every occurrence of the
    # suffix; os.path.splitext only touches the final extension.
    csv_name = os.path.splitext(k)[0] + ".csv"
    out_file = os.path.join(csv_path, csv_name)
    v.to_csv(out_file)

# Show the path of the last CSV written
out_file

'//hecate/Insurance_US/Product Development/Product Management/Global PPM/Reporting/Data Collection/2021.Q1/Manual Updates/xlsx_to_parquet\\csv_comparisons\\german_allocations.csv'

In [33]:
# Lower-cased names of every entry in the parquet output folder
files = [entry.lower() for entry in os.listdir(output_path)]

# Only the parquet files are read back for the comparison
files_output = [entry for entry in files if entry.endswith('.parquet')]


In [34]:
# CSV name for each parquet file: "<name>.parquet" -> "<name>_p.csv"
outname = [parquet_file.replace(".parquet", "_p.csv") for parquet_file in files_output]

# CSV name -> DataFrame read back from the matching parquet file
parq_dict = {
    csv_key: pd.read_parquet(os.path.join(output_path, parquet_file), engine="pyarrow")
    for csv_key, parquet_file in zip(outname, files_output)
}

parq_dict

{'german_allocations_p.csv':    Business Unit Country Currency                           Region  \
 0             DE      DE      EUR  North, Central & Eastern Europe   
 1             DE      DE      EUR  North, Central & Eastern Europe   
 2             DE      DE      EUR  North, Central & Eastern Europe   
 3             DE      DE      EUR  North, Central & Eastern Europe   
 4             DE      DE      EUR  North, Central & Eastern Europe   
 ..           ...     ...      ...                              ...   
 68            DE      DE      EUR  North, Central & Eastern Europe   
 69            DE      DE      EUR  North, Central & Eastern Europe   
 70            DE      DE      EUR  North, Central & Eastern Europe   
 71            DE      DE      EUR  North, Central & Eastern Europe   
 72            DE      DE      EUR  North, Central & Eastern Europe   
 
    Reporting Date From Reporting Date To Date of Analysis Type of Analysis  \
 0           2021-01-01        2021-03-

In [35]:
# Write each frame read back from parquet into the comparison folder;
# the dict keys are already the target CSV file names.
for csv_name, frame in parq_dict.items():
    out_file = os.path.join(csv_path, csv_name)
    frame.to_csv(out_file)