***
# Infer Schemas
***

### _Import libraries_

In [1]:
import pandas as pd
import pandera as pda
import tkinter as tk
import os
import yaml
import json
import pandavro as pdx # Import to add .avro files to pandas DataFrames.
from tkinter import messagebox, filedialog

### _Create dialog window_

In [2]:
root = tk.Tk()
root.withdraw()
root.focus_force()
root.call('wm', 'attributes', '.', '-topmost', True)

''

### _Select input and output folders_

In [3]:
# Select input folder
messagebox.showinfo(title='Folder Selection', message='Please select the input folder containing the datasets.')
input_directory = filedialog.askdirectory()
if input_directory == '':
    messagebox.showinfo(title='Folder Selection', message='No folder was selected. Please select a folder.')
    os._exit(0)

# Select output folder
messagebox.showinfo(title='Folder Selection', message='Please select the folder you would like to extract the schema files to')
output_directory = filedialog.askdirectory()
if output_directory == '':
    messagebox.showinfo(title='Folder Selection', message='No folder was selected. Please select a folder.')
    os._exit(0)

# Check if output directory exists
    # Create output directory if does not exist
try:
    output_directory = os.path.join(output_directory, 'schemas-output-folder')
    output_directory = os.path.normpath(output_directory)
    yaml_output_directory = os.path.join(output_directory, 'yaml-schemas-output-folder')
    yaml_output_directory = os.path.join(yaml_output_directory)
    os.mkdir(output_directory)
    os.mkdir(yaml_output_directory)
    print(yaml_output_directory)
except OSError as error: 
    print(error) 
    
%gui tk

/Users/veles/Applications/dev/schema-inference/data/schemas-output-folder/schemas-output-folder/yaml-schemas-output-folder


In [3]:
# Select input folder
messagebox.showinfo(title='Folder Selection', message='Please select the input folder containing the datasets.')
if input_directory != None:
   input_directory = filedialog.askdirectory(initialdir=input_directory)
else:
    input_directory = filedialog.askdirectory()
    
if input_directory == '':
    messagebox.showinfo(title='Folder Selection', message='No folder was selected. Please select a folder.')
    os._exit(0)

# Select output folder
messagebox.showinfo(title='Folder Selection', message='Please select the folder you would like to extract the schema files to')
output_directory = filedialog.askdirectory()
    
if output_directory == '':
    messagebox.showinfo(title='Folder Selection', message='No folder was selected. Please select a folder.')
    os._exit(0)

# Check if output directory exists
    # Create output directory if does not exist
try:
    output_directory = os.path.join(output_directory, 'schemas-output-folder')
    output_directory = os.path.normpath(output_directory)
    yaml_output_directory = os.path.join(output_directory, 'yaml-schemas-output-folder')
    yaml_output_directory = os.path.normpath(yaml_output_directory)
    os.mkdir(output_directory)
    os.mkdir(yaml_output_directory)
except OSError as error: 
    print(error) 
    
%gui tk

NameError: name 'input_directory' is not defined

### _Create Methods_

In [4]:
# Convert yaml to json and export
def convert_and_export(file_name, yaml_path):

    # Join output directory with file name
    json_path = str(os.path.join(output_directory, file_name + '.json'))

    # Load yaml file
    with open(yaml_path) as yaml_file:
        yaml_config = yaml.safe_load(yaml_file)

    # Create json file with write permissions
    with open(json_path, 'w') as json_file:
        json_file = json.dump(yaml_config, json_file, indent=2)

In [5]:
def run(input_directory):
    # Add global scope variable
    global df
    
    count = 0
   
    # Iterate through files in input_directory
    for file in os.listdir(input_directory):
        
        # Combine input directory with file name for file export
        f = os.path.join(input_directory, file)
        
        # Normalize path string to OS default
        f = os.path.normpath(f)
        
        print(f)
        
        # Check for file extension
            # Add to pandas DataFrame
        if '.parquet' in f:
            df = pd.read_parquet(f)
        elif '.csv' in f:
            df = pd.read_csv(f)
        elif '.json' in f:
            df = pd.read_json(f)
        elif '.avro' in f:
            df = pdx.read_avro(f)
        else:
            continue

        # Split data file name to remove the file extension
        # Used to name the schema file
        split_string = file.split('.')
        file_name = split_string[-2] + '-schema'

        print('file name: ' + file_name)
        
        for i in df.columns:
            if df[i].dtype == object:
                df[i] = df[i].astype(pd.StringDtype())

        # Create schema in python DataFrame object
        schema = pda.infer_schema(df)

        # Export to yaml (pandera default)
        yaml_path = str(os.path.join(yaml_output_directory, file_name + '.yaml'))
        yaml_schema = schema.to_yaml(yaml_path)

        # Convert yaml to json and export
        convert_and_export(file_name, yaml_path)

        count += 1
        
    print('Exported ' + str(count) + ' file(s) to ' + output_directory)

### _Run program_

In [6]:
run(input_directory)

/Users/veles/Applications/dev/schema-inference/data/.DS_Store
/Users/veles/Applications/dev/schema-inference/data/userdata1.avro
file name: userdata1-schema
/Users/veles/Applications/dev/schema-inference/data/userdata1.parquet
file name: userdata1-schema
/Users/veles/Applications/dev/schema-inference/data/schemas-output-folder
Exported 2 file(s) to /Users/veles/Applications/dev/schema-inference/data/schemas-output-folder/schemas-output-folder


#### Pros:
    - Easy to write code with the panderas library creating the schemas.
#### Cons: 
    - Still potentionally need to modify schemas, as panderas documentation states it only creates rough drafts.