# In [None]:  (notebook cell marker left by the export; kept as a comment so the module parses)
import sys
import os
import glob
import json
import pandas as pd
import yaml
import nbformat as nbf
import shutil
import unicodedata
import re


# Source field type -> SDV column description.
#   * str   : used directly as the column "sdtype"
#   * tuple : (sdtype, computer_representation)
#   * dict  : complete column specification, used as-is
_TEXT = 'text'
_CATEGORICAL = 'categorical'

conversion_map = {
    'textfield': _TEXT,
    'text': _TEXT,
    'integer': ('numerical', 'Int64'),
    'decimal': ('numerical', 'Float'),
    'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
    'single': _CATEGORICAL,
    'radio': _CATEGORICAL,
    'text_multiline': _TEXT,
    'list': _TEXT,
    'checkbox': _CATEGORICAL,
    'textmultiline': _TEXT,
    'multiples': _CATEGORICAL,
    'multiple': _CATEGORICAL,
}


def import_csv_folder(folder_path):
    """Load a CSV structure export and convert it to SDV metadata.

    Args:
        folder_path (str): Root folder of the export. It must contain a
            ``1_structure`` folder holding the ``*.csv`` table descriptions,
            plus ``2_link/link.csv`` and ``4_dico/dico.csv`` (all read with
            ``;`` as separator).

    Returns:
        The value returned by ``convert_to_metadata`` (metadata and
        dictionary list), or None when ``folder_path`` is empty.

    Raises:
        FileNotFoundError: If ``1_structure`` is missing. The previous
            behaviour (print and return None) made callers that unpack the
            result fail later with an unrelated TypeError.
    """
    if not folder_path:
        return None

    structure_folder_path = os.path.join(folder_path, '1_structure')
    if not os.path.isdir(structure_folder_path):
        raise FileNotFoundError(
            "'1_structure' folder not found in the selected folder.")

    link_file_path = os.path.join(folder_path, '2_link/link.csv')
    dico_file_path = os.path.join(folder_path, '4_dico/dico.csv')

    # glob returns files in arbitrary order; sorting keeps the order of the
    # tables in the generated metadata reproducible between runs.
    all_files = sorted(glob.glob(os.path.join(structure_folder_path, '*.csv')))
    fichier = pd.read_csv(link_file_path, sep=';')
    dicos = pd.read_csv(dico_file_path, sep=';')
    return convert_to_metadata(all_files, fichier, dicos)


def import_json_file(json_file_path, calc):
    """Read a JSON export and turn it into metadata.

    Args:
        json_file_path (path): Path handed to ``read_json_file``.
        calc (bool): Forwarded to ``convert_json_to_metadata``.

    Returns:
        json and dico list: The result of ``convert_json_to_metadata``, or
        None when ``json_file_path`` is empty.
    """
    if not json_file_path:
        return None
    return convert_json_to_metadata(read_json_file(json_file_path), calc)


def read_json_file(json_file_path):
    """Load the JSON document stored at ``<json_file_path>.json``.

    Args:
        json_file_path (path): Path to the file WITHOUT its ``.json``
            extension; the extension is appended here.

    Returns:
        json_file: The decoded JSON content.
    """
    with open(json_file_path + ".json", 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def convert_to_metadata(all_files, link, dico):
    """Build SDV multi-table metadata from the CSV structure export.

    Args:
        all_files (list): Paths of the ``;``-separated table description
            files. Rows with ``type == "P"`` give the table name (``varset``)
            and rows with ``type == "V"`` describe the columns.
        link (DataFrame): One row per relationship, with ``varset_1`` as
            parent table and ``varset_2`` as child table.
        dico (DataFrame): Dictionary values, with ``dico_name`` and ``code``
            columns.

    Returns:
        metadata and list: The metadata dictionary and, for every ``radio``
        column, a description of its allowed codes.
    """
    tables = {}
    radio_dicos = []

    for csv_path in all_files:
        frame = pd.read_csv(csv_path, sep=";")
        table_name = frame.loc[frame["type"] == "P", "varset"] \
            .to_string(index=False)
        columns = {}

        for _, row in frame.loc[frame["type"] == "V"].iterrows():
            column_id = row["field_name"]
            field_type = row["field_type"]

            if field_type == 'radio':
                # Collect the codes of the dictionary attached to the column.
                dico_name = row['dico']
                matching = dico[dico['dico_name'] == dico_name]
                radio_dicos.append({
                    'table_name': table_name,
                    'col': column_id,
                    'type': dico_name,
                    'values': [entry['code']
                               for _, entry in matching.iterrows()],
                })

            mapped = conversion_map.get(field_type, field_type)
            if isinstance(mapped, tuple):
                column_spec = {"sdtype": mapped[0],
                               "computer_representation": mapped[1]}
            elif isinstance(mapped, dict):
                column_spec = mapped
            else:
                column_spec = {"sdtype": mapped}
            columns[column_id] = column_spec

        tables[table_name] = {
            "primary_key": "",
            "columns": columns,
            "column_relationships": [],
        }

    relationships = []
    for _, row in link.iterrows():
        parent = row['varset_1']
        child = row['varset_2']
        foreign_key = parent + '.id_data'

        parent_table = tables[parent]
        parent_table['columns']['id_data'] = {"sdtype": "id"}
        child_table = tables[child]
        child_table['columns']['id_data'] = {"sdtype": "id"}
        child_table['columns'][foreign_key] = {"sdtype": "id"}
        parent_table['primary_key'] = 'id_data'
        child_table['primary_key'] = 'id_data'
        child_table['foreign_key'] = foreign_key

        relationships.append({
            'parent_table_name': parent,
            'child_table_name': child,
            'parent_primary_key': 'id_data',
            'child_foreign_key': foreign_key,
        })

    metadata_json = {
        "METADATA_SPEC_VERSION": "MULTI_TABLE_V1",
        "tables": tables,
        "relationships": relationships,
    }
    return metadata_json, radio_dicos


def convert_json_to_metadata(root, calc):
    """Convert a parsed JSON export into SDV multi-table metadata.

    The JSON tree is searched for three nodes, identified by their ``id``:
    ``pages`` (form pages, one table per varset), ``dicos`` (dictionaries)
    and ``relations`` (links between varsets). The collected information is
    then assembled into the metadata dictionary.

    Args:
        root (json_dict): The raw JSON content (dict or list).
        calc (bool): True to keep calculated values (custom datasources) as
            text columns, False to leave them out.

    Returns:
        tuple: ``(metadata, dico_list)`` where ``dico_list`` holds one
        ``{'table_name', 'col', 'type', 'values'}`` entry per ``radio`` or
        ``single`` column whose dictionary was found.
    """
    # Render types whose component carries a reference to a dictionary.
    # 'radio' is included because the converter below looks up a dictionary
    # for 'radio' columns as well as 'single' ones.
    dico_render_types = ('single', 'radio', 'multiples', 'multiple')

    def convert_to_metadata_json(liste, liens, dicos, rep):
        """Assemble the SDV metadata from the parsed pieces.

        Args:
            liste (list): Table descriptions ``{'nom': ..., 'valeur': [...]}``
            liens (list): Relationships, each a ``[parent, child]`` pair of
                dicts holding a ``name``
            dicos (list): Dictionaries, each with ``id`` and
                ``attrs.value[*].code``
            rep (bool): Whether calculated values are kept

        Returns:
            json and list: The metadata and the list of column dictionaries
        """
        metadata_json = {
            "METADATA_SPEC_VERSION": "MULTI_TABLE_V1",
            "tables": {},
            "relationships": []
        }
        listo = []

        # Names of every table taking part in at least one relationship.
        linked_tables = set()
        for lien in liens:
            linked_tables.add(lien[0]['name'])
            linked_tables.add(lien[1]['name'])

        for table in liste:
            table_name = table['nom']
            # A lone table needs no link; otherwise unlinked tables are left
            # out of the metadata.
            if table_name not in linked_tables and len(liste) != 1:
                print(f"The table {table_name} is not explicitly linked.")
                continue

            columns = {}
            for column in table['valeur']:
                colonne = column['nom']
                sdtype = column['type']

                if sdtype in ('radio', 'single'):
                    # .get(): a column may carry no dictionary reference.
                    dico_id = column.get('dico')
                    values = []
                    found = False
                    for dico in dicos:
                        if dico['id'] == dico_id:
                            found = True
                            values.extend(item['code']
                                          for item in dico['attrs']['value'])
                    # Only record columns whose dictionary exists; an entry
                    # without table or column name carries no information.
                    if found:
                        listo.append({'table_name': table_name,
                                      'col': colonne,
                                      'type': sdtype,
                                      'values': values})

                mapped = conversion_map.get(sdtype, sdtype)
                if isinstance(mapped, tuple):
                    columns[colonne] = {"sdtype": mapped[0],
                                        "computer_representation": mapped[1]}
                elif isinstance(mapped, dict):
                    # Copy so the shared conversion table is never aliased.
                    columns[colonne] = dict(mapped)
                elif mapped == 'calculated' and rep == True:
                    columns[colonne] = {"sdtype": 'text'}
                elif mapped == 'calculated' and rep == False:
                    continue
                else:
                    columns[colonne] = {"sdtype": mapped}

            columns['sys_id'] = {"sdtype": "id"}
            metadata_json['tables'][table_name] = {
                "primary_key": 'sys_id',
                "columns": columns,
                "column_relationships": []
            }

        for lien in liens:
            parent = lien[0]['name']
            child = lien[1]['name']
            parent_primary_key = metadata_json['tables'][parent]['primary_key']
            # The child references its parent through "<parent>.sys_id".
            parent_key = parent + '.sys_id'
            metadata_json['tables'][child]['columns'][parent_key] \
                = {'sdtype': 'id'}
            metadata_json['relationships'].append({
                "parent_table_name": parent,
                "child_table_name": child,
                "parent_primary_key": parent_primary_key,
                "child_foreign_key": parent_key
            })

        return metadata_json, listo

    def extract_component_info(data, nam):
        """Recursively collect the variables found below a page.

        Args:
            data (json_dict): A node (dict) or list of nodes of the JSON tree
            nam (string): The name of the page/table/varset

        Returns:
            metadate (dict): ``{'nom': nam, 'valeur': [...]}`` where each
            value is ``{'nom', 'type'}`` plus ``'dico'`` for dictionary-based
            components
        """
        metadate = {'nom': nam, 'valeur': []}
        if isinstance(data, dict):
            attrs = data.get('attrs', {})
            # Nodes without a name yield no variable, but their children are
            # still visited.
            if 'name' in attrs:
                name = attrs['name']
                node_type = attrs.get('type')
                render_type = attrs.get('render-type')
                subtype = attrs.get('subtype')

                if node_type == 'component' and render_type != 'form':
                    if render_type in dico_render_types:
                        metadate['valeur'].append({'nom': name,
                                                   'type': render_type,
                                                   'dico': attrs.get('dico')})
                    elif subtype == 'boolean' and attrs.get('labelPosition'):
                        metadate['valeur'].append({'nom': name,
                                                   'type': 'boolean'})
                    else:
                        metadate['valeur'].append({'nom': name,
                                                   'type': render_type})
                elif node_type == 'datasource' and subtype == 'custom':
                    # The column name is the label, lower-cased, stripped of
                    # accents and with other characters replaced by "_".
                    label = attrs.get('label', '').lower()
                    label = unicodedata.normalize('NFKD', label) \
                        .encode('ascii', 'ignore').decode('utf-8')
                    label = re.sub(r'[^a-zA-Z0-9_]', '_', label)
                    metadate['valeur'].append({'nom': label,
                                               'type': 'calculated'})
            for child in data.get('child', []):
                metadate['valeur'].extend(
                    extract_component_info(child, nam)['valeur'])
        elif isinstance(data, list):
            for item in data:
                metadate['valeur'].extend(
                    extract_component_info(item, nam)['valeur'])
        return metadate

    def parse_pages(data):
        """Recursively find the form pages and extract their variables.

        Args:
            data (json_dict): JSON portion that contains the pages

        Returns:
            metadates (list): One ``{'nom', 'valeur'}`` entry per form page,
            named after the page's varset
        """
        metadates = []
        if isinstance(data, dict):
            attrs = data.get('attrs', {})
            is_form_page = (attrs.get('type', '') == 'component'
                            and attrs.get('subtype', '') == 'page'
                            and attrs.get('render-type', '') == 'form')
            if is_form_page:
                metadates.append(
                    extract_component_info(data, data['attrs']['varset']))
            for child in data.get('child', []):
                metadates.extend(parse_pages(child))
        elif isinstance(data, list):
            for item in data:
                metadates.extend(parse_pages(item))
        return metadates

    def parse_dicos(data):
        """Return the dictionaries of the project as a list.

        Args:
            data (json_dict): Children of the ``dicos`` node

        Returns:
            dicos (list): The dictionaries, unchanged
        """
        return list(data)

    def parse_relationship(data):
        """Return the varset pairs of every relationship.

        Args:
            data (json_dict): Children of the ``relations`` node

        Returns:
            liaisons (list): The ``attrs.varsets`` value of each relationship
        """
        return [item['attrs']['varsets'] for item in data]

    def treat_form_pages(data):
        """Walk the JSON tree and dispatch each known node to its parser.

        Args:
            data (json_dict): The decoded JSON content (dict or list)

        Returns:
            metadates, dicos, liaisons (lists): The parsed pages,
            dictionaries and relationships
        """
        metadates = []
        dicos = []
        liaisons = []
        if isinstance(data, dict):
            node_id = data.get('id')
            if node_id == 'pages':
                metadates = parse_pages(data)
            elif node_id == 'dicos':
                dicos = parse_dicos(data['child'])
            elif node_id == 'relations':
                liaisons = parse_relationship(data['child'])
            else:
                children = data.get('child', [])
        elif isinstance(data, list):
            children = data
        else:
            children = []

        if not (isinstance(data, dict)
                and data.get('id') in ('pages', 'dicos', 'relations')):
            for child in children:
                sub_pages, sub_dicos, sub_links = treat_form_pages(child)
                metadates.extend(sub_pages)
                dicos.extend(sub_dicos)
                liaisons.extend(sub_links)
        return metadates, dicos, liaisons

    def merge_dicts_with_same_table_name(metadates):
        """Merge the pages that belong to the same varset into one table.

        Args:
            metadates (list): The parsed pages

        Returns:
            merged_tables (list): One entry per table name, in first-seen
            order, holding the concatenated values of its pages
        """
        merged = {}
        for metadate in metadates:
            # New lists are built so the parsed pages are left untouched.
            merged.setdefault(metadate['nom'], []).extend(metadate['valeur'])
        return [{'nom': table_name, 'valeur': values}
                for table_name, values in merged.items()]

    metadates, dicos, liaisons = treat_form_pages(root)
    merged_metadates = merge_dicts_with_same_table_name(metadates)
    return convert_to_metadata_json(merged_metadates, liaisons, dicos, calc)


def create_folder(folder_name):
    """Prepare the output folder ``./data/<folder_name>``.

    The folder is created when missing (an existing one is reused) and the
    ``./.mod.py`` file of the current directory is copied into it.

    Args:
        folder_name (string): The name of the new folder.
    """
    target_dir = './data/' + folder_name
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy('./.mod.py', target_dir)


def create_notebook(folder_name, output, met):
    """Write the metadata file and the first (visualisation) notebook.

    The working directory is changed to ``./data/<folder_name>`` and is NOT
    restored, so every relative path used afterwards by the process resolves
    inside that folder. Two files are written there: ``<output>.json`` (the
    metadata) and ``0-Visualisation_et_Changements.ipynb``.

    Args:
        folder_name (str): The name of the folder (under ``./data/``) where
            the files are written
        output (str): Base name of the metadata file, without ``.json``
        met (dict): The JSON-serialisable metadata to save
    """
    f_n = './data/' + folder_name
    os.chdir(f_n)

    output_file = output + ".json"

    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(met, f, indent=4)

    nb = nbf.v4.new_notebook()


    text = """# Premiere visualisation
Vous avez la possibilite de visualiser le metadata en choisissant dabord le \
bon kernel"""
    nb.cells.append(nbf.v4.new_markdown_cell(text))

    code = f"""from sdv.metadata import MultiTableMetadata
    \nimport pandas as pd
    \nmetadata = MultiTableMetadata.load_from_json('./{output}.json')
    \nmetadata.visualize()"""
    nb.cells.append(nbf.v4.new_code_cell(code))

    text1 = """# Generation des donnees sans modifications au Metadata
Apres la conversion, le Metadata est pret a l'emploi cependant il est \
possible de le personnaliser.
Importons rapidement un synthetiseur pour generer des donnees. \
Vous apprendrez par la suite comment utiliser ce synthetiseur"""
    nb.cells.append(nbf.v4.new_markdown_cell(text1))

    code1 = f"""from sdv_enterprise.sdv.multi_table.dayz.day_zero import \
        DayZSynthesizer
    \nsynthesizer = DayZSynthesizer(metadata)
    \nsynthetic_data = synthesizer.sample()"""
    nb.cells.append(nbf.v4.new_code_cell(code1))

    text2 = """Allons voir dans une table les donnees generees."""
    nb.cells.append(nbf.v4.new_markdown_cell(text2))

    code_1 = """synthetic_data['nom_dune_table'].head() # N'oubliez pas de \
    changer le nom par un nom d'une de vos tables"""
    nb.cells.append(nbf.v4.new_code_cell(code_1))


    text3 = """# Vous avez la possibilite de modifier les types"""
    nb.cells.append(nbf.v4.new_markdown_cell(text3))

    phtm_code = """# Par exemple
# metadata.update_column(
#   table_name = 'patient',
#   column_name = 'nom',
#   sdtype = 'last_name')
#
# Dans le cas ou vous devez remplacer plusieurs colonnes dans un meme tableau \
vous avez la possibilite d'utiliser cette commande
#
# metadata.update_columns_metadata(
#    table_name='users',
#    column_metadata={
#        'age': { 'sdtype': 'numerical' },
#        'ssn': { 'sdtype': 'ssn', 'pii': True },
#        'gender': { 'sdtype': 'categorical' },
#        'dob': { 'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d' },
#        ...
#    }
#)"""
    nb.cells.append(nbf.v4.new_code_cell(phtm_code))

    text3_5 = """Pour modifier correctement le sdtype suivez \
[ce lien](https://epiconcept.gitbook.io/synthetic-data-vault-sdv/multi-tables/preparation-des-donnees/api-multi-table-metadata#update-api)

    
Pour plus d'informations sur les sdtypes voir \
[ce lien](https://epiconcept.gitbook.io/synthetic-data-vault-sdv/reference/metadata-spec/sdtypes)"""
    nb.cells.append(nbf.v4.new_markdown_cell(text3_5))

    text4 = """# Visualisation et Export
Maintenant nous allons pouvoir exporter le nouveau metadata pour la \
generation de donnees. N'oubliez pas de donner un nom au nouveau fichier."""   
    nb.cells.append(nbf.v4.new_markdown_cell(text4))

    code2_0 = """metadata.visualize()"""
    nb.cells.append(nbf.v4.new_code_cell(code2_0))

    code2 = f"""metadata.save_to_json('./{output}_v2.json')"""
    nb.cells.append(nbf.v4.new_code_cell(code2))

    text4 = """# Generation des donnees rapide
Apres la petite personnalisation, generons un echantillon a partir du \
nouveau metadata.
Reutilisons le meme synthetiseur."""
    nb.cells.append(nbf.v4.new_markdown_cell(text4)) 

    code3 = f"""synthesizer = DayZSynthesizer(metadata)
    \nsynthetic_data = synthesizer.sample()"""
    nb.cells.append(nbf.v4.new_code_cell(code3))

    code4 = """synthetic_data['nom_dune_table'].head() # N'oubliez pas \
de changer le nom par un nom d'une de vos tables"""
    nb.cells.append(nbf.v4.new_code_cell(code4))

    text5 = """Pour generer des meilleures donnees, nous vous invitons \
de continuer vers le Notebook numero 1."""
    nb.cells.append(nbf.v4.new_markdown_cell(text5)) 

    notebook_filename = "0-Visualisation_et_Changements.ipynb"
    # Notebooks are JSON that may contain non-ASCII text (e.g. coming from
    # ``output``): write them as UTF-8 whatever the platform default is.
    with open(notebook_filename, 'w', encoding='utf-8') as f:
        nbf.write(nb, f)
    
    print(f"Metadata converted, Notebook created, your folder {folder_name} \
is ready.")


def create_from_scratch(output, dico_list, sep):
    """Create the "from scratch" generation notebook.

    Writes ``1-DayZ_Generation.ipynb`` in the current working directory. The
    notebook loads ``<output>_v2.json``, applies the parsed dictionaries
    through ``map_category_values`` (imported from ``mod``) and exports the
    generated tables as CSV using ``sep`` as separator. The ``.mod.py`` file
    of the current directory is renamed to ``mod.py`` so the notebook can
    import it.

    Args:
        output (str): Base name of the metadata file; ``_v2`` is appended to
            follow the first notebook
        dico_list (list): The parsed dictionaries; its ``repr`` is embedded
            in a code cell of the notebook
        sep (str): The separator used for the exported CSV files
    """
    # os.replace overwrites a "mod.py" left by a previous run on every
    # platform, whereas os.rename fails on Windows when the target exists.
    os.replace('.mod.py', 'mod.py')
    nb = nbf.v4.new_notebook()
    
    text = """# Generation de donnees synthetique a partir du Metadata seul
Si vous lancez les cases suivantes, il vous sera possible de generer des \
donnees synthetiques seulement a l'aide de votre metadata. Vous n'avez pas \
besoin d'importer les dictionnaires, cela est fait automatiquement pour vous. \
Il vous suffit juste de verifier si vous avez charge le bon metadata et de \
lancer toutes les cases.
## Visualisation
Verifiez si votre Metadata est correct"""
    nb.cells.append(nbf.v4.new_markdown_cell(text))

    code = f"""from sdv.metadata import MultiTableMetadata
    \nimport pandas as pd
    \nmetadata = MultiTableMetadata.load_from_json('./{output}_v2.json')
    \nmetadata.visualize()"""
    nb.cells.append(nbf.v4.new_code_cell(code))


    text1 = """## Generation des donnees avec les dictionnaires
Apres l'import du nouveau Metadata. Importons rapidement un synthetiseur \
pour generer des donnees. Nous avons recupere les dictionnaires de votre \
metadata et les avons implementes dans ce notebook. Il vous suffit de \
tout lancer pour ajouter ces dictionnaires."""
    nb.cells.append(nbf.v4.new_markdown_cell(text1))

    code1 = f"""from sdv_enterprise.sdv.multi_table.dayz.day_zero import \
DayZSynthesizer
    \nsynthesizer = DayZSynthesizer(metadata)"""
    nb.cells.append(nbf.v4.new_code_cell(code1))

    code0 = """from mod import map_category_values"""
    nb.cells.append(nbf.v4.new_code_cell(code0))

    code = f"""dico = {dico_list}

map_category_values(synthesizer, dico)"""
    nb.cells.append(nbf.v4.new_code_cell(code))

    code1_1 = """# synthesizer.get_parameters()
# decommentez la fonction au dessus pour voir tous les dictionnaires \
ajoute a votre synthetiseur"""
    nb.cells.append(nbf.v4.new_code_cell(code1_1))

    text2 = """# Generons vos donnees
Lancez la commande suivante pour generer vos donnees. N'oubliez \
pas que vous pouvez changer le nombre de ligne pour toutes les tables en \
changeant le nombre en bas."""
    nb.cells.append(nbf.v4.new_markdown_cell(text2))

    code2 = f"""synthetic_data = synthesizer.sample(num_rows = 1000)"""
    nb.cells.append(nbf.v4.new_code_cell(code2))

    code2_1 = """synthetic_data['nom_de_table'] # Petite visualisation"""
    nb.cells.append(nbf.v4.new_code_cell(code2_1))

    text3 = """# Enregistrer les donnees synthetiques
Lancez la fonction suivante afin d'obtenir un dossier zip avec toutes les \
tables a l'interieur. Le dossier sera sous le nom de **synthetic_data.zip**"""
    nb.cells.append(nbf.v4.new_markdown_cell(text3))

    code3 = f"""import zipfile
from io import BytesIO
zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, 'a', zipfile.ZIP_DEFLATED, False) as zip_file:
    for filename, df in synthetic_data.items():
        csv_buffer = BytesIO()
        df.to_csv(csv_buffer, index=False, sep = '{sep}')
        csv_buffer.seek(0)
        
        filename_csv = filename + ".csv"
        zip_file.writestr(filename_csv, csv_buffer.getvalue())

with open('synthetic_data.zip', 'wb') as f:
    f.write(zip_buffer.getvalue())
"""
    nb.cells.append(nbf.v4.new_code_cell(code3))
    
    notebook_filename = "1-DayZ_Generation.ipynb"
    # The embedded dictionaries may hold non-ASCII codes: write the notebook
    # as UTF-8 instead of relying on the platform default encoding.
    with open(notebook_filename, 'w', encoding='utf-8') as f:
        nbf.write(nb, f)


def create_with_data(output, data_path, sep):
    """Create the "learning from real data" generation notebook.

    Writes ``1-Learning_Generation.ipynb`` in the current working directory.
    The notebook loads ``<output>_v2.json``, loads the CSV files found in
    ``./data/<data_path>``, fits an ``HSASynthesizer`` on them and exports
    the generated tables as CSV using ``sep`` as separator.

    Args:
        output (str): Base name of the metadata file; ``_v2`` is appended to
            follow the first notebook
        data_path (str): Name of the data folder, appended to ``./data/``
        sep (str): The separator used for the exported CSV files
    """    
    nb = nbf.v4.new_notebook()
    
    text = """# Generation de donnees synthetique a partir du Metadata et de \
    donnees existantes (apprentissage)
Si vous lancez les cases suivantes, il vous sera possible de generer des \
donnees synthetiques a partir de vos donnees reelles. Vous n'avez pas besoin \
d'importer les dictionnaires, les valeurs des colonnes categoriques seront \
detectees automatiquement. Il vous suffit juste de verifier si vous avez \
charge le bon metadata et d'avoir charge les bonnes donnees et enfin, de \
lancer toutes les cases. 

***Attention! Le liens entre les tables sont faites de maniere automatique \
par le convertisseur. Si vos tables possedent deja une logique et des \
liens, vous avez encore la possibilite de modifier les relations \
entre les tables pour que l'outil fonctionne correctement. Faites les \
modifications avant l'import du modele. Sinon vous n'arriverez pas a \
synthetiser les donnees a partir du modele d'apprentissage et il faudra \
passer par la generation "from scratch"*** 

## Visualisation
Verifiez si votre Metadata est correct"""
    nb.cells.append(nbf.v4.new_markdown_cell(text))

    code = f"""from sdv.metadata import MultiTableMetadata
    \nimport pandas as pd
    \nmetadata = MultiTableMetadata.load_from_json('./{output}_v2.json')
    \nmetadata.visualize()"""
    nb.cells.append(nbf.v4.new_code_cell(code))

    text2 = """# Import de vos donnees
Lancez la case suivante afin de charger vos donnees reelles pour \
l'apprentissage du modele. Il faut noter que le modele va repliquer toutes \
les subtilites statistiques de vos donnees (repartitions, distributions, \
correlations...) mais ne va pas attribuer de logique reelles. Ainsi, il \
n'oubliez pas que vous avez la possibilite d'ajouter des \
[contraintes](https://epiconcept.gitbook.io/synthetic-data-vault-sdv/multi-tables/modelisation/personnalisations) \
pour rendre vos donnees plus realistes. La documentation pandas peut vous \
servir pour l'import des tables de donnees CSV en cas de choses differentes. \
La variable 'datasets' joue un role de dictionnaires dans lequel vous pouvez \
chercher vos tables avec le nom de la table en cle. Si vos CSV suivent un \
format specifique ou possedent des caracteristiques particulieres vous pouvez \
allez voir les \
[parametres d'import](https://epiconcept.gitbook.io/synthetic-data-vault-sdv/multi-tables/preparation-des-donnees/charger-les-donnees#load_csv)."""
    nb.cells.append(nbf.v4.new_markdown_cell(text2))

    # NOTE(review): this path is embedded as-is in the notebook, so it is
    # resolved from wherever the notebook is run - confirm the layout.
    path_to_data = './data/' + data_path

    code2 = f"""from sdv.datasets.local import load_csvs
\ndatasets = load_csvs(
    folder_name = '{path_to_data}')"""
    nb.cells.append(nbf.v4.new_code_cell(code2))

    code3 = """datasets['nom_de_table'].head() # Visualisez \
    l'une de vos tables grace a cette commande. Il vous suffit d'ajouter le \
nom d'une des tables dans vos jeu de donnees."""
    nb.cells.append(nbf.v4.new_code_cell(code3))

    text3 = """# Import du modele et apprentissage
Vous allez ici faire apprendre au modele les caracteristiques statistiques \
de vos donnees reelles. Lancez les codes suivants afin de faire apprendre \
votre modele. Nous avons ici le modele le plus performant qui va travailler \
sur les relations intra-table mais aussi inter-tables. Le modele est le \
[HSASynthesizer](https://epiconcept.gitbook.io/synthetic-data-vault-sdv/multi-tables/modelisation/synthetiseurs/hsa-synthesizer) \
vous pouvez essayer d'autres synthetiseur \
[ici](https://epiconcept.gitbook.io/synthetic-data-vault-sdv/multi-tables/modelisation/synthetiseurs)"""
    nb.cells.append(nbf.v4.new_markdown_cell(text3))

    code4 = """from sdv_enterprise.sdv.multi_table.hsa import HSASynthesizer
synthesizer = HSASynthesizer(metadata) # Nous precisons la structures \
des donnees."""
    nb.cells.append(nbf.v4.new_code_cell(code4))

    code5 = """synthesizer.fit(datasets) # L'apprentissage se fait a cette \
    etape. Le temps d'apprentissage peut etre different selon la taille de \
        vos donnees."""
    nb.cells.append(nbf.v4.new_code_cell(code5))

    text4 = """# Generation des donnees synthetiques
Si vous avez deja genere des donnees synthetiques, le principe reste le meme. \
Ici nous allons defenir comme au dessus une sorte de bibliotheque qui va \
contenir toutes nos donnees et on va pouvoir generer cette fois une echelle. \
L'echelle est le nombre de fois que le modele va multiplier la quantite de \
donnee initiale. Si vous voulez 2 fois la quantite de base de vos donnees \
il vous suffit de mettre '2'."""
    nb.cells.append(nbf.v4.new_markdown_cell(text4))

    code6 = """synthetic_data = synthesizer.sample(scale=1) # Ici nous \
    voulons que la quantite generee soit identique aux donnees reelles."""
    nb.cells.append(nbf.v4.new_code_cell(code6))

    code6_1 = """synthetic_data['nom_de_table'] # Petite visualisation"""
    nb.cells.append(nbf.v4.new_code_cell(code6_1))

    text5 = """# Enregistrer les donnees synthetiques
Lancez la fonction suivante afin d'obtenir un dossier zip avec toutes les \
tables a l'interieur. Le dossier sera sous le nom de **synthetic_data.zip**"""
    nb.cells.append(nbf.v4.new_markdown_cell(text5))

    code7 = f"""import zipfile
from io import BytesIO
zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, 'a', zipfile.ZIP_DEFLATED, False) as zip_file:
    for filename, df in synthetic_data.items():
        csv_buffer = BytesIO()
        df.to_csv(csv_buffer, index=False, sep = '{sep}')
        csv_buffer.seek(0)
        
        filename_csv = filename + ".csv"
        zip_file.writestr(filename_csv, csv_buffer.getvalue())

with open('synthetic_data.zip', 'wb') as f:
    f.write(zip_buffer.getvalue())
"""
    nb.cells.append(nbf.v4.new_code_cell(code7))


    notebook_filename = "1-Learning_Generation.ipynb"
    # The data folder name is embedded in the notebook and may be non-ASCII:
    # write as UTF-8 instead of relying on the platform default encoding.
    with open(notebook_filename, 'w', encoding='utf-8') as f:
        nbf.write(nb, f)


def _create_generation_notebook(config, scratch, output_file, dico_list,
                                separator):
    """Create the data-generation notebook selected by ``scratch_mode``."""
    # "==" (not "is") is kept on purpose so YAML values comparing equal to a
    # boolean keep being accepted.
    if scratch == True:
        create_from_scratch(output_file, dico_list, separator)
        print('DayZSynthesizer notebook created.')
    elif scratch == False:
        # Test for the key BEFORE reading it, otherwise a missing entry
        # raises KeyError and the help message below can never be shown.
        if 'data_folder_name' in config:
            create_with_data(output_file, config['data_folder_name'],
                             separator)
            print('HSASynthesizer notebook created.')
        else:
            print('Please input your data folder name.')
    else:
        print("Input Scratch answer. Or you won't get any data \
generetion notebook.")


def main(input_file):
    """Run the whole conversion described by a YAML configuration file.

    Configuration keys read here:
        output_metadata_name (required): base name of the metadata file
        generation_folder_name (required): folder created under ``./data/``
        input_folder_path or input_json_file_path: the source, relative to
            ``./data/`` (the CSV folder takes precedence when both are set)
        scratch_mode: True for the "from scratch" notebook, False for the
            "learning" notebook; anything else only prints a reminder
        data_folder_name: data folder, needed when scratch_mode is False
        separator (optional): CSV separator of the exports, default ","
        calculated (optional): keep calculated values (JSON source only),
            default False

    Args:
        input_file (.yml): Path of the YAML configuration file.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    output_file = config['output_metadata_name']
    folder_name = config['generation_folder_name']
    # A missing scratch_mode is reported by the reminder message instead of
    # stopping the run with a KeyError.
    scratch = config.get('scratch_mode')
    separator = config.get('separator', ',')

    if 'calculated' in config:
        calc = config['calculated']
        print(f"Your calculated values are taken in account. You precised : \
{calc}.")
    else:
        print("All your calculated values are by default not taken account.",
              "If otherwise please specify it in the configuration file.")
        calc = False

    if 'input_folder_path' in config:
        f_p = './data/' + config['input_folder_path']
        metadata_json, dico_list = import_csv_folder(f_p)
    elif 'input_json_file_path' in config:
        j_p = './data/' + config['input_json_file_path']
        metadata_json, dico_list = import_json_file(j_p, calc)
    else:
        print("Invalid input file format.")
        return

    create_folder(folder_name)
    create_notebook(folder_name, output_file, metadata_json)
    _create_generation_notebook(config, scratch, output_file, dico_list,
                                separator)


# Script entry point: exactly one argument (the YAML configuration file) is
# expected; otherwise the usage line is shown.
if __name__ == "__main__":
    if len(sys.argv) == 2:
        input_yaml_file = sys.argv[1]
        main(input_yaml_file)
    else:
        print("Usage: python app.py /app/data/<input_yaml_file>")
