# DataSynthesis is a framework that generates data files from an input spec. It produces realistic-looking but fake data, and referential integrity is maintained across multiple files.

In [1]:
import pandas as pd
import seaborn as sns
import yaml
from yaml.loader import SafeLoader
import random
import import_ipynb
from FakeWrapper import FakeWrapper
from Utils import Utils
from Lookup import Lookup
import os
from datetime import date

importing Jupyter notebook from FakeWrapper.ipynb
importing Jupyter notebook from Utils.ipynb
importing Jupyter notebook from Lookup.ipynb


In [2]:
class Synthesis():
    """Build one synthetic data file from a YAML column specification.

    The spec is read from ``{configdir}{opbasefilename}{specextension}``.
    Its top-level key ``opbasefilename`` must hold a ``columnnames`` list;
    each list item maps a column header to a settings mapping from which
    the keys ``lookup``, ``matchon``, ``matchby``, ``fakerprovider``,
    ``fakerarguments`` and ``candidatekey`` are read.

    Columns are filled in spec order, so a column that matches on another
    column (``matchby``) must come after that column in the spec.
    """

    def __init__(self,opbasefilename,
                 numrows=1,colsep=',',
                 opextension='.dat',
                 specextension='.yaml',
                 filedate=None,
                 datadir = './data/',
                 configdir = './config/'):
        """Store the generation settings and create the helper objects.

        Args:
            opbasefilename: Base name of the output file; also the spec file
                stem and the top-level key looked up inside the spec.
            numrows: Number of rows generated for non-matched columns.
            colsep: Column separator. Stored on the instance; the methods of
                this class do not read it.
            opextension: Extension handed to ``Utils.rename_with_date`` and
                to ``Lookup``.
            specextension: Extension appended to ``opbasefilename`` to form
                the spec file name.
            filedate: Date stamp string. ``None`` (the default) means
                "today, formatted as YYYYMMDD", resolved when the instance
                is created rather than when the class is defined.
            datadir: Directory prefix for output and lookup files.
            configdir: Directory prefix for the spec file.
        """
        self.opbasefilename = opbasefilename
        self.specfilename = opbasefilename + specextension
        self.numrows = numrows
        self.colsep = colsep
        self.opfilenameext = opextension
        self.filedate = self._resolve_filedate(filedate)
        self.fhandle = Utils()
        self.faker = FakeWrapper()
        self.datadir = datadir
        self.configdir = configdir

    @staticmethod
    def _resolve_filedate(filedate):
        """Return ``filedate``, or today's date as YYYYMMDD when it is None."""
        if filedate is None:
            return date.today().strftime('%Y%m%d')
        return filedate

    def _read_spec(self):
        """Load the YAML spec and release the handle it was read from."""
        fopen = self.fhandle.openfile(f'{self.configdir}{self.specfilename}',permission='r+')
        try:
            return self.fhandle.load_yaml(fopen)
        finally:
            # NOTE(review): openfile() lives in Utils and is not visible here;
            # it presumably returns a file object. Close it only if it
            # actually exposes close(), so other return types keep working.
            closer = getattr(fopen, 'close', None)
            if callable(closer):
                closer()

    def _fill_lookup_column(self, df, header, values):
        """Fill ``df[header]`` from a previously generated lookup file."""
        matchon = values['matchon']
        self.lkey = Lookup(lookup=values['lookup'],
                           matchon='.' if matchon is None else matchon,
                           filedate=self.filedate,
                           extension=self.opfilenameext,
                           datadir=self.datadir)

        if matchon is None:
            # No match column: draw an independent foreign value per row.
            self.lkey.load_series()
            df[header] = [self.lkey.random_foreign_value() for _ in range(self.numrows)]
            return

        self.lkey.load_dataframe()
        # When matchby is not given, fall back to the part of matchon after
        # the first '.' as the local column to match by.
        matchby = values['matchby']
        if matchby is None:
            matchby = matchon.split('.')[1]
        df[header] = self.lkey.matched_series(df[matchby].to_frame())

        if values['fakerprovider'] == 'price_in_range':
            # Replace each matched price with a fake derived from it.
            df[header] = df.apply(
                lambda row: self.faker.getfake(values['fakerprovider'],
                                               {'input_price': row[header]}),
                axis=1)

    def _fake_column(self, values):
        """Return ``numrows`` fake values for a column without a lookup."""
        # '== True' is kept on purpose: only an explicit true value selects
        # the unique generator, exactly as before.
        if values['candidatekey'] == True:
            make_value = self.faker.getuniquefake
        else:
            make_value = self.faker.getfake
        provider = values['fakerprovider']
        arguments = values['fakerarguments']
        return [make_value(provider, arguments) for _ in range(self.numrows)]

    def synthesis(self):
        """Generate every column in the spec, write the file, return the frame.

        Returns:
            The populated ``pandas.DataFrame``. It is also handed to
            ``Utils.df_to_csv`` and ``Utils.rename_with_date`` with the path
            ``{datadir}{opbasefilename}``.
        """
        specread = self._read_spec()
        df = pd.DataFrame()

        for columnnames in specread[self.opbasefilename]['columnnames']:
            for header, values in columnnames.items():
                if values['lookup'] is not None:
                    self._fill_lookup_column(df, header, values)
                else:
                    df[header] = self._fake_column(values)

        target = f'{self.datadir}{self.opbasefilename}'
        self.fhandle.df_to_csv(df, target)
        self.fhandle.rename_with_date(target, self.opfilenameext, self.filedate)
        return df
    


In [3]:
if __name__ == "__main__":
    # Drive generation from ./config/Sequence.yaml. Each top-level value is a
    # list of mappings {file name: {'numrows': ..., 'extension': ...}}.
    # Files are generated in the order they appear there, presumably so that
    # lookup sources exist before the files that reference them.
    seq_utils = Utils()
    seq_loader = seq_utils.openfile('./config/Sequence.yaml')
    seq_stream = seq_utils.load_yaml(seq_loader)

    for value in seq_stream.values():
        for list_of_files in value:
            for file_to_synthesize, properties in list_of_files.items():
                print(file_to_synthesize)
                print(properties)
                # Pass by keyword: the third positional parameter of
                # Synthesis is colsep, so a positional extension would be
                # taken as the column separator and ignored as extension.
                syn = Synthesis(file_to_synthesize,
                                numrows=properties['numrows'],
                                opextension=properties['extension'])
                # The returned frame is not needed here; leaving it unbound
                # lets it be freed before the next file is generated.
                syn.synthesis()

Actimize03_RefPrice
{'numrows': 100000, 'extension': '.dat'}
Actimize01_Orders
{'numrows': 10000, 'extension': '.dat'}
Actimize02_Executions
{'numrows': 500, 'extension': '.dat'}
