The purpose of this notebook is to store scripts for generating random tabular data in the following formats:


- single view data
- multi view (folder of csv files)
- multi view data (all contained in a single csv)

In [1]:
# 1 create random data


import pandas as pd
import numpy as np
import os

from typing import Iterator, Union, List, Dict, Tuple

In [163]:
class TabularDataGenerator:
    """Builds random tabular datasets column by column, optionally split into views.

    Columns are appended to a working dataframe through the ``add_*`` methods.
    ``set_view`` stores the working dataframe under a view name and starts a new,
    empty one. The stored views can then be exported as a dict of dataframes, as
    a single multi-index dataframe, or as a folder holding one file per view.
    """

    def __init__(self, n_samples:int, 
                 feature_names:Iterator[str]=None,
                 is_multi_view:bool=False,
                 as_multi_index:bool=False):
        """
        Args:
            n_samples: number of rows of every generated column.
            feature_names: currently unused (kept for backward compatibility).
            is_multi_view: currently unused (kept for backward compatibility).
            as_multi_index: currently unused (kept for backward compatibility).
        """
        self._array = None  # working dataframe of the view being built (None while empty)

        self._n_samples = n_samples
        self._views = {}  # view name -> dataframe
        self._features_names = []  # explicit column names added to the working dataframe
        self._primary_key = None  # either None, a pandas Series or a one-column DataFrame
        # all characters available for generating string primary keys (eg name_surname)
        self._available_char = 'qwertyuiopasdfghjklzxcvbnm '
        self._shuffle_primary_key = False

    def add_primary_key(self, col_name: str,
                        dtype:str,
                        is_shuffled: bool=False,
                        char_len:int=20):
        """Appends a column of unique values and registers it as the primary key.

        Args:
            col_name: name of the key column (None keeps the default integer label).
            dtype: 'int' for the integers 0..n_samples-1, 'str' for random strings
                of `char_len` characters.
            is_shuffled: whether to shuffle the generated keys.
            char_len: length of each generated string key (only used for 'str').

        Raises:
            ValueError: if `dtype` is neither 'int' nor 'str'.
            AssertionError: if fewer than n_samples distinct strings of length
                `char_len` exist.
        """
        if dtype == 'int':
            # primary key is an index
            _idx = np.arange(self._n_samples)
        elif dtype == 'str':
            # primary key is a string (ie patient name)
            n_char = len(self._available_char)

            # n_char ** char_len distinct strings exist; with fewer than n_samples
            # of them the rejection loop below could never terminate
            assert n_char ** char_len >= self._n_samples, \
                'n_char ** char_len is supposed to be at least n_samples'

            _seen = set()  # constant-time uniqueness check
            _idx = []
            while len(_idx) < self._n_samples:
                _order = np.random.randint(0, n_char, size=(char_len,))
                _new_char_id = ''.join(self._available_char[x] for x in _order)
                if _new_char_id not in _seen:
                    _seen.add(_new_char_id)
                    _idx.append(_new_char_id)
        else:
            raise ValueError(f"dtype: {dtype} is not valid")
        if is_shuffled:
            np.random.shuffle(_idx)
        _idx = pd.DataFrame(_idx)
        if col_name is not None:
            _idx.columns = [col_name]
        self._concatenate(_idx)
        # keep a private copy: the frame passed to `_concatenate` may become the
        # working dataframe itself and be modified later
        self._primary_key = _idx.copy()

    def set_primary_key(self, col_name: Union[str, int]=None):
        """Registers an existing column of the working dataframe as the primary key.

        Args:
            col_name: column name (str) or column position (int).

        Raises:
            ValueError: if no data has been added yet.
            TypeError: if `col_name` is neither a str nor an int.
            AssertionError: if the column values are not unique.
        """
        if self._array is None:
            raise ValueError("please add data before setting a primary key")
        col_indx = self._get_index(col_name)
        _primary_key = self._array.iloc[:, col_indx]
        assert _primary_key.nunique(dropna=False) == self._n_samples, \
            'value in primary key must be unique: aborting'

        # copy so the key survives later changes of the working dataframe
        self._primary_key = _primary_key.copy()

    def shuffle_primary_key(self):
        """Requests that primary key values be shuffled when views are exported."""
        self._shuffle_primary_key = True

    def set_view(self, view_name:str):
        """Stores the working dataframe as view `view_name` and starts a new one.

        Raises:
            ValueError: if no column has been added since the last view.
        """
        if self._array is None:
            raise ValueError(f"cannot set view {view_name}: please add data first")
        self._views[view_name] = self.get_single_view_dataframe()
        self._features_names = []
        self._array = None

    def get_single_view_dataframe(self, view_name: Union[str, int]=None):
        """Returns the dataframe of view `view_name`, or the working dataframe if None."""
        print(f'Dataframe for view {view_name}')
        # explicit None test so that falsy view names (0, '') are still looked up
        if view_name is not None:
            _df = self._views[view_name]
        else:
            _df = self._array
        return _df

    def add_integers_values(self, n_col: int=1,
                            col_name:Union[str, Iterator[str]] =None,
                            l_bound:int=0,
                           u_bound:int=100,
                           is_unique=False):
        """Appends `n_col` integer columns.

        Args:
            n_col: number of columns to add.
            col_name: one name, or one name per column (None keeps default labels).
            l_bound: lower bound (inclusive) of the random integers.
            u_bound: upper bound (exclusive) of the random integers.
            is_unique: if True every column holds 0..n_samples-1 instead of random
                draws (the bounds are then ignored).

        Raises:
            ValueError: if the number of names differs from `n_col`.
        """
        _names = self._resolve_col_names(n_col, col_name)
        if is_unique:
            # one copy of 0..n_samples-1 per requested column
            _rand_int = np.tile(np.arange(self._n_samples).reshape(-1, 1), (1, n_col))
        else:
            _rand_int = np.random.randint(l_bound, u_bound, size=(self._n_samples, n_col))
        self._add_columns(_rand_int, _names)

    def add_float_values(self, n_col:int=1, col_name:str=None):
        """Appends `n_col` columns of floats drawn with `np.random.random`.

        Raises:
            ValueError: if the number of names differs from `n_col`.
        """
        _names = self._resolve_col_names(n_col, col_name)
        _rand = np.random.random((self._n_samples, n_col))
        self._add_columns(_rand, _names)

    def add_discrete_values(self, n_col:int=1,
                            col_name:Union[str, Iterator[str]] =None,
                            l_bound:int=0,
                           u_bound:int=100):
        """Appends `n_col` columns of integer values stored as floats.

        Raises:
            ValueError: if the number of names differs from `n_col`.
        """
        _names = self._resolve_col_names(n_col, col_name)
        _rand_int = np.random.randint(l_bound, u_bound, size=(self._n_samples, n_col))
        _rand_int = np.array(_rand_int, dtype=np.float64) # changing type from int to float
        self._add_columns(_rand_int, _names)

    def add_datetime_values(self,  start=None, end=None, col_name:str=None, freq='H', **kwargs):
        """Appends one datetime column of n_samples timestamps.

        Args:
            start: left bound of the range.
            end: right bound of the range.
            col_name: name of the column (None keeps the default label).
            freq: step between timestamps; ignored when both `start` and `end`
                are given (the timestamps are then evenly spaced between them).
            **kwargs: forwarded to `pd.date_range`.
        """
        # NOTE(review): recent pandas releases deprecate the uppercase 'H' alias in
        # favour of 'h' - confirm against the pandas version in use
        _names = self._resolve_col_names(1, col_name)
        if start is not None and end is not None:
            # start, end and periods already define the range; pd.date_range
            # refuses a fourth constraint
            freq = None
        _dti = pd.date_range(start, end, periods=self._n_samples, freq=freq,**kwargs)
        self._add_columns(_dti, _names)

    def add_missing_samples_to_column(self, col_name:Union[str, int], n_missing_points:int=1):
        """Replaces `n_missing_points` randomly chosen entries of a column by NaN.

        All other entries keep their value and their row. Integer columns are
        converted to float and boolean columns to object so they can hold NaN.

        Args:
            col_name: column name (str) or column position (int).
            n_missing_points: number of entries to blank out.

        Raises:
            ValueError: if there is no data or `n_missing_points` >= n_samples.
        """
        if self._array is None:
            raise ValueError("please add data before completing it with missig data")
        if n_missing_points >= self._n_samples:
            raise ValueError(f"too much missing points (n_missing_points {n_missing_points} < n_samples {self._n_samples})")
        col_indx = self._get_index(col_name)
        _column = self._array.iloc[:, col_indx].copy()
        if pd.api.types.is_bool_dtype(_column.dtype):
            _column = _column.astype(object)
        elif pd.api.types.is_integer_dtype(_column.dtype):
            _column = _column.astype(np.float64)
        _rows = np.random.choice(self._n_samples, size=n_missing_points, replace=False)
        _column.iloc[_rows] = np.nan

        # rebuild the frame position by position: this supports duplicated column
        # names and a column whose dtype has just changed
        _columns = [self._array.iloc[:, i] for i in range(self._array.shape[1])]
        _columns[col_indx] = _column
        _rebuilt = pd.concat(_columns, axis=1)
        _rebuilt.columns = self._array.columns
        self._array = _rebuilt

    def add_boolean_values(self, n_col:int=1, col_name: Union[str, Iterator[str]]=None):
        """Appends `n_col` columns of random booleans.

        Raises:
            ValueError: if the number of names differs from `n_col`.
        """
        _names = self._resolve_col_names(n_col, col_name)
        _rand_bool = np.random.randint(0,2,size=(self._n_samples, n_col), dtype=bool)
        self._add_columns(_rand_bool, _names)

    def add_character_values(self,
                             n_col:int=1,
                             col_name: Union[str, Iterator[str]]="Gender",
                            char_values: List[str]=['MAN', 'WOMAN']):
        """Appends categorical columns whose entries are drawn from `char_values`.

        One column is created per entry of `col_name` (a single string counts as
        one entry; None creates `n_col` columns labelled 0..n_col-1).

        Args:
            n_col: number of columns of random draws generated.
            col_name: one name or an iterable of names.
            char_values: values to draw from (never modified here).
        """
        n_values = len(char_values)
        _rand_values = np.random.randint(0,n_values,size=(self._n_samples, n_col))

        if col_name is None:
            col_name = list(range(n_col))
        if isinstance(col_name, str):
            col_name = [col_name]  # convert into a list so `col_name` can be iterable

        _rand_string = {
            col: [char_values[x] for x in _rand_values[:, i]]
            for i, col in enumerate(col_name)
        }
        _rand_string = pd.DataFrame(_rand_string)
        self._concatenate(_rand_string)
        self._features_names.extend(_rand_string.columns.tolist())

    def shuffle_columns(self):
        """Placeholder: not implemented, does nothing."""
        pass

    def shuffle_samples(self):
        """Shuffles the rows of the working dataframe (row labels are kept).

        Raises:
            ValueError: if no data has been added yet.
        """
        if self._array is None:
            raise ValueError("please add data before shuffling samples")
        _idx = np.arange(self._n_samples)
        np.random.shuffle(_idx)
        self._array = self._array.iloc[_idx]
        print('dataset samples shuflled')

    def get_multi_index_dataframe(self):
        """Returns all views merged into one dataframe with (view, feature) columns.

        Delegates to the module-level `create_multi_view_dataframe`.
        """
        _dataset = self.get_multi_view_dataset()
        _multi_index_df = create_multi_view_dataframe(_dataset)
        return _multi_index_df

    def get_multi_view_dataset(self) -> Dict[str, pd.DataFrame]:
        """Returns the views as a dict mapping view name to dataframe.

        If no view was set, the working dataframe is registered as 'view_0'.
        When a primary key is defined it is added to (or refreshed in) every
        view; if `shuffle_primary_key` was called, each view receives its own
        random permutation of the key values. The stored key is left untouched.
        """
        if not self._views:
            self._views['view_0'] = self._array
        if self._primary_key is not None:
            if isinstance(self._primary_key, pd.Series):
                # key registered through `set_primary_key`
                _primary_key_name = self._primary_key.name
            else:
                # key registered through `add_primary_key` (one-column dataframe)
                _primary_key_name = self._primary_key.columns[0]

            # set a primary key to all dataframes
            for view_name in self._views.keys():
                _key_values = np.ravel(self._primary_key.values).copy()
                if self._shuffle_primary_key:
                    # permutation works on a copy, the stored key is not modified
                    _key_values = np.random.permutation(_key_values)
                _primary_key = pd.DataFrame({_primary_key_name: _key_values})

                if _primary_key_name not in self._views[view_name].columns.values:
                    # case where primary key is not present
                    self._views[view_name] = pd.concat([self._views[view_name],
                                                        _primary_key], axis=1)
                else:
                    # case where primary key is present: we are updating just in case
                    self._views[view_name][_primary_key_name] = _primary_key[_primary_key_name]
        return self._views

    def save_multi_view_dataframe(self, path_folder:str):
        """Saves the multi view dataset: one csv-formatted file per view in `path_folder`.

        The folder (and missing parents) is created when needed; an existing
        folder is reused. Row labels are written only when no primary key is set.
        """
        _multi_view_dataframe = self.get_multi_view_dataset()
        os.makedirs(path_folder, exist_ok=True)

        if self._primary_key is not None:
            _index = False  # if primary key is set, remove indexes
        else:
            _index = True
        for name in _multi_view_dataframe.keys():
            file_name = os.path.join(path_folder, name)
            _multi_view_dataframe[name].to_csv(file_name, index=_index)
        print(f'multi view dataset saved at {file_name}')

    def _add_columns(self, values, names):
        """Wraps `values` in a dataframe labelled with `names` and appends it."""
        _frame = pd.DataFrame(values)
        if names is not None:
            _frame.columns = names
        self._concatenate(_frame)
        if names is not None:
            self._features_names.extend(names)

    def _concatenate(self, array):
        """Appends the columns of dataframe `array` to the working dataframe."""
        if any(isinstance(_label, tuple) for _label in array.columns):
            # tuple labels (eg coming from a MultiIndex) are reduced to their
            # first element so that every column label stays a plain value
            array = array.copy()
            array.columns = [_label[0] if isinstance(_label, tuple) else _label
                             for _label in array.columns]
        if self._array is None:
            self._array = array
        else:
            self._array = pd.concat([self._array, array], axis=1)

    def _get_index(self, col_name:Union[str, int])->int:
        """Converts a column name (str) or position (int) into a column position.

        Raises:
            ValueError: if `col_name` is a str that matches no column.
            TypeError: if `col_name` is neither a str nor an int.
        """
        if isinstance(col_name, str):
            return self._array.columns.values.tolist().index(col_name)
        if isinstance(col_name, (int, np.integer)):
            return int(col_name)
        raise TypeError(f"col_name must be a str or an int, got {type(col_name).__name__}")

    @staticmethod
    def _resolve_col_names(n_col, col_name):
        """Returns `col_name` as a list of `n_col` labels (None when unnamed).

        Raises:
            ValueError: if the number of labels differs from `n_col`.
        """
        if col_name is None:
            return None
        if isinstance(col_name, str) or not hasattr(col_name, '__iter__'):
            _names = [col_name]
        else:
            _names = list(col_name)
        TabularDataGenerator._check_if_valid_args(n_col, _names)
        return _names

    @staticmethod        
    def _check_if_valid_args(n_col, col_names):
        """Raises ValueError when a list/tuple of names does not hold `n_col` entries."""
        if isinstance(col_names, (list, tuple)):
            if n_col != len(col_names):
                raise ValueError(f"Mismatch: n_col ({n_col}) != len(col_names) ({len(col_names)})")
        

In [164]:
def reformate_col_name(col_names:List[Union[Tuple[str], int]]) -> List[Union[str, int]]:
    """Replaces, in place, every tuple entry of `col_names` by its first element.

    Non-tuple entries are left as they are. The container received as argument
    is modified and returned (no copy is made).
    """
    for position in range(len(col_names)):
        entry = col_names[position]
        if not isinstance(entry, tuple):
            continue
        col_names[position] = entry[0]
    return col_names
    

def create_multi_view_dataframe(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Merges several views into one dataframe with two-level columns.

    The views are placed side by side, row position by row position (their own
    row labels are discarded). Columns are labelled with a MultiIndex whose
    levels are ('views', 'feature_name'). Column labels and column dtypes of
    each view are preserved.

    Args:
        datasets: mapping from view name to the dataframe of that view.

    Returns:
        the merged dataframe (empty, with an empty header, if `datasets` is empty).

    Raises:
        ValueError: if the views do not all have the same number of samples.
    """
    _header_labels = ['views', 'feature_name']

    # 1. create multiindex header

    _feature_name_array = []  # store all feature names (kept in a list so labels keep their type)
    _view_name_array = []  # store all views (ie modalities) names
    _frames = []  # store each view with a fresh positional row index
    _n_samples = None

    for key, _frame in datasets.items():
        if _n_samples is None:
            _n_samples = len(_frame)
        elif len(_frame) != _n_samples:
            # catching case where nb_samples are differents
            raise ValueError(
                'Cannot create multi view dataset: different number of samples for each modality have been detected'
                + 'Details: ' + f"view {key!r} has {len(_frame)} samples, expected {_n_samples}"
            )
        _features = _frame.columns.tolist()
        _feature_name_array.extend(_features)
        _view_name_array.extend([key] * len(_features))
        # rows are matched by position, not by label
        _frames.append(_frame.reset_index(drop=True))

    _header = pd.MultiIndex.from_arrays([_view_name_array,
                                         _feature_name_array],
                                        names=_header_labels)

    # 2. create multi index dataframe

    if not _frames:
        return pd.DataFrame(columns=_header)

    multi_view_df = pd.concat(_frames, axis=1)
    multi_view_df.columns = _header
    return multi_view_df

## How to use the tabular data generator

In [171]:


# generator producing datasets of 100 samples (rows)
tbg = TabularDataGenerator(100)

# first create 4 columns containing only integers
tbg.add_integers_values(4, ['a', 'e', 'i', 'o'])

tbg.add_boolean_values(4)  # add 4 boolean columns

tbg.add_datetime_values("2018-01-01", col_name='time')  # add a datetime column starting on 2018-01-01
tbg.add_float_values(2, col_name=['pressure', 'sp02']) # add 2 float columns
# NOTE(review): same names as the first integer block, so the dataframe ends up
# with duplicated column names 'a', 'e', 'i', 'o' - confirm this is intended
tbg.add_integers_values(4, ['a', 'e', 'i', 'o'])

tbg.add_character_values(col_name="gender")  # add character values (default choices)
tbg.add_character_values(col_name="blood type", char_values=["A", "B", "O", "AB"])  # add values within ["A", "B", "O", "AB"]
tbg.add_missing_samples_to_column("blood type")  # add missing values within 'blood type'


In [170]:
# check that at least one entry of 'blood type' is now missing
tbg._array['blood type'].isna().any()

True

In [176]:
tbg.add_primary_key('pkey', dtype='str')  # add a primary key made of random strings
tbg.set_view('file1')  # store the columns added so far as the first view


tbg.add_boolean_values(4) # add data into the second view
tbg.add_datetime_values("2018-01-01", col_name='time')
tbg.add_float_values(col_name='pH')
tbg.shuffle_samples()  # shuffle the rows of the second view
tbg.shuffle_primary_key()  # shuffle primary key
tbg.set_view('file2')  # store the second view

# third view: one discrete column and one categorical column
tbg.add_discrete_values(col_name="discrete")
tbg.add_character_values(col_name="city", char_values=['Paris', 'Marseille', 'Lille'])
# NOTE(review): 'contatct' looks like a typo for 'contact' - the view name is
# also used as file name when the dataset is saved, confirm before renaming
tbg.set_view('contatct')

Dataframe for view None
dataset samples shuflled
Dataframe for view None
Dataframe for view None


In [177]:
# display every stored view (dict: view name -> dataframe)
tbg.get_multi_view_dataset()

{'file1':     discrete                  pkey
 0       39.0  ofhdbuktd iuwavnstge
 1       70.0  q tfysafqbvz mcpy zw
 2       28.0  mby avwhugriwmmolmzw
 3       50.0  qqkvlrynigercafefifz
 4       23.0  skotisyjg gehjomedp 
 ..       ...                   ...
 95      25.0  idpqmtzfodngftmzqrbb
 96      94.0  jcpgyduxuxh jhlqbmni
 97      50.0  jzheeqfylbjnqzrfmvev
 98      69.0  vf xvzhjyvhiywjjlacc
 99      76.0  dq gisjidyjidvtqwaoa
 
 [100 rows x 2 columns],
 'file2':         0      1      2      3                time        pH  \
 0   False  False   True  False 2018-01-01 00:00:00  0.946474   
 1    True   True  False  False 2018-01-01 01:00:00  0.513771   
 2   False   True   True  False 2018-01-01 02:00:00  0.577206   
 3    True  False   True  False 2018-01-01 03:00:00  0.179228   
 4    True   True  False   True 2018-01-01 04:00:00  0.823091   
 ..    ...    ...    ...    ...                 ...       ...   
 95  False   True   True  False 2018-01-04 23:00:00  0.437368   
 96

In [148]:
# inspect the dataframe stored for view 'file2'
tbg._views['file2']

Unnamed: 0,0,1,2,3,time,0.1,0.2
0,False,False,False,True,2018-01-01 00:00:00,0.840792,vajimsztatlzitquscsm
1,True,True,False,False,2018-01-01 01:00:00,0.551657,duwkyykyzzqsoyoy jho
2,False,True,True,True,2018-01-01 02:00:00,0.787221,q xwmr ngpy igy rwzj
3,True,False,True,True,2018-01-01 03:00:00,0.862961,dgfzmbispegnvnmfxtew
4,False,False,True,True,2018-01-01 04:00:00,0.471008,yutmqvuc iqdbemdlbhb
...,...,...,...,...,...,...,...
95,True,False,True,True,2018-01-04 23:00:00,0.616472,nfahgscvorcbhdeyuu c
96,False,True,True,False,2018-01-05 00:00:00,0.788425,zwpugiyenckkizvwjpjr
97,False,False,True,False,2018-01-05 01:00:00,0.140394,mfavhdtoropccdkpdgwb
98,False,True,False,False,2018-01-05 02:00:00,0.747228,aztefxvtzrcrfmlhgkgt


In [155]:
tbg.save_multi_view_dataframe('test7') # save multi-view dataset: one file per view inside folder 'test7'

multi view dataset saved at test7/file2
