The purpose of this notebook is to store script for generating tabular random data


- single view data
- multi view (folder of csv files)
- multi view data (all contained in a single csv)

In [4]:
# 1 create random data


import pandas as pd
import numpy as np
import os

from typing import Iterator, Union, List, Dict, Tuple

In [34]:
class TabularDataGenerator:
    def __init__(self, n_samples:int, 
                 feature_names:Iterator[str]=None,
                 is_multi_view:bool=False,
                 as_multi_index:bool=False):
        self._array = None
        
        self._n_samples = n_samples
        self._views = {}
        self._features_names = []
        self._primary_key = None  # either None or a pandas serie
        #self._is_view_set = False
        
    def set_primary_key(self, col_name: Union[str, int]=None):
        if col_name is not None:
            col_indx = self._get_index(col_name)
            print('col_indx', col_indx)
            self._primary_key = self._array.iloc[:,col_indx]

    def set_view(self, view_name:str):
        if self._array is not None:
            _df = self.get_single_view_dataframe()
        self._views[view_name] = _df
        self._features_names = []
        self._array = None
    
    def get_single_view_dataframe(self):
        #_df = pd.DataFrame(self._array, columns=self._features_names)
        return self._array
    
    def add_integers_values(self, n_col: int=1,
                            col_name:Union[str, Iterator[str]] =None,
                            l_bound:int=0,
                           u_bound:int=100):

        _rand_int = np.random.randint(l_bound, u_bound, size=(self._n_samples, n_col))
        _rand_int = pd.DataFrame(_rand_int)
        if col_name is not None:
            _rand_int.columns = [col_name]
        self._concatenate(_rand_int)
        if col_name is not None:
            self._features_names.append(col_name)
    
    def add_float_values(self, n_col:int=1, col_name:str=None):
        _rand = np.random.random((self._n_samples, n_col))
        _rand = pd.DataFrame(_rand)
        if col_name is not None:
            _rand.columns = [col_name]
        self._concatenate(_rand)
        if col_name is not None:
            self._features_names.append(col_name)
            
    def add_discrete_values(self, n_col:int=1,
                            col_name:Union[str, Iterator[str]] =None,
                            l_bound:int=0,
                           u_bound:int=100):
        
        _rand_int = np.random.randint(l_bound, u_bound, size=(self._n_samples, n_col))
        _rand_int = np.array(_rand_int, dtype=np.float64) # changing type from int to float
        _rand_int = pd.DataFrame(_rand_int)
        if col_name is not None:
            _rand_int.columns = [col_name]
        self._concatenate(_rand_int)
        if col_name is not None:
            self._features_names.append(col_name)
            
    def add_datetime_values(self,  start=None, end=None, col_name:str=None, freq='H', **kwargs):
        _dti = pd.date_range(start, end, periods=self._n_samples, freq=freq,**kwargs)
        _dti = pd.DataFrame(_dti)
        if col_name is not None:
            _dti.columns = [col_name]
        self._concatenate(_dti)
        if col_name is not None:
            self._features_names.append(col_name)
    
    def add_missing_samples_to_column(self, col_name:Union[str, int], n_missing_points:int=1):
        
        if self._array is None:
            raise ValueError("please add data before completing it with missig data")
        if n_missing_points >= self._n_samples:
            raise ValueError(f"too much missing points (n_missing_points {n_missing_points} < n_samples {self._n_samples})")
        col_indx = self._get_index(col_name)
        _idx = np.arange(self._n_samples)
        np.random.shuffle(_idx)
        _array = self._array[:, col_indx]
        _shuffled_array = _array[_idx]
        _shuffled_array[:n_missing_points] = np.nan
        self._array[:, col_indx] = _shuffled_array[_idx]
    
    def add_boolean_values(self, n_col:int=1, col_name: Union[str, Iterator[str]]=None):
        _rand_bool = np.random.randint(0,2,size=(100, 4), dtype=bool)
        _rand_bool = pd.DataFrame(_rand_bool)
        if col_name is not None:
            _rand_bool.columns = [col_name]
        self._concatenate(_rand_bool)
        if col_name is not None:
            self._features_names.append(col_name)
        
    def shuffle_columns(self):
        pass
    
    def get_multi_index_dataframe(self):
        _dataset = self.get_multi_view_dataset()
        _multi_index_df = create_multi_view_dataframe(_dataset)
        return _multi_index_df
    
    def get_multi_view_dataset(self) -> Dict[str, pd.DataFrame]:
        """gets tab"""
        if not self._views:
            self._views['view_0'] = self._array
        if self._primary_key is not None:
            # set a primary key to all dataframe
            for view_name in self._views.keys():
                # iterate over all views
                if self._primary_key.name not in self._views[view_name].columns.values:
                    # case where primary key is not present
                    self._views[view_name] = pd.concat([self._views[view_name],
                                                        self._primary_key], axis=1)
                else:
                    # case where primary key is present: we are updating just in case
                    self._views[view_name][self._primary_key.name] = self._primary_key
        return self._views
    
    def save_multi_view_dataframe(self, path_folder:str):
        """saves multiview dataframe (folder containing multiple csvs)"""
        _multi_view_dataframe = self.get_multi_view_dataset()
        os.mkdir(path_folder)
        for name in _multi_view_dataframe.keys():
            file_name = os.pah.join(path_folder, name)
            _multi_view_dataframe[name].to_csv(file_name)
        print(f'multi view dataset saved at {file_name}')
        
    def _concatenate(self, array):
        if self._array is None:
            self._array = array
        else:
            _act_col_names = self._array.columns.values.tolist()
            #_act_col_names.extend(array.columns.values.tolist())
            self._array = pd.concat([self._array, array], axis=1)
            _curr_col_name = self._array.columns.values 
            print(_act_col_names)
            _act_col_names = reformate_col_name(_act_col_names)
            _curr_col_name = reformate_col_name(_curr_col_name)
            # needed for formatting columns name accordingly
            # (otherwise concatenation happen badly)
            
            
            self._array = self._array.rename(columns={k:v for k, v in zip(_curr_col_name, _act_col_names)})
            
    
    def _get_index(self, col_name:Union[str, int])->int:
        if isinstance(col_name, str):
            col_indx = self._array.columns.values.tolist().index(col_name)
        elif isinstance(col_name, int):
            col_indx = col_name
        return col_indx
    
    @staticmethod        
    def _check_if_valid_args(n_col, col_names):
        if isinstance(col_names, (list, tuple)):
            if n_col == len(col_names):
                raise ValueError(f"Mismatch: n_col ({n_col}) != len(col_names) ({len(col_names)})")
        

In [35]:
def reformate_col_name(col_names:List[Union[Tuple[str], int]]) -> List[Union[str, int]]:
    """reformates names of columns contained in list"""
    for i, name in enumerate(col_names):
        if isinstance(name, tuple):
            col_names[i] = col_names[i][0]
    return col_names
    

def create_multi_view_dataframe(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    _header_labels = ['views', 'feature_name']
    # 1. create multiindex header

    _feature_name_array = np.array([])  # store all feature names
    _view_name_array = []  # store all views (ie modalities) names

    _concatenated_datasets = np.array([])  # store dataframe values

    for key in datasets.keys():
        #_sub_dataframe_header.append(list(datasets[key].columns.values))
        _feature_name_array = np.concatenate([_feature_name_array,
                                              datasets[key].columns.values])
        if len(_concatenated_datasets) <= 0:
            # first pass 
            _concatenated_datasets = datasets[key].values
        else:
            # next passes
            try:
                _concatenated_datasets = np.concatenate(
                                        [_concatenated_datasets,
                                         datasets[key].to_numpy()
                                         ], axis=1)
            except ValueError as val_err:
                # catching case where nb_samples are differents
                raise ValueError(
                    'Cannot create multi view dataset: different number of samples for each modality have been detected'\
                        + 'Details: ' + str(val_err)
                    )
        for _ in datasets[key].columns.values:
            _view_name_array.append(key)

    _header = pd.MultiIndex.from_arrays([_view_name_array,
                                         _feature_name_array],
                                        names=_header_labels)


    # 2. create multi index dataframe

    multi_view_df = pd.DataFrame(_concatenated_datasets,
                                  columns = _header)
    return multi_view_df

In [36]:
tbg = TabularDataGenerator(100)

tbg.add_integers_values(4, ['a', 'e', 'i', 'o'])
print(tbg._array.columns.to_numpy())
print(tbg._array.columns.values)
print(tbg._array.columns.values)
print(tbg._array.columns.values)
tbg.add_boolean_values(4)
print(tbg._array)
tbg.add_datetime_values("2018-01-01", col_name='time')
tbg.add_float_values(2, col_name=['pressure', 'sp02'])
tbg.add_integers_values(4, ['a', 'e', 'i', 'o'])

print(tbg._array)

[('a',) ('e',) ('i',) ('o',)]
[('a',) ('e',) ('i',) ('o',)]
[('a',) ('e',) ('i',) ('o',)]
[('a',) ('e',) ('i',) ('o',)]
[('a',), ('e',), ('i',), ('o',)]
     a   e   i   o      0      1      2      3
0   78  66  72  53   True  False   True  False
1   67  98   1  50   True  False   True   True
2    2  70  46  75   True   True  False   True
3   26   3   3  69  False  False   True  False
4   57  61  78  40  False   True  False  False
..  ..  ..  ..  ..    ...    ...    ...    ...
95  36  98  30   8  False   True   True  False
96  21  21  48  58  False  False   True  False
97  53  15   6  77   True  False   True   True
98  76  52  33  93   True  False   True  False
99  23  16  56  29   True   True  False   True

[100 rows x 8 columns]
['a', 'e', 'i', 'o', 0, 1, 2, 3]
['a', 'e', 'i', 'o', 0, 1, 2, 3, 'time']
['a', 'e', 'i', 'o', 0, 1, 2, 3, 'time', 'pressure', 'sp02']
     a   e   i   o      0      1      2      3                time  pressure  \
0   78  66  72  53   True  False   True  Fal

In [37]:
tbg._primary_key

In [38]:
tbg.set_primary_key('a')
tbg.set_view('file1')
tbg.add_boolean_values(4)
tbg.add_datetime_values("2018-01-01", col_name='time')
tbg.add_float_values()
tbg.set_view('file2')
tbg.get_multi_view_dataset()


col_indx 0
[0, 1, 2, 3]
[0, 1, 2, 3, 'time']


{'file1':      a   e   i   o      0      1      2      3                time  pressure  \
 0   78  66  72  53   True  False   True  False 2018-01-01 00:00:00  0.209379   
 1   67  98   1  50   True  False   True   True 2018-01-01 01:00:00  0.412792   
 2    2  70  46  75   True   True  False   True 2018-01-01 02:00:00  0.327938   
 3   26   3   3  69  False  False   True  False 2018-01-01 03:00:00  0.453829   
 4   57  61  78  40  False   True  False  False 2018-01-01 04:00:00  0.065071   
 ..  ..  ..  ..  ..    ...    ...    ...    ...                 ...       ...   
 95  36  98  30   8  False   True   True  False 2018-01-04 23:00:00  0.104872   
 96  21  21  48  58  False  False   True  False 2018-01-05 00:00:00  0.118902   
 97  53  15   6  77   True  False   True   True 2018-01-05 01:00:00  0.931347   
 98  76  52  33  93   True  False   True  False 2018-01-05 02:00:00  0.208917   
 99  23  16  56  29   True   True  False   True 2018-01-05 03:00:00  0.836389   
 
         sp02   a

In [39]:

np.array(np.random.randint(0,100,size=(100, 4)), dtype=np.float64)

array([[12., 54., 16., 16.],
       [60., 96., 88., 44.],
       [29., 92., 33., 18.],
       [41., 24.,  7., 32.],
       [59.,  0.,  6., 82.],
       [ 2., 81., 19., 52.],
       [25., 49., 75., 23.],
       [27., 40., 19., 87.],
       [49., 65., 38., 41.],
       [28., 93., 36., 79.],
       [15., 27., 16., 18.],
       [27.,  8., 31., 39.],
       [92., 42., 17., 24.],
       [ 0., 74., 82., 43.],
       [66., 85., 45., 40.],
       [12., 10., 22., 52.],
       [93., 37., 89., 54.],
       [17., 87., 23., 72.],
       [35., 92., 50., 26.],
       [12., 77., 45., 35.],
       [98., 35., 82.,  1.],
       [60., 65., 22., 99.],
       [91., 33., 17., 78.],
       [47., 55., 56., 76.],
       [84., 65., 43., 59.],
       [81., 33., 43., 60.],
       [29., 26., 67., 38.],
       [17., 81., 16., 36.],
       [29., 43., 25., 76.],
       [84., 58., 79., 53.],
       [90., 95., 67., 68.],
       [30., 78., 89., 95.],
       [14., 98., 49., 53.],
       [62., 55., 19., 22.],
       [47., 4