The purpose of this notebook is to store script for generating tabular random data


- single view data
- multi view (folder of csv files)
- multi view data (all contained in a single csv)

In [216]:
# 1 create random data


import pandas as pd
import numpy as np

from typing import Iterator, Union, List, Dict, Tuple

In [229]:
class TabularDataGenerator:
    def __init__(self, n_samples:int, 
                 feature_names:Iterator[str]=None,
                 is_multi_view:bool=False,
                 as_multi_index:bool=False):
        self._array = None
        
        self._n_samples = n_samples
        self._views = {}
        self._features_names = []
        self._primary_key = None  # either None or a pandas serie
        #self._is_view_set = False
        
    def set_primary_key(self, col_name: Union[str, int]=None):
        if col_name is not None:
            col_indx = self._get_index(col_name)
            self._primary_key = self._array[col_indx]

    def set_view(self, view_name:str):
        if self._array is not None:
            _df = self.get_single_view_dataframe()
        self._views[view_name] = _df
        self._features_names = []
        self._array = None
    
    def get_single_view_dataframe(self):
        #_df = pd.DataFrame(self._array, columns=self._features_names)
        return self._array
    
    def add_integers_values(self, n_col: int=1,
                            col_name:Union[str, Iterator[str]] =None,
                            l_bound:int=0,
                           u_bound:int=100):

        _rand_int = np.random.randint(l_bound, u_bound, size=(self._n_samples, n_col))
        _rand_int = pd.DataFrame(_rand_int)
        if col_name is not None:
            _rand_int.columns = [col_name]
        self._concatenate(_rand_int)
        if col_name is not None:
            self._features_names.append(col_name)
    
    def add_float_values(self, n_col:int=1, col_name:str=None):
        _rand = np.random.random((self._n_samples, n_col))
        _rand = pd.DataFrame(_rand)
        if col_name is not None:
            _rand.columns = [col_name]
        self._concatenate(_rand)
        if col_name is not None:
            self._features_names.append(col_name)
            
    def add_discrete_values(self, n_col:int=1,
                            col_name:Union[str, Iterator[str]] =None,
                            l_bound:int=0,
                           u_bound:int=100):
        
        _rand_int = np.random.randint(l_bound, u_bound, size=(self._n_samples, n_col))
        _rand_int = np.array(_rand_int, dtype=np.float64) # changing type from int to float
        _rand_int = pd.DataFrame(_rand_int)
        if col_name is not None:
            _rand_int.columns = [col_name]
        self._concatenate(_rand_int)
        if col_name is not None:
            self._features_names.append(col_name)
            
    def add_datetime_values(self,  start=None, end=None, col_name:str=None, freq='H', **kwargs):
        _dti = pd.date_range(start, end, periods=self._n_samples, freq=freq,**kwargs)
        _dti = pd.DataFrame(dti)
        if col_name is not None:
            _dti.columns = [col_name]
        self._concatenate(_dti)
        if col_name is not None:
            self._features_names.append(col_name)
    
    def add_missing_samples_to_column(self, col_name:Union[str, int], n_missing_points:int=1):
        
        if self._array is None:
            raise ValueError("please add data before completing it with missig data")
        if n_missing_points >= self._n_samples:
            raise ValueError(f"too much missing points (n_missing_points {n_missing_points} < n_samples {self._n_samples})")
        col_indx = self._get_index(col_name)
        _idx = np.arange(self._n_samples)
        np.random.shuffle(_idx)
        _array = self._array[:, col_indx]
        _shuffled_array = _array[_idx]
        _shuffled_array[:n_missing_points] = np.nan
        self._array[:, col_indx] = _shuffled_array[_idx]
    
    def add_boolean_values(self, n_col:int=1, col_name: Union[str, Iterator[str]]=None):
        _rand_bool = np.random.randint(0,2,size=(100, 4), dtype=bool)
        _rand_bool = pd.DataFrame(_rand_bool)
        if col_name is not None:
            _rand_bool.columns = [col_name]
        self._concatenate(_rand_bool)
        if col_name is not None:
            self._features_names.append(col_name)
        
    def shuffle_columns(self):
        pass
    
    def get_multi_index_dataframe(self):
        _dataset = self.get_multi_view_dataset()
        _multi_index_df = create_multi_view_dataframe(_dataset)
        return _multi_index_df
    
    def get_multi_view_dataset(self) -> Dict[str, pd.DataFrame]:
        if not self._views:
            self._views['view_0'] = self._array
        if self._primary_key is not None:
            # set a primary key to all dataframe
            for view_name in self._views.keys():
                if self._primary_key.name not in self._views[view_name].columns.values:
                    pd.concatenate([self._views[view_name], self._primary_key], axis=1)
                else:
                    self._views[view_name][self._primary_key.name] = self._primary_key
        return self._views
    
    def _concatenate(self, array):
        if self._array is None:
            self._array = array
        else:
            _act_col_names = self._array.columns.values.tolist()
            #_act_col_names.extend(array.columns.values.tolist())
            self._array = pd.concat([self._array, array], axis=1)
            _curr_col_name = self._array.columns.values 
            print(_act_col_names)
            _act_col_names = reformate_col_name(_act_col_names)
            _curr_col_name = reformate_col_name(_curr_col_name)
            # needed for formatting columns name accordingly
            # (otherwise concatenation happen badly)
            
            
            self._array = self._array.rename(columns={k:v for k, v in zip(_curr_col_name, _act_col_names)})
            
    
    def _get_index(self, col_name:Union[str, int])->int:
        if isinstance(col_name, str):
            col_indx = self._array.columns.values.tolist().index(col_name)
        elif isinstance(col_name, int):
            col_indx = col_name
        return col_indx
    
    @staticmethod        
    def _check_if_valid_args(n_col, col_names):
        if isinstance(col_names, (list, tuple)):
            if n_col == len(col_names):
                raise ValueError(f"Mismatch: n_col ({n_col}) != len(col_names) ({len(col_names)})")
        

In [230]:
def reformate_col_name(col_names:List[Union[Tuple[str], int]]) -> List[Union[str, int]]:
    """reformates names of columns contained in list"""
    for i, name in enumerate(col_names):
        if isinstance(name, tuple):
            col_names[i] = col_names[i][0]
    return col_names
    

def create_multi_view_dataframe(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    _header_labels = ['views', 'feature_name']
    # 1. create multiindex header

    _feature_name_array = np.array([])  # store all feature names
    _view_name_array = []  # store all views (ie modalities) names

    _concatenated_datasets = np.array([])  # store dataframe values

    for key in datasets.keys():
        #_sub_dataframe_header.append(list(datasets[key].columns.values))
        _feature_name_array = np.concatenate([_feature_name_array,
                                              datasets[key].columns.values])
        if len(_concatenated_datasets) <= 0:
            # first pass 
            _concatenated_datasets = datasets[key].values
        else:
            # next passes
            try:
                _concatenated_datasets = np.concatenate(
                                        [_concatenated_datasets,
                                         datasets[key].to_numpy()
                                         ], axis=1)
            except ValueError as val_err:
                # catching case where nb_samples are differents
                raise ValueError(
                    'Cannot create multi view dataset: different number of samples for each modality have been detected'\
                        + 'Details: ' + str(val_err)
                    )
        for _ in datasets[key].columns.values:
            _view_name_array.append(key)

    _header = pd.MultiIndex.from_arrays([_view_name_array,
                                         _feature_name_array],
                                        names=_header_labels)


    # 2. create multi index dataframe

    multi_view_df = pd.DataFrame(_concatenated_datasets,
                                  columns = _header)
    return multi_view_df

In [231]:
tbg = TabularDataGenerator(100)

tbg.add_integers_values(4, ['a', 'e', 'i', 'o'])
print(tbg._array.columns.to_numpy())
print(tbg._array.columns.values)
print(tbg._array.columns.values)
print(tbg._array.columns.values)
tbg.add_boolean_values(4)
print(tbg._array)
tbg.add_datetime_values("2018-01-01", col_name='time')
tbg.add_float_values(2, col_name=['pressure', 'sp02'])
tbg.add_integers_values(4, ['a', 'e', 'i', 'o'])

print(tbg._array)

[('a',) ('e',) ('i',) ('o',)]
[('a',) ('e',) ('i',) ('o',)]
[('a',) ('e',) ('i',) ('o',)]
[('a',) ('e',) ('i',) ('o',)]
[('a',), ('e',), ('i',), ('o',)]
     a   e   i   o      0      1      2      3
0   99  14  91  52  False   True   True  False
1    8  28  57  96   True   True  False   True
2   45  77  14  23  False   True  False  False
3   96  53   1  90  False   True  False  False
4   60  75  47  58   True   True   True  False
..  ..  ..  ..  ..    ...    ...    ...    ...
95  68  28  62  23   True   True   True  False
96  27  80   2  84   True   True  False  False
97  58  16  78  20  False   True  False   True
98  60  61  38   7  False   True  False   True
99  61  65  57  68  False  False   True   True

[100 rows x 8 columns]
['a', 'e', 'i', 'o', 0, 1, 2, 3]
['a', 'e', 'i', 'o', 0, 1, 2, 3, 'time']
['a', 'e', 'i', 'o', 0, 1, 2, 3, 'time', 'pressure', 'sp02']
     a   e   i   o      0      1      2      3                time  pressure  \
0   99  14  91  52  False   True   True  Fal

In [237]:
tbg._primary_key.name

0

In [232]:
tbg.set_primary_key('a')
tbg.set_view('file1')
tbg.add_boolean_values(4)
tbg.add_datetime_values("2018-01-01", col_name='time')
tbg.add_float_values()
tbg.set_view('file2')
tbg.get_multi_view_dataset()
tbg.get_multi_index_dataframe()

[0, 1, 2, 3]
[0, 1, 2, 3, 'time']


AttributeError: 'Series' object has no attribute 'columns'

In [215]:

np.array(np.random.randint(0,100,size=(100, 4)), dtype=np.float64)

array([[ 6., 52., 43., 43.],
       [11., 59., 34., 95.],
       [ 6., 43., 42., 31.],
       [66.,  5., 96., 53.],
       [19., 82., 50., 70.],
       [53., 58., 74., 81.],
       [59., 12., 74., 46.],
       [ 7., 13., 84., 45.],
       [97., 27., 12., 42.],
       [34.,  4., 50., 51.],
       [74., 87., 66., 69.],
       [65., 42., 14., 11.],
       [85., 44., 69., 95.],
       [65.,  2., 99., 74.],
       [80., 31., 92., 88.],
       [56., 82., 62., 57.],
       [79., 74., 37., 58.],
       [ 1., 81., 14., 35.],
       [ 3., 16., 69., 30.],
       [67., 56., 31., 41.],
       [75., 26.,  3.,  5.],
       [79., 94., 99., 73.],
       [14., 85., 67., 70.],
       [58., 10., 40., 41.],
       [98., 10., 66.,  0.],
       [21., 29., 93., 43.],
       [11., 68., 11., 21.],
       [28., 61., 22., 84.],
       [13., 16., 62., 42.],
       [13., 90., 99., 72.],
       [70.,  3., 97., 89.],
       [79.,  1., 59., 71.],
       [71.,  9., 69., 17.],
       [56.,  0., 47.,  9.],
       [24., 9