In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from joblib import Parallel,delayed
import multiprocessing
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from tqdm import tqdm
import sys
import yaml
sys.path.append('./..')
sys.path.append('./../..')
import warnings
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)
import re
from joblib import Parallel, delayed
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [19]:
try:
    from common_utils import utils
except:
    from .common_utils import utils

In [2]:

DATA_SOURCE = './../generated_data_v1/'
CONFIG = None
DIR_LOC = None
CONFIG = None
CONFIG_FILE = 'config.yaml'
id_col = 'PanjivaRecordID'
non_CoOcc_dict = {}
numeric_col_stats = {}
DATA_PATH = ''

In [3]:
    
def aux_gen(
    row,
    discrete_dim_list, # Ordered
    num_real_dims,
    column_encoder,
    num_samples = 10
):
    row_vals = row.values
    num_cat = len(discrete_dim_list)
    real_part = row_vals[-num_real_dims:]
    cat_part = row_vals[:num_cat]
    
    nr = num_real_dims//2
    ns = num_samples

    # ======
    a = max(1, num_real_dims//4)
    b = max(1, num_real_dims//4)
    
    c = num_real_dims - (a + b)
    # Adding -.5 to shift noise to be between -.5 to .5
    noise = np.concatenate(
        [np.random.random_sample([ns,a])  + -0.5, 
         np.random.random_sample([ns,b])  +  0.5, 
         np.zeros([ns,c])],
        axis=1
    )
   
    for i in range(ns):
        np.random.shuffle(noise[i])
    # ---
    # noise shape [ ns, num_real_dims ]

    part_r_duplicated = np.tile(real_part, ns).reshape([ns, num_real_dims])
    part_r_duplicated = part_r_duplicated + noise
    
    
    
    # ------------------------------
    # For categorical variables
    # ------------------------------
    
    P = [ np.power( _/sum(discrete_dim_list), 0.75)  for _ in discrete_dim_list]  
    P = [ _/sum(P) for _ in P]  
    part_c_duplicated = np.tile(cat_part,ns).reshape([ns,num_cat])
   
    res = []
    for i in range(ns):
        _copy = np.array(row_vals)[:num_cat]
        if num_cat < 3 :
            pert_idx = np.random.choice( list(np.arange(num_cat)) , size=1, replace = False, p = P)
        else:
            pert_idx = np.random.choice(
                list(np.arange(num_cat)),
                size = np.random.randint(1, num_cat//2+1 ),
                replace=False,
                p = P
            )

        for j in pert_idx:
            _copy[j] = np.random.choice(
                np.arange(discrete_dim_list[j]),1
            )
        part_c_duplicated[i] = _copy
        
        
    _samples = np.concatenate([part_c_duplicated, part_r_duplicated], axis=1)
    row_vals = np.reshape(row.values,[1,-1] )

    samples = np.concatenate([row_vals, _samples], axis=0)
    sample_cat_part = samples[:, :num_cat]
    samples_real_part = samples[:, -num_real_dims: ]

    # =========================
    # Do a 1-hot transformation
    # Drop binary columns
    # =========================

    onehot_xformed = column_encoder.fit_transform(sample_cat_part)
    onehot_xformed = onehot_xformed.astype(np.int)
    print('>>> 1-0 part ', onehot_xformed.shape)
    samples = np.concatenate([onehot_xformed, samples_real_part],axis=1)
    
    pos = samples[0]
    neg = samples[1:]
    return pos, neg

In [4]:
def generate_pos_neg_data (
        train_df,
        cat_domain_dims,
        num_samples=10
):
    try:
        del train_df['label']
    except:
        pass 
 
    num_cat = len(cat_domain_dims)
    num_real = len(train_df.columns) - num_cat
    
    oh_encoder_list = []
    idx = 0
    
    for _ , dim in cat_domain_dims.items():
        if dim ==2 :
            _drop = 'first'
        else:
            _drop = None
        name = "oh_"+str(idx) 
        oh_encoder = OneHotEncoder(
            np.reshape( list(range(dim)),[1,-1] ),
            sparse=False,
            drop=_drop
        ) 
        oh_encoder_list.append((name, oh_encoder, [idx]))
        idx +=1
    column_encoder = ColumnTransformer(
        oh_encoder_list
    )
                                
    discrete_dim_list = list(cat_domain_dims.values())
    n_jobs = multiprocessing.cpu_count()
    
    res = Parallel(n_jobs)(delayed(aux_gen)(
            row, discrete_dim_list, num_real, column_encoder, num_samples
        ) for i,row in tqdm(train_df.iterrows(), total=train_df.shape[0])
    )
      
    pos = []
    neg = []
    for r in res:
        pos.append(r[0])
        neg.append(r[1])

    pos = np.array(pos)
    neg = np.array(neg)

    return pos, neg

In [5]:
def set_up_config(_DIR = None):
    global DIR
    global CONFIG
    global CONFIG_FILE
    global use_cols
    global freq_bound
    global num_neg_samples_ape
    global save_dir
    global column_value_filters
    global num_neg_samples
    global NUMERIC_COLUMNS
    global id_col
    global DISCRETE_COLUMNS
    global DATA_PATH
    global DATA_SOURCE
    
    with open(CONFIG_FILE) as f:
        CONFIG = yaml.safe_load(f)
    
    DATA_PATH = os.path.join(DATA_SOURCE, DIR)
    if _DIR is not None:
        DIR = _DIR
        CONFIG['DIR'] = _DIR
    else:
        DIR = CONFIG['DIR']

    DATA_PATH = os.path.join(DATA_SOURCE, DIR)
    save_dir =  CONFIG['save_dir']
    
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    save_dir = os.path.join(
        CONFIG['save_dir'],
        DIR
    )
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
        
    use_cols = CONFIG[DIR]['use_cols']
 
    NUMERIC_COLUMNS = CONFIG[DIR]['numeric_columns']
    _cols = list(use_cols)
    _cols.remove(id_col)
    for nc in NUMERIC_COLUMNS:
        _cols.remove(nc)
        
    DISCRETE_COLUMNS = list(sorted(_cols))
    return 

def fetch_data_sets():
    global DATA_PATH
    df_train = pd.read_csv(os.path.join(DATA_PATH,'train_data.csv'),index_col=None)
    df_test = pd.read_csv(os.path.join(DATA_PATH,'test_data.csv'),index_col=None)
    return df_train, df_test

In [6]:
def fetch_anomalies():
    global DATA_PATH
    df_anomalies = pd.read_csv(os.path.join(DATA_PATH,'gen_anomalies.csv'),index_col=None)
    return df_anomalies

In [7]:
DIR = 'us_import1'
set_up_config(DIR)

In [8]:
# Normalise the numeric values
def normalize(val,_min,_max):
    return (val-_min)/(_max - _min)

df_train, df_test = fetch_data_sets()
df_anomalies = fetch_anomalies()
for nc in NUMERIC_COLUMNS:
    _max = np.max(df_train[nc])
    _min = np.min(df_train[nc])
    df_train[nc] =  df_train[nc].parallel_apply(normalize, args=(_min,_max,))
    df_test[nc] =  df_test[nc].parallel_apply(normalize, args=(_min,_max,))
    df_anomalies[nc] = df_anomalies[nc].parallel_apply(normalize, args=(_min,_max,))
    

In [9]:

def process_data_01(df, cat_domain_dims):
    global DISCRETE_COLUMNS
    global id_col
    
    try:
        del df[id_col]
    except:
        pass
    
    num_cat = len(cat_domain_dims)
    num_real = len(df.columns) - num_cat
    
    oh_encoder_list = []
    idx = 0
    
    for _ , dim in cat_domain_dims.items():
        if dim ==2 :
            _drop = 'first'
        else:
            _drop = None
        name = "oh_"+str(idx) 
        oh_encoder = OneHotEncoder(
            np.reshape(list(range(dim)),[1,-1]),
            sparse=False,
            drop=_drop
        ) 
       
        oh_encoder_list.append((name, oh_encoder, [idx]))
        idx +=1
        
    column_encoder = ColumnTransformer(
        oh_encoder_list
    )
    samples = df.values
    print(samples[0])
    sample_cat_part = samples[:, :num_cat]
    samples_real_part = samples[:, -num_real: ]
    onehot_xformed = column_encoder.fit_transform(sample_cat_part)
    onehot_xformed = onehot_xformed.astype(np.int)
    samples = np.concatenate([onehot_xformed, samples_real_part],axis=1)
    return samples


In [10]:
domain_dims_df = pd.read_csv(os.path.join(DATA_PATH,'data_dimensions.csv'),index_col=None)
cat_domain_dims = { k:v for k,v in zip(domain_dims_df['column'],domain_dims_df['dimension']) }

In [11]:
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)



In [12]:
try:
    del df_train['PanjivaRecordID']
except: 
    pass


In [13]:
pos, neg = generate_pos_neg_data (
        df_train,
        cat_domain_dims,
        num_samples=10
)


100%|██████████| 73635/73635 [07:22<00:00, 166.58it/s]


In [14]:
anomalies_X = process_data_01(df_anomalies, cat_domain_dims)
test_X = process_data_01(df_test, cat_domain_dims)

[ 2.62000000e+02  1.31100000e+03  1.19000000e+02  1.36000000e+02
  8.00000000e+00  5.00000000e+00  6.10000000e+01  7.32000000e+02
  2.49185201e-03 -2.30894577e-07 -2.28301452e-10]


  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':


[4.19000000e+02 5.81000000e+02 3.54000000e+02 4.20000000e+01
 2.50000000e+01 6.50000000e+01 1.40000000e+01 2.74800000e+03
 6.02409639e-03 5.77232264e-03 1.08897525e-05]


  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':


In [15]:
test_X.shape

(62980, 8540)

In [16]:
anomalies_X.shape

(15745, 8540)

In [17]:
anomalies_X[0]

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        2.49185201e-03, -2.30894577e-07, -2.28301452e-10])

In [26]:
file_path = os.path.join(save_dir,'data_testX.npy')
np.save(file_path,test_X,allow_pickle=True)