In [1]:
import os
import sys
import pandas as pd
import numpy as np
import sklearn
import glob
import pickle
import random
from joblib import Parallel, delayed

In [2]:
# Dataset sub-directory name; joined onto both the raw-data root (see get_files)
# and the generated-data root (see the create_* functions).
DIR = 'us_import'

# Let us create APE style test & training sets
## us import : Train on 2015 (months 01-04), Test on 2015 (months 05-06) -- matches the glob patterns in get_files

In [18]:
def get_files(_type='all'):
    """Return the sorted list of raw CSV paths for the requested split.

    Files are looked up in ``./../wwf_data_v1/<DIR>`` (relative to the current
    working directory).

    Parameters
    ----------
    _type : str
        ``'train'`` selects 2015 files whose name contains a month token
        ``01``-``04``; ``'test'`` selects month tokens ``05``-``06``; any other
        value returns every ``*.csv`` file in the directory.

    Returns
    -------
    list of str
        Matching paths in lexicographic order (empty if nothing matches).
    """
    data_dir = os.path.join(
        './../wwf_data_v1',
        DIR
    )
    # The original train pattern contained a stray '**'; without recursive=True
    # glob treats it exactly like '*', so the single-star form is equivalent.
    split_patterns = {
        'train': '*0[1-4]*2015*.csv',
        'test': '*0[5-6]*2015*.csv',
    }
    pattern = split_patterns.get(_type, '*.csv')
    return sorted(glob.glob(os.path.join(data_dir, pattern)))

In [4]:
# Columns loaded from every raw CSV (passed as `usecols` in collate);
# all other columns in the files are ignored.
use_cols = [ 
    'PanjivaRecordID',
    'ConsigneeCountry',
    'ConsigneePanjivaID',
    'ShipperCountry',
    'ShipperPanjivaID',
    'ShipmentOrigin',
    'ShipmentDestination',
    'hscode_6',
    'PortOfUnlading',
    'PortOfLading',
    'Carrier',
]

# A value occurring fewer than `freq_bound` times in its column is treated as
# low-frequency and its rows are dropped (see remove_low_frequency_values).
freq_bound = 5
# column name -> list of values; training rows holding any of these are removed.
column_value_filters = {
    'Carrier':['(Usa))']
}
# Name of the record-identifier column (never treated as a feature).
id_col = 'PanjivaRecordID'
# Name of the column holding the synthetic id of a generated negative sample.
ns_id_col = 'NegSampleID'
# Column names under which the two per-sample log-probability terms are stored.
term_2_col = 'term_2'
term_4_col = 'term_4'
# Negative samples generated per feature column for each training row (APE sampling).
num_neg_samples = 3

In [5]:
def replace_attr_with_id(row, attr, val2id_dict):
    """Look up the integer id of ``row[attr]`` in ``val2id_dict``.

    Returns the mapped id when the value is known. For an unknown value the
    attribute name and the value are printed and ``None`` is returned.
    """
    raw_value = row[attr]
    if raw_value in val2id_dict:
        return val2id_dict[raw_value]
    # Unknown value: report it and signal "no id" to the caller.
    print(attr, raw_value)
    return None

In [7]:
# Sanity check: list the training files and print the row count and column
# names of each raw CSV.
files = get_files(_type='train')
print(files)
for f in files:
    df = pd.read_csv(f)
    print(len(df))
    print(df.columns)
    # Sorted list of every column except the record id. It is recomputed for
    # each file and not used again inside this cell.
    feature_cols = list(df.columns)
    feature_cols.remove(id_col)
    feature_cols = list(sorted(feature_cols))

['./../wwf_data_v1/us_import/panjiva_us_imports_01_2015_filtered.csv', './../wwf_data_v1/us_import/panjiva_us_imports_02_2015_filtered.csv', './../wwf_data_v1/us_import/panjiva_us_imports_03_2015_filtered.csv', './../wwf_data_v1/us_import/panjiva_us_imports_04_2015_filtered.csv', './../wwf_data_v1/us_import/panjiva_us_imports_05_2015_filtered.csv', './../wwf_data_v1/us_import/panjiva_us_imports_06_2015_filtered.csv', './../wwf_data_v1/us_import/panjiva_us_imports_07_2015_filtered.csv']
55209
Index(['PanjivaRecordID', 'ConsigneeCity', 'ConsigneeStateRegion',
       'ConsigneeCountry', 'ConsigneePanjivaID', 'ShipperCity',
       'ShipperCountry', 'ShipperPanjivaID', 'Carrier', 'ShipmentOrigin',
       'ShipmentDestination', 'PortOfUnlading', 'PortOfLading', 'VolumeTEU',
       'WeightKg', 'ValueOfGoodsUSD', 'hscode_6'],
      dtype='object')
51774
Index(['PanjivaRecordID', 'ConsigneeCity', 'ConsigneeStateRegion',
       'ConsigneeCountry', 'ConsigneePanjivaID', 'ShipperCity',
       'Shi

In [8]:
def convert_to_ids(
    df,
    save_dir
):
    """Replace every feature value with a per-column integer id.

    For each column other than ``id_col`` the distinct values are sorted and
    numbered ``0..n-1``. Sorting makes the assignment reproducible between
    runs; iterating a plain ``set`` of strings is not. The number of distinct
    values per column is pickled to ``<save_dir>/domain_dims.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``id_col`` plus the feature columns. It is not
        modified; a converted copy is returned.
    save_dir : str
        Output directory, created (including parents) when missing.

    Returns
    -------
    (pandas.DataFrame, dict)
        The converted frame and ``{column: {value: id}}``.
    """
    global id_col

    # Work on a copy: callers usually pass a filtered slice, and assigning
    # columns on a slice triggers SettingWithCopy problems.
    df = df.copy()

    feature_columns = list(df.columns)
    feature_columns.remove(id_col)
    domain_dims_dict = {}
    col_val2id_dict = {}

    for col in sorted(feature_columns):
        distinct = set(df[col])
        try:
            vals = sorted(distinct)
        except TypeError:
            # Mixed, non-comparable types: fall back to ordering by text form.
            vals = sorted(distinct, key=str)

        # item1 : 0, item2 : 1, ...
        val2id_dict = {v: k for k, v in enumerate(vals)}
        col_val2id_dict[col] = val2id_dict

        # Vectorised lookup instead of a row-wise apply over the whole frame.
        df[col] = df[col].map(val2id_dict)
        domain_dims_dict[col] = len(val2id_dict)

    print(list(df.columns))

    # Re-key the sizes in the frame's column order (what gets pickled).
    domain_dims_res = {}
    for col in list(df.columns):
        if col in domain_dims_dict:
            print(col)
            domain_dims_res[col] = domain_dims_dict[col]
    print(domain_dims_res)

    os.makedirs(save_dir, exist_ok=True)
    f_path = os.path.join(save_dir, 'domain_dims.pkl')
    with open(f_path, 'wb') as fh:
        pickle.dump(
            domain_dims_res,
            fh,
            pickle.HIGHEST_PROTOCOL
        )
    return df, col_val2id_dict

In [9]:
def collate(file_list):
    """Load several raw CSVs and stack them into one frame.

    Only the columns in ``use_cols`` are read, and rows containing any missing
    value are dropped per file. The result has a fresh 0..n-1 index and its
    columns are ordered as ``[id_col] + sorted(feature columns)``.

    Parameters
    ----------
    file_list : list of str
        Paths of the CSV files to combine.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``file_list`` is empty.
    """
    global id_col

    frames = [
        pd.read_csv(
            file,
            low_memory=False,
            usecols=use_cols
        ).dropna()
        for file in file_list
    ]
    if not frames:
        raise ValueError('collate() needs at least one file')

    # One concat instead of repeated DataFrame.append, which was removed in
    # pandas 2.0 and copied the accumulated frame on every call.
    _master_df = pd.concat(frames, ignore_index=True)

    feature_cols = list(_master_df.columns)
    feature_cols.remove(id_col)
    all_cols = [id_col] + sorted(feature_cols)
    print(all_cols)
    return _master_df[all_cols]

In [10]:
def remove_low_frequency_values(_df):
    """Drop rows that hold a rare value in any feature column.

    A value is rare when it occurs fewer than ``freq_bound`` times in its
    column. All counts are taken on the incoming frame, before any row is
    removed. The number of rare values per column and the frame length before
    and after filtering are printed.
    """
    global id_col
    global freq_bound
    from collections import Counter

    feature_cols = [name for name in _df.columns]
    feature_cols.remove(id_col)

    # column -> values seen fewer than freq_bound times
    rare_values = {}
    for name in feature_cols:
        tally = Counter(list(_df[name]))
        rare_values[name] = [
            item for item, occurrences in tally.items()
            if occurrences < freq_bound
        ]

    for name, rare in rare_values.items():
        print(name, len(rare))
    print(len(_df))

    # Apply the per-column filters one after another.
    for name, rare in rare_values.items():
        keep_mask = ~_df[name].isin(rare)
        _df = _df.loc[keep_mask]
    print(len(_df))
    return _df

In [11]:
def validate(row, ref_df):
    """Return True when ``row`` does not already occur in ``ref_df``.

    Every entry of ``row`` except ``id_col`` is compared with the column of the
    same name in ``ref_df``. The row counts as present only if one reference
    record matches on all of those columns.

    The comparison uses boolean masks rather than a string-built
    ``DataFrame.query`` expression. It therefore also works for string values
    and for column names that are not valid identifiers, and it avoids
    re-parsing an expression on every call.

    Parameters
    ----------
    row : pandas.Series
        Candidate record, indexed by column name.
    ref_df : pandas.DataFrame
        Reference records; must contain every non-id column of ``row``.

    Returns
    -------
    bool
        ``False`` if an identical record exists in ``ref_df``, else ``True``.
    """
    global id_col

    still_matching = np.ones(len(ref_df), dtype=bool)
    for _c, _i in row.to_dict().items():
        if _c == id_col:
            continue
        still_matching &= (ref_df[_c] == _i).to_numpy()
        if not still_matching.any():
            # No reference row agrees on the columns seen so far.
            return True
    return not bool(still_matching.any())


In [12]:
'''
returns c random items as a dict
column_name : item_id
'''

def get_c_vals(anomaly_cols, col_val2id_dict):
    """Draw one random id for each column in ``anomaly_cols``.

    For every listed column a single id is sampled from the ids stored in
    ``col_val2id_dict[column]``. Returns ``{column_name: item_id}``.
    """
    return {
        column: random.sample(list(col_val2id_dict[column].values()), 1)[0]
        for column in anomaly_cols
    }

In [13]:
def create_anomalies(test_df, train_df, col_val2id_dict, c=3):
    """Create one synthetic anomaly for each row of ``test_df``.

    For every test row, ``c`` feature columns are picked at random and their
    values are replaced by random ids from ``get_c_vals``. The draw is repeated
    until ``validate`` confirms the perturbed row occurs in neither
    ``train_df`` nor ``test_df``. The anomaly's id is the source id with
    ``'01'`` appended to its digits.

    Parameters
    ----------
    test_df, train_df : pandas.DataFrame
        Id-encoded frames with identical columns (``id_col`` plus features).
    col_val2id_dict : dict
        ``{column: {value: id}}`` used to draw replacement ids.
    c : int, default 3
        Number of columns to perturb per row. The original code ignored this
        argument and always used 3. Capped at the number of feature columns.

    Returns
    -------
    pandas.DataFrame
        Anomalous rows, de-duplicated on the feature columns.

    Raises
    ------
    ValueError
        If there are rows to perturb but no column may be changed.
    """
    global id_col
    feature_cols = list(test_df.columns)
    feature_cols.remove(id_col)

    num_perturbed = min(c, len(feature_cols))
    if num_perturbed < 1 and len(test_df) > 0:
        # With nothing perturbed the row always exists in ref_df, so the
        # retry loop below would never finish.
        raise ValueError('c must be >= 1 and test_df needs a feature column')

    # Reference set = train + test; a generated anomaly must be in neither.
    ref_df = pd.concat([train_df, test_df], ignore_index=True)

    # Collect rows in a list and build the frame once. DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used in a loop.
    anomaly_rows = []
    for _, row in test_df.iterrows():
        # NOTE: loops until a novel combination is found; this may not
        # terminate if every perturbation already exists in ref_df.
        while True:
            _anomaly_cols = random.sample(feature_cols, k=num_perturbed)
            c_vals = get_c_vals(_anomaly_cols, col_val2id_dict)
            row_copy = pd.Series(row, copy=True)
            for _col, _item_id in c_vals.items():
                row_copy[_col] = _item_id
            if validate(row_copy, ref_df):
                # int() first so a float-typed id such as 123.0 still yields 12301.
                row_copy[id_col] = int(str(int(row_copy[id_col])) + '01')
                anomaly_rows.append(row_copy)
                break

    new_df = pd.DataFrame(
        anomaly_rows,
        columns=list(test_df.columns)
    ).reset_index(drop=True)
    new_df = new_df.drop_duplicates(subset=feature_cols)
    print(' Length of anomalies_df ', len(new_df))
    return new_df

In [14]:
# ------------------- #

In [15]:
def setup_testing_data( test_df, train_df, col_val2id_dict):
    """Encode the raw test frame and derive the normal and anomalous test sets.

    Steps:
      1. Drop test rows containing any value not seen in training, i.e. one
         that is not a key of ``col_val2id_dict[column]``.
      2. Replace the remaining values by their training ids.
      3. Drop test rows that also occur in ``train_df`` (checked by ``validate``).
      4. Generate anomalies from the surviving rows with ``create_anomalies``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Raw (un-encoded) test frame with ``id_col`` plus feature columns.
    train_df : pandas.DataFrame
        Id-encoded training frame.
    col_val2id_dict : dict
        ``{column: {value: id}}`` built from the training data.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(new_test_df, anomalies_df)``.
    """
    global id_col
    print('----')
    feature_cols = list(test_df.columns)
    feature_cols.remove(id_col)

    # Keep only rows whose every feature value is known from training.
    for col in feature_cols:
        valid_items = list(col_val2id_dict[col].keys())
        test_df = test_df.loc[test_df[col].isin(valid_items)]

    print(' Length of testing data' , len(test_df))

    # Copy before assigning columns: test_df is a filtered slice here.
    test_df = test_df.copy()
    for col in feature_cols:
        # Every remaining value is a key of the mapping, so map() leaves no NaN.
        test_df[col] = test_df[col].map(col_val2id_dict[col])

    print(' Length of test df :: ', len(test_df) )

    # Remove rows already present in the training set. Rows are collected in
    # a list and the frame is built once (DataFrame.append no longer exists).
    kept_rows = [
        row for _, row in test_df.iterrows()
        if validate(row, train_df)
    ]
    new_test_df = pd.DataFrame(
        kept_rows,
        columns=list(test_df.columns)
    ).reset_index(drop=True)
    print(' After deduplication :: ', len(new_test_df))

    anomalies_df = create_anomalies(new_test_df, train_df, col_val2id_dict, c=3)
    return new_test_df, anomalies_df

In [16]:
def create_us_import_train_test_sets():
    """Build and save the train, test and anomaly CSVs for the dataset.

    Raw monthly files are collated per split. The training split is cleaned
    (garbage values, then low-frequency values) and converted to integer ids.
    The test split is encoded with the training vocabulary, and anomalies are
    generated from it. Writes ``test_data.csv``, ``train_data.csv`` and
    ``anomalies_test_data.csv`` into ``./../generated_data/<DIR>`` and sets the
    module-level ``save_dir``.
    """
    global use_cols
    global DIR
    global save_dir
    global column_value_filters

    train_files = get_files('train')
    test_files = get_files('test')

    # Merge the monthly files of each split into one frame.
    train_master_df = collate(train_files)
    test_master_df = collate(test_files)

    for label, frame in (
        (' Train initial ', train_master_df),
        (' Test initial ', test_master_df),
    ):
        print(label, len(frame))

    save_dir = os.path.join('./../generated_data', DIR)

    print(len(train_master_df))

    # Remove rows whose value is listed as garbage for its column.
    for col, val in column_value_filters.items():
        is_garbage = train_master_df[col].isin(val)
        train_master_df = train_master_df.loc[~is_garbage]

    print(' Length of training data ', len(train_master_df))

    train_master_df = remove_low_frequency_values(train_master_df)
    train_master_df_1, col_val2id_dict = convert_to_ids(train_master_df, save_dir)
    new_test_df, anomalies_df = setup_testing_data(
        test_master_df, train_master_df_1, col_val2id_dict
    )

    # Save the data
    outputs = (
        (new_test_df, 'test_data.csv'),
        (train_master_df_1, 'train_data.csv'),
        (anomalies_df, 'anomalies_test_data.csv'),
    )
    for frame, file_name in outputs:
        frame.to_csv(os.path.join(save_dir, file_name), index=False)

    return
    

In [20]:
# Build train_data.csv, test_data.csv and anomalies_test_data.csv under
# ./../generated_data/<DIR>.
create_us_import_train_test_sets()


['PanjivaRecordID', 'Carrier', 'ConsigneeCountry', 'ConsigneePanjivaID', 'PortOfLading', 'PortOfUnlading', 'ShipmentDestination', 'ShipmentOrigin', 'ShipperCountry', 'ShipperPanjivaID', 'hscode_6']
['PanjivaRecordID', 'Carrier', 'ConsigneeCountry', 'ConsigneePanjivaID', 'PortOfLading', 'PortOfUnlading', 'ShipmentDestination', 'ShipmentOrigin', 'ShipperCountry', 'ShipperPanjivaID', 'hscode_6']
 Train initial  177242
 Test initial  100900
177242
 Length of training data  177242
Carrier 272
ConsigneeCountry 37
ConsigneePanjivaID 19758
PortOfLading 139
PortOfUnlading 32
ShipmentDestination 51
ShipmentOrigin 39
ShipperCountry 46
ShipperPanjivaID 20545
hscode_6 8
177242
134662


KeyboardInterrupt: 

In [21]:
import math 

def get_neg_sample_ape(_k, column_id, column_name, ref_df, column_valid_values, orig_row, P_A, feature_cols_id ):
    """Generate one APE negative sample by corrupting a single column.

    The value of ``column_name`` in ``orig_row`` is replaced by a random entry
    of ``column_valid_values[column_name]`` until the corrupted row is absent
    from ``ref_df`` (checked by ``validate``).

    Bug fix: the original re-copied ``orig_row`` after validation succeeded,
    which threw the replacement away, so every returned "negative" was
    identical to its positive row. The corrupted row is now returned.

    Parameters
    ----------
    _k : int
        Index of this negative sample for the column; part of the sample id.
    column_id : int
        Key of the corrupted column in ``P_A``; also part of the sample id.
    column_name : str
        Name of the column to corrupt.
    ref_df : pandas.DataFrame
        Records the negative sample must not coincide with.
    column_valid_values : dict
        ``{column_name: list of candidate ids}``.
    orig_row : pandas.Series
        The positive record (``id_col`` plus feature columns).
    P_A : dict
        ``{column_id: {entity_id: probability}}``.
    feature_cols_id : dict
        ``{column_id: column_name}`` for all feature columns.

    Returns
    -------
    pandas.Series
        The corrupted row plus three entries: ``ns_id_col`` (synthetic id),
        ``term_4_col`` (log probability of the replacement entity) and
        ``term_2_col`` (mean log probability of the original row's entities).
    """
    global id_col
    global ns_id_col
    global term_4_col
    global term_2_col

    Pid_val = orig_row[id_col]

    # term 2 depends only on the original row, so compute it once up front.
    term_2 = sum(
        math.log(P_A[_fci][orig_row[_fcn]])
        for _fci, _fcn in feature_cols_id.items()
    ) / len(feature_cols_id)

    # NOTE: retries until a novel row is found; this may not terminate if
    # every candidate replacement already exists in ref_df.
    while True:
        _random = random.sample(
            column_valid_values[column_name], 1
        )[0]
        new_row = pd.Series(orig_row, copy=True)
        new_row[column_name] = _random
        if validate(new_row, ref_df):
            # int() guards against a float-typed id ("123.0") breaking the
            # digit concatenation.
            new_row[ns_id_col] = int(
                '10' + str(_k) + str(column_id) + str(int(Pid_val)) + '01'
            )
            new_row[term_4_col] = np.log(P_A[column_id][_random])
            new_row[term_2_col] = term_2
            return new_row
    

In [22]:
def create_negative_samples_ape_aux(idx, df_chunk, feature_cols, ref_df, column_valid_values, save_dir, P_A, feature_cols_id):
    """Generate APE negative samples for one chunk and write them to a CSV.

    For every row of ``df_chunk`` and every feature column,
    ``num_neg_samples`` negatives are produced by ``get_neg_sample_ape``. The
    result is written to ``<save_dir>/tmp/tmp_df_<idx>.csv``.

    Parameters
    ----------
    idx : int
        Chunk index, used in the output file name.
    df_chunk : pandas.DataFrame
        Training rows handled by this call.
    feature_cols : list of str
        Feature column names. The ``feature_cols_id`` argument is ignored and
        rebuilt from this list, as in the original.
    ref_df, column_valid_values, P_A
        Forwarded to ``get_neg_sample_ape``.
    save_dir : str
        Base output directory.

    Returns
    -------
    str
        Path of the CSV file written.
    """
    global ns_id_col
    global term_4_col
    global term_2_col
    global id_col

    # Re-bound here as in the original; presumably so the names also exist
    # inside joblib worker processes -- TODO confirm.
    ns_id_col='NegSampleID'
    term_2_col = 'term_2'
    term_4_col = 'term_4'

    feature_cols_id = dict(enumerate(feature_cols))
    out_columns = list(ref_df.columns) + [ns_id_col, term_4_col, term_2_col]

    # Build the frame once from a list. DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame for every sample.
    neg_rows = []
    for _, row in df_chunk.iterrows():
        for column_id, column_name in feature_cols_id.items():
            for _k in range(num_neg_samples):
                neg_rows.append(
                    get_neg_sample_ape(
                        _k, column_id, column_name, ref_df, column_valid_values, row, P_A, feature_cols_id
                    )
                )
    new_df = pd.DataFrame(neg_rows, columns=out_columns).reset_index(drop=True)

    # exist_ok avoids the check-then-create race between parallel workers.
    tmp_dir = os.path.join(save_dir, 'tmp')
    os.makedirs(tmp_dir, exist_ok=True)
    f_name = os.path.join(tmp_dir, 'tmp_df_'+str(idx)+'.csv')
    new_df.to_csv(
        f_name,
        index=False
    )

    return f_name


In [23]:
def create_negative_samples_ape():
    """Generate APE negative samples for the whole training set in parallel.

    Reads ``train_data.csv`` and ``domain_dims.pkl`` from
    ``./../generated_data/<DIR>``. Per training row it produces
    ``num_neg_samples`` negatives for each feature column, by replacing one
    entity at random and checking the result against the training set. The
    combined table is written to ``negative_samples_ape_1.csv`` in the same
    directory.

    Returns
    -------
    pandas.DataFrame
        All negative samples, in chunk order.
    """
    global DIR
    global save_dir
    global id_col
    global ns_id_col
    global num_neg_samples
    save_dir = os.path.join(
        './../generated_data',
        DIR
    )

    train_df = pd.read_csv(
        os.path.join(save_dir, 'train_data.csv'),
        index_col=None
    )

    # Negatives are validated against the training set only.
    ref_df = train_df.copy()

    feature_cols = list(train_df.columns)
    feature_cols.remove(id_col)
    feature_cols_id = dict(enumerate(feature_cols))

    # Domain sizes written by convert_to_ids. NOTE: pickle.load runs arbitrary
    # code; only load files produced by this notebook.
    with open(os.path.join(save_dir,'domain_dims.pkl'),'rb') as fh:
        domain_dims = pickle.load(fh)

    print(domain_dims)

    # Empirical probability of each entity per column (used for the 4th
    # term). Ids absent from the data get a floor of 10**-3.
    P_A = {}
    for _fci, _fcn in feature_cols_id.items():
        P_Aa = train_df[_fcn].value_counts(normalize=True).to_dict()
        for _z in range(domain_dims[_fcn]):
            P_Aa.setdefault(_z, math.pow(10, -3))
        P_A[_fci] = P_Aa

    # Valid replacement values for each column.
    column_valid_values = {
        _fc_name: list(set(ref_df[_fc_name]))
        for _fc_name in feature_cols
    }

    # Split into (num_chunks - 1) equal chunks plus one remainder chunk.
    # Positional slicing replaces np.split on a DataFrame, which relies on a
    # deprecated pandas code path.
    num_chunks = 10
    chunk_len = int(len(train_df)/(num_chunks-1))
    bounds = [i * chunk_len for i in range(num_chunks)]
    list_df_chunks = [
        train_df.iloc[bounds[i]:bounds[i + 1]]
        for i in range(num_chunks - 1)
    ]
    list_df_chunks.append(train_df.iloc[bounds[-1]:])
    for _l in range(len(list_df_chunks)):
        print(len(list_df_chunks[_l]), _l)

    results = Parallel(n_jobs = 10)(delayed
        (create_negative_samples_ape_aux)(
            _i, list_df_chunks[_i], feature_cols, ref_df, column_valid_values, save_dir, P_A, feature_cols_id)
            for _i in range(
                len(list_df_chunks)
            )
        )

    # Read the per-chunk files back and stack them with a single concat
    # (DataFrame.append was removed in pandas 2.0).
    frames = []
    running_total = 0
    for _f in results:
        _df = pd.read_csv(_f, index_col = None)
        frames.append(_df)
        running_total += len(_df)
        print(' >> ' , running_total)
    new_df = pd.concat(frames, ignore_index=True)

    new_df.to_csv(os.path.join(save_dir,'negative_samples_ape_1.csv'),index=False)
    return new_df

In [24]:
# Generate the APE negative samples; also writes negative_samples_ape_1.csv.
neg_df = create_negative_samples_ape()

{'Carrier': 502, 'ConsigneeCountry': 67, 'ConsigneePanjivaID': 4197, 'PortOfLading': 225, 'PortOfUnlading': 63, 'ShipmentDestination': 108, 'ShipmentOrigin': 112, 'ShipperCountry': 111, 'ShipperPanjivaID': 5133, 'hscode_6': 93}
14962 0
14962 1
14962 2
14962 3
14962 4
14962 5
14962 6
14962 7
14962 8
4 9
 >>  448860
 >>  897720
 >>  1346580
 >>  1795440
 >>  2244300
 >>  2693160
 >>  3142020
 >>  3590880
 >>  4039740
 >>  4039860


In [25]:
'''
Create numpy arrays 
Store in .pkl files
'''

def create_ape_model_data():
    """Convert the generated CSVs into the numpy arrays used by the APE model.

    Reads ``train_data.csv``, ``negative_samples_ape_1.csv``, ``test_data.csv``
    and ``anomalies_test_data.csv`` from ``./../generated_data/<DIR>`` and
    pickles into the same directory:

    * ``matrix_train_positive.pkl`` - feature matrix of the training rows
    * ``ape_negative_samples.pkl``  - per training row, the feature matrix of
      its negative samples (matched on ``id_col``)
    * ``ape_term_2.pkl``            - per training row, ``term_2`` of its first
      negative sample
    * ``ape_term_4.pkl``            - per training row, ``term_4`` of every
      negative sample
    * ``matrix_test_positive.pkl`` / ``matrix_test_anomalies.pkl`` - test and
      anomaly feature matrices
    * ``test_idList.pkl``           - ``[anomaly ids, normal test ids]``

    Raises
    ------
    ValueError
        If a training row has no negative samples.
    """
    global DIR
    global term_2_col
    global term_4_col
    global save_dir
    global id_col
    global ns_id_col
    save_dir = os.path.join(
        './../generated_data',
        DIR
    )

    def _load_csv(file_name):
        # Read one generated CSV from save_dir.
        return pd.read_csv(os.path.join(save_dir, file_name), index_col=None)

    def _dump(obj, file_name):
        # Pickle obj into save_dir with the highest protocol.
        with open(os.path.join(save_dir, file_name), 'wb') as fh:
            pickle.dump(obj, fh, pickle.HIGHEST_PROTOCOL)

    train_pos_df = _load_csv('train_data.csv')
    test_df = _load_csv('test_data.csv')
    neg_samples_df = _load_csv('negative_samples_ape_1.csv')
    anomalies_df = _load_csv('anomalies_test_data.csv')

    feature_cols = list(train_pos_df.columns)
    feature_cols.remove(id_col)

    # Generated anomalies carry a synthetic ("fake") record id.
    test_anomaly_idList = list(anomalies_df[id_col])
    test_normal_idList = list(test_df[id_col])

    matrix_test = test_df.drop(columns=[id_col]).values
    matrix_anomaly = anomalies_df.drop(columns=[id_col]).values

    # Group the negatives by source id once. The original filtered the whole
    # negatives table again for every training row.
    non_feature = (ns_id_col, id_col, term_2_col, term_4_col)
    neg_feature_cols = [c for c in neg_samples_df.columns if c not in non_feature]
    neg_features = neg_samples_df[neg_feature_cols].values
    neg_term_2 = neg_samples_df[term_2_col].values
    neg_term_4 = neg_samples_df[term_4_col].values
    rows_by_id = neg_samples_df.groupby(id_col, sort=False).indices

    matrix_pos = train_pos_df[feature_cols].values
    matrix_neg = []
    term_2 = []
    term_4 = []
    for pid in train_pos_df[id_col]:
        if pid not in rows_by_id:
            raise ValueError('no negative samples for record id %s' % pid)
        positions = rows_by_id[pid]
        matrix_neg.append(neg_features[positions])
        # term_2 is taken from the first negative of the record.
        term_2.append(neg_term_2[positions[0]])
        term_4.append(neg_term_4[positions])

    matrix_neg = np.array(matrix_neg)
    term_2 = np.array(term_2)
    term_4 = np.array(term_4)

    print(matrix_pos.shape, matrix_neg.shape)
    print(term_2.shape, term_4.shape)

    # Save files
    _dump(matrix_pos, 'matrix_train_positive.pkl')
    _dump(matrix_neg, 'ape_negative_samples.pkl')
    _dump(term_2, 'ape_term_2.pkl')
    _dump(term_4, 'ape_term_4.pkl')
    _dump(matrix_test, 'matrix_test_positive.pkl')
    _dump(matrix_anomaly, 'matrix_test_anomalies.pkl')
    _dump([test_anomaly_idList, test_normal_idList], 'test_idList.pkl')
        
        
    
        

In [26]:
# Pickle the APE training/test matrices; prints the resulting array shapes.
create_ape_model_data()

(134662, 10) (134662, 30, 10)
(134662,) (134662, 30)


 # --------------------------------------------------------------- #

In [27]:
def get_neg_sample_v1(
    _k,
    ref_df,
    column_valid_values,
    orig_row,
    feature_cols_id
):
    """Create one "v1" negative sample by corrupting several columns at once.

    The number of corrupted columns is drawn once, uniformly from
    ``1 .. int(number of features / 2)``. On each attempt that many columns are
    picked at random and each gets a random entry of
    ``column_valid_values[column]``. Attempts repeat until ``validate`` reports
    that the corrupted row is not in ``ref_df``. The returned Series is the
    corrupted row plus ``ns_id_col``, set to the digits of the source id
    followed by ``'01'`` and ``_k``.
    """
    global id_col
    global ns_id_col

    Pid_val = orig_row[id_col]
    num_randomizations = random.randint(1, int(len(feature_cols_id) / 2))

    # Keep trying until the corrupted row is genuinely new.
    while True:
        picked_keys = random.sample(
            list(feature_cols_id.keys()),
            k=num_randomizations
        )

        # Draw all replacement values first, in pick order.
        replacements = {}
        for key in picked_keys:
            column = feature_cols_id[key]
            replacements[column] = random.sample(column_valid_values[column], 1)[0]

        candidate = pd.Series(orig_row, copy=True)
        for column, item_id in replacements.items():
            candidate[column] = item_id

        if not validate(candidate, ref_df):
            continue

        candidate[ns_id_col] = int(str(Pid_val) + '01' + str(_k))
        return candidate
    

In [28]:
def create_negative_samples_v1_aux(idx, df_chunk, feature_cols, ref_df, column_valid_values, save_dir, feature_cols_id , num_neg_samples_v1):
    """Generate "v1" negative samples for one chunk and write them to a CSV.

    For every row of ``df_chunk``, ``num_neg_samples_v1`` negatives are
    produced by ``get_neg_sample_v1``. The result is written to
    ``<save_dir>/tmp/tmp_df_<idx>.csv``.

    Parameters
    ----------
    idx : int
        Chunk index, used in the output file name.
    df_chunk : pandas.DataFrame
        Training rows handled by this call.
    feature_cols : list of str
        Feature column names. The ``feature_cols_id`` argument is ignored and
        rebuilt from this list, as in the original.
    ref_df, column_valid_values
        Forwarded to ``get_neg_sample_v1``.
    save_dir : str
        Base output directory.
    num_neg_samples_v1 : int
        Negatives to generate per training row.

    Returns
    -------
    str
        Path of the CSV file written.
    """
    global ns_id_col
    global id_col

    # Re-bound here as in the original; presumably so the name also exists
    # inside joblib worker processes -- TODO confirm.
    ns_id_col='NegSampleID'

    feature_cols_id = dict(enumerate(feature_cols))
    out_columns = list(ref_df.columns) + [ns_id_col]

    # Build the frame once from a list. DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame for every sample.
    neg_rows = []
    for _, row in df_chunk.iterrows():
        for _k in range(num_neg_samples_v1):
            neg_rows.append(
                get_neg_sample_v1(
                    _k, ref_df, column_valid_values, row, feature_cols_id
                )
            )
    new_df = pd.DataFrame(neg_rows, columns=out_columns).reset_index(drop=True)

    # exist_ok avoids the check-then-create race between parallel workers.
    tmp_dir = os.path.join(save_dir, 'tmp')
    os.makedirs(tmp_dir, exist_ok=True)
    f_name = os.path.join(tmp_dir, 'tmp_df_'+str(idx)+'.csv')
    new_df.to_csv(
        f_name,
        index=False
    )

    return f_name

In [29]:
def create_negative_samples_v1():
    """Generate "v1" negative samples for the whole training set in parallel.

    Reads ``train_data.csv`` from ``./../generated_data/<DIR>``. For each
    training row it creates 15 negatives; each replaces m random entities,
    with m drawn from 1..d/2 (d = number of feature columns), and is validated
    against the training set. The combined table is written to
    ``negative_samples_v1.csv`` in the same directory.

    Returns
    -------
    pandas.DataFrame
        All negative samples, in chunk order.
    """
    global DIR
    global save_dir
    global id_col
    global ns_id_col

    save_dir = os.path.join(
        './../generated_data',
        DIR
    )

    train_df = pd.read_csv(
        os.path.join(save_dir, 'train_data.csv'),
        index_col=None
    )

    # Negatives are validated against the training set only.
    ref_df = train_df.copy()
    num_neg_samples_v1 = 15

    feature_cols = list(train_df.columns)
    feature_cols.remove(id_col)
    feature_cols_id = dict(enumerate(feature_cols))

    # The original also unpickled domain_dims.pkl here but never used it;
    # that load has been dropped.

    # Valid replacement values for each column.
    column_valid_values = {
        _fc_name: list(set(ref_df[_fc_name]))
        for _fc_name in feature_cols
    }

    # Split into (num_chunks - 1) equal chunks plus one remainder chunk.
    # Positional slicing replaces np.split on a DataFrame, which relies on a
    # deprecated pandas code path.
    num_chunks = 10
    chunk_len = int(len(train_df)/(num_chunks-1))
    bounds = [i * chunk_len for i in range(num_chunks)]
    list_df_chunks = [
        train_df.iloc[bounds[i]:bounds[i + 1]]
        for i in range(num_chunks - 1)
    ]
    list_df_chunks.append(train_df.iloc[bounds[-1]:])
    for _l in range(len(list_df_chunks)):
        print(len(list_df_chunks[_l]), _l)

    results = Parallel(n_jobs = 10)(delayed
        (create_negative_samples_v1_aux)(
            _i, list_df_chunks[_i], feature_cols, ref_df, column_valid_values, save_dir, feature_cols_id, num_neg_samples_v1)
            for _i in range(
                len(list_df_chunks)
            )
        )

    # Read the per-chunk files back and stack them with a single concat
    # (DataFrame.append was removed in pandas 2.0).
    frames = []
    running_total = 0
    for _f in results:
        _df = pd.read_csv(_f, index_col = None)
        frames.append(_df)
        running_total += len(_df)
        print(' >> ' , running_total)
    new_df = pd.concat(frames, ignore_index=True)

    new_df.to_csv(os.path.join(save_dir,'negative_samples_v1.csv'),index=False)
    return new_df

In [30]:
# Generate the "v1" negative samples; also writes negative_samples_v1.csv.
new_df = create_negative_samples_v1()

14962 0
14962 1
14962 2
14962 3
14962 4
14962 5
14962 6
14962 7
14962 8
4 9
 >>  224430
 >>  448860
 >>  673290
 >>  897720
 >>  1122150
 >>  1346580
 >>  1571010
 >>  1795440
 >>  2019870
 >>  2019930


In [31]:

def create_model_data_v1():
    """Convert the "v1" CSVs into the numpy arrays used for training.

    Reads ``train_data.csv`` and ``negative_samples_v1.csv`` from
    ``./../generated_data/<DIR>`` and pickles into the same directory:

    * ``matrix_train_positive_v1.pkl`` - feature matrix of the training rows
    * ``negative_samples_v1.pkl``      - per training row, the feature matrix
      of its negative samples (matched on ``id_col``)

    Raises
    ------
    ValueError
        If a training row has no negative samples.
    """
    global DIR
    global save_dir
    global id_col
    global ns_id_col
    save_dir = os.path.join(
        './../generated_data',
        DIR
    )

    def _dump(obj, file_name):
        # Pickle obj into save_dir with the highest protocol.
        with open(os.path.join(save_dir, file_name), 'wb') as fh:
            pickle.dump(obj, fh, pickle.HIGHEST_PROTOCOL)

    train_pos_df = pd.read_csv(
        os.path.join(save_dir, 'train_data.csv'),
        index_col=None
    )
    neg_samples_df = pd.read_csv(
        os.path.join(save_dir, 'negative_samples_v1.csv'),
        index_col=None
    )

    feature_cols = list(train_pos_df.columns)
    feature_cols.remove(id_col)

    # The original wrapped `del test_df[id_col]` / `del anomalies_df[id_col]`
    # in a bare try/except. Those names are not defined in this function, so
    # it either did nothing or mutated a leaked kernel global. Removed.

    # Group the negatives by source id once. The original filtered the whole
    # negatives table again for every training row.
    neg_feature_cols = [
        c for c in neg_samples_df.columns
        if c not in (ns_id_col, id_col)
    ]
    neg_features = neg_samples_df[neg_feature_cols].values
    rows_by_id = neg_samples_df.groupby(id_col, sort=False).indices

    matrix_pos = train_pos_df[feature_cols].values
    matrix_neg = []
    for pid in train_pos_df[id_col]:
        if pid not in rows_by_id:
            raise ValueError('no negative samples for record id %s' % pid)
        matrix_neg.append(neg_features[rows_by_id[pid]])
    matrix_neg = np.array(matrix_neg)

    print(matrix_pos.shape, matrix_neg.shape)

    # Save files
    _dump(matrix_pos, 'matrix_train_positive_v1.pkl')
    _dump(matrix_neg, 'negative_samples_v1.pkl')

    
    
        
    
 

In [32]:
 # Pickle the "v1" training matrices; prints the resulting array shapes.
 create_model_data_v1()

(134662, 10) (134662, 15, 10)
