In [173]:
import os
import sys
import pandas as pd
import numpy as np
import sklearn
import glob
import pickle
import random
from joblib import Parallel, delayed

In [174]:
DIR = 'china_import'

# Let us create APE style test & training sets
## china import :  Train on 2015(02-07) Test(08-09)

In [148]:
def get_files(_type='all'):
    data_dir = os.path.join(
        './../wwf_data_v1',
        DIR
    )
    if _type == 'train':
        files = sorted(glob.glob(os.path.join(data_dir,'*0[1-7]*2015*.csv')))
    elif _type == 'test':
        files = sorted(glob.glob(os.path.join(data_dir,'*0[1-7]*2016*.csv')))
            
    else:
        files = sorted(glob.glob(os.path.join(data_dir,'*0[2-9]*2015*.csv')))
        
        
    return files

In [151]:
use_cols = [ 
    'PanjivaRecordID',
    'ConsigneePanjivaID',
    'ShipmentOrigin',
    'Province',
    'CountryOfSale',
    'TransportMethod',
    'AdminRegion',
    'TradeType',
    'hscode_6'
]

freq_bound = 5
column_value_filters = {
    'ShipmentOrigin' : ['Countries (reg.) un-known'],
    'CountryOfSale' : ['Countries (reg.) un-known'] 
}
id_col = 'PanjivaRecordID'
num_neg_samples = 3

In [152]:
def replace_attr_with_id(row, attr, val2id_dict):
    val = row[attr]
    if val not in val2id_dict.keys():
        print(attr,val)
        return None
    else:
        return val2id_dict[val]

In [153]:
def convert_to_ids(
    df,  
    save_dir
):
    global id_col
    
    feature_columns = list(df.columns)
    feature_columns.remove(id_col)
    domain_dims_dict = {}
    col_val2id_dict = {}
  
    for col in sorted(feature_columns):
        
        vals = list(set(df[col]))
        # 0 : item1 , 1 :item2, ....
        id2val_dict = {
            e[0]: e[1]
            for e in enumerate(vals, 0)
        }

        # file_name = col+'_id2val.pkl'
        # with open(
        #   os.path.join(
        #   save_dir,
        #   file_name
        #   ), 'wb') as f:
        # pickle.dump(id2val_dict, f, pickle.HIGHEST_PROTOCOL)
        
        # item1 : 0, item2 : 1, ...
        val2id_dict = {v: k for k, v in id2val_dict.items()}
        
        col_val2id_dict[col] = val2id_dict
        
        # replace
        df[col] = df.apply(
            replace_attr_with_id,
            axis=1,
            args=(
                col,
                val2id_dict,
            )
        )
        domain_dims_dict[col] = len(id2val_dict)
    domain_dims = []
    domain_dims_res = {}
    print(list(df.columns))
    
    for col in list(df.columns):
        if col in domain_dims_dict.keys():
            print(col)
            domain_dims_res[col] = domain_dims_dict[col]
            domain_dims.append(domain_dims_dict[col])

    domain_dims =  np.array(domain_dims)
    print(domain_dims_res)

    file = 'domain_dims.pkl'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
        
    f_path = os.path.join(save_dir, file)
    
    with open(f_path,'wb') as fh:
        pickle.dump(
            domain_dims_res,
            fh,
            pickle.HIGHEST_PROTOCOL
        )
    return df, col_val2id_dict

In [154]:
def collate(file_list):
    _master_df = None
    for file in file_list:
        _df = pd.read_csv(
            file, 
            low_memory=False,
            usecols = use_cols
        )
        _df = _df.dropna()
        if _master_df is None:
            _master_df = pd.DataFrame(_df)
        else:
            _master_df = _master_df.append(
                _df,
                ignore_index=True
            )
    return _master_df

In [155]:
def remove_low_frequency_values(_df):
    global id_col
    global freq_bound 
    from collections import Counter
    
    freq_column_value_filters = {}
    
    feature_cols = list(_df.columns)
    feature_cols.remove(id_col)
    
    for c in feature_cols:
        values = list(_df[c])
        freq_column_value_filters [c] = []
        items = set(values)
        obj_counter = Counter(values)
        for _item, _count in  obj_counter.items():
            if _count < freq_bound :
                  freq_column_value_filters[c].append(_item)
    
    for c,_items in freq_column_value_filters.items():
        print(c, len(_items))
    print(len(_df))
    for col,val in freq_column_value_filters.items():
        _df = _df.loc[
            (~_df[col].isin(val))
        ]
    print(len(_df))   
    return _df

In [None]:
def validate(row,ref_df):
    global id_col
    query_str = []
    for _c, _i in row.to_dict().items():
        if _c == id_col: 
            continue
        query_str.append(' ' + _c + ' == ' + str(_i) )
    query_str = ' & '.join(query_str)
    res_query = ref_df.query(query_str)
  
    if len(res_query) > 0 : 
        return False
    return True 


In [158]:
'''
returns c random items as a dict
column_name : item_id
'''

def  get_c_vals(anomaly_cols, col_val2id_dict):
    res_dict = {}
    for col in anomaly_cols:
        res_dict[col] = random.sample(list(col_val2id_dict[col].values()), 1)[0]
    return res_dict

In [127]:
def create_anomalies(test_df, train_df, col_val2id_dict, c=3):
    global id_col
    feature_cols = list(test_df.columns)
    feature_cols.remove(id_col)
    feature_cols_id = { e[0]:e[1] for e in enumerate(feature_cols)}
    ref_df = pd.DataFrame(train_df)
    ref_df = ref_df.append(
        test_df,
        ignore_index=True
    )
    new_df = pd.DataFrame(columns=list(test_df.columns))
    for i,row  in test_df.iterrows():
        _anomaly_cols = [feature_cols_id[_] 
                             for _ in random.sample(
                                 list(feature_cols_id.keys()), 
                                 k=3
                             )
                        ]
        while True:
            c_vals = get_c_vals(_anomaly_cols, col_val2id_dict)
            for _col, _item_id in c_vals.items():
                row[_col] = _item_id
            if validate(row,ref_df):
                row[id_col] = int( str(row[id_col]) + '1' )
                new_df = new_df.append(row,ignore_index=True)
                break;
        
    # sample c cols
    new_df = new_df.drop_duplicates(subset=feature_cols)
    print(' Length of anomalies_df ',new_df)
    return new_df

In [128]:
def setup_testing_data( test_df, train_df, col_val2id_dict):
    global id_col
    # Replace with None if ids are not in train_set
    print('----')
    feature_cols = list(test_df.columns)
    feature_cols.remove(id_col)
    
    for col in feature_cols:
        valid_items = list(col_val2id_dict[col].keys())
        test_df = test_df.loc[test_df[col].isin(valid_items)]
        
    print(' Length of testing data' , len(test_df))
    
    
    # First convert to to ids
    for col in feature_cols:
        val2id_dict = col_val2id_dict[col]
        test_df[col] = test_df.apply(
                replace_attr_with_id,
                axis = 1,
                args = (
                    col,
                    val2id_dict,
                )
            )
    '''
    Remove duplicates :
    '''
    
    print(' Length of test df :: ', len(test_df) )
    new_test_df = pd.DataFrame( columns= list(test_df.columns))
    
    for i,row in test_df.iterrows():
        if validate(row, train_df):
            new_test_df = new_test_df.append(row,ignore_index=True)
            print(len(new_test_df))
    print(' After deduplication :: ', len(new_test_df))
    
    anomalies_df = create_anomalies(test_df, train_df, col_val2id_dict, c=3)
    return new_test_df, anomalies_df

In [130]:
def create_china_import_train_test_sets():
    global use_cols
    global DIR
    global save_dir
    global column_value_filters
    train_files = get_files('train')
    test_files = get_files('test')
  
    # combine train_data :
    train_master_df = collate(train_files)
    test_master_df = collate(test_files)
    
    print(' Train initial ', len(train_master_df)) 
    print(' Test initial ', len(test_master_df)) 
            
    save_dir = os.path.join('./../generated_data',DIR)
    
    '''
    test data preprocessing
    '''
    print(len(train_master_df))
    
    '''
    Remove values that are garbage
    '''
    for col,val in column_value_filters.items():
        train_master_df = train_master_df.loc[
            (~train_master_df[col].isin(val))
        ]
         
    print(' Length of training data ', len(train_master_df))  
    
    train_master_df = remove_low_frequency_values(
        train_master_df
    )
    
    train_master_df_1, col_val2id_dict = convert_to_ids(
        train_master_df,
        save_dir
    )
    
    new_test_df, anomalies_df = setup_testing_data(
        test_master_df,
        train_master_df_1,
        col_val2id_dict
    )
    
    # Save the data
    new_test_df.to_csv(os.path.join(save_dir,'test_data.csv'),index=False)
    train_master_df_1.to_csv(os.path.join(save_dir,'train_data.csv'),index=False)
    anomalies_df.to_csv(os.path.join(save_dir,'anomalies_test_data.csv'),index=False)
    
    return 
    

In [131]:
create_china_import_train_test_sets()


 Train initial  66926
 Test initial  74469
66926
 Length of training data  66917
ConsigneePanjivaID 4801
ShipmentOrigin 19
Province 1
CountryOfSale 24
TransportMethod 0
hscode_6 1
AdminRegion 104
TradeType 5
66917
57856
['PanjivaRecordID', 'ConsigneePanjivaID', 'ShipmentOrigin', 'Province', 'CountryOfSale', 'TransportMethod', 'hscode_6', 'AdminRegion', 'TradeType']
ConsigneePanjivaID
ShipmentOrigin
Province
CountryOfSale
TransportMethod
hscode_6
AdminRegion
TradeType
{'ConsigneePanjivaID': 3375, 'ShipmentOrigin': 115, 'Province': 30, 'CountryOfSale': 107, 'TransportMethod': 6, 'hscode_6': 74, 'AdminRegion': 321, 'TradeType': 9}
----
 Length of testing data 52642
 Length of test df ::  52642
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102


In [186]:
def get_neg_sample(_k, column_id, column_name, ref_df, column_valid_values, orig_row):
    global id_col
    ns_id_col='NegSampleID'
    new_row = pd.Series(orig_row,copy=True)
    Pid_val = orig_row[id_col]
    while True:
        _random = random.sample(
            column_valid_values[column_name], 1
        )[0]
        new_row[column_name] = _random
        if validate(new_row, ref_df):
            new_row = pd.Series(orig_row,copy=True)
            new_row[ns_id_col] = int( '10' + str(_k) + str(column_id) + str(Pid_val) + '01' )
            return new_row   
    

In [187]:
def create_negative_samples():
    global DIR
    global save_dir
    global id_col
    global num_neg_samples
    save_dir = os.path.join(
        './../generated_data',
        DIR
    )
    
    train_data_file = os.path.join(save_dir, 'train_data.csv')
    test_data_file = os.path.join(save_dir, 'test_data.csv')
    
    train_df = pd.read_csv(
        train_data_file, 
        index_col=None
    )
    
    test_df = pd.read_csv(
        test_data_file, 
        index_col=None
    )
    '''
    Randomly generate samples
    choose k=3 * m=7 = 21 negative samples per training instance
    For negative samples pick one entity & replace it it randomly 
    Validate if generated negative sample is not part of the test or training set
    '''
   
    feature_cols = list(test_df.columns)
    feature_cols.remove(id_col)
    feature_cols_id = { 
        e[0]:e[1] 
        for e in enumerate(feature_cols)
    }
    
    ref_df = pd.DataFrame(train_df,copy=True)
    #     ref_df = ref_df.append(
    #         test_df,
    #         ignore_index=True
    #     )
    
    # Store what are valid values for each columns
    column_valid_values = {}
    for _fc_name in feature_cols:
        column_valid_values[_fc_name] = list(set(list(ref_df[_fc_name])))
        
    new_df = pd.DataFrame(
        columns=list(test_df.columns)
    )
    ns_id_col='NegSampleID'
    new_df[ns_id_col] = 0
    
    for i,row  in train_df.iterrows():
        # for each column
        Pid_val = row[id_col]
        for column_id,column_name in feature_cols_id.items():
            results = Parallel(n_jobs=4)(
                delayed(get_neg_sample)(
                    _k, column_id, column_name, ref_df, column_valid_values, row) 
                for _k in range(num_neg_samples)
            )
            for _res in results:
                new_df = new_df.append(
                    _res,
                    ignore_index=True
                )    
        print(' >> ' ,len(new_df))
    
    new_df.to_csv(os.path.join(save_dir,'negative_samples.csv'),index=False)
    return new_df

In [None]:
neg_df = create_negative_samples()

 >>  24
 >>  48
 >>  72
 >>  96
 >>  120
 >>  144
 >>  168
 >>  192
 >>  216
 >>  240
 >>  264
 >>  288
 >>  312
 >>  336
 >>  360
 >>  384
 >>  408
 >>  432
 >>  456
 >>  480
 >>  504
 >>  528
 >>  552
 >>  576
 >>  600
 >>  624
 >>  648
 >>  672
 >>  696
 >>  720
 >>  744
 >>  768
 >>  792
 >>  816
 >>  840
 >>  864
 >>  888
 >>  912
 >>  936
 >>  960
 >>  984
 >>  1008
 >>  1032
 >>  1056
 >>  1080
 >>  1104
 >>  1128
 >>  1152
 >>  1176
 >>  1200
 >>  1224
 >>  1248
 >>  1272
 >>  1296
 >>  1320
 >>  1344
 >>  1368
 >>  1392
 >>  1416
 >>  1440
 >>  1464
 >>  1488
 >>  1512
 >>  1536
 >>  1560
 >>  1584
 >>  1608
 >>  1632
 >>  1656
 >>  1680
 >>  1704
 >>  1728
 >>  1752
 >>  1776
 >>  1800
 >>  1824
 >>  1848
 >>  1872
 >>  1896
 >>  1920
 >>  1944
 >>  1968
 >>  1992
 >>  2016
 >>  2040
 >>  2064
 >>  2088
 >>  2112
 >>  2136
 >>  2160
 >>  2184
 >>  2208
 >>  2232
 >>  2256
 >>  2280
 >>  2304
 >>  2328
 >>  2352
 >>  2376
 >>  2400
 >>  2424
 >>  2448
 >>  2472
 >>  2496
 >>  

In [None]:
'''
Create numpy arrays 
Store in .pkl files
'''

def create_model_data():
    global DIR
    global save_dir
    global id_col
    global num_neg_samples
    save_dir = os.path.join(
        './../generated_data',
        DIR
    )
    
    train_pos_data_file = os.path.join(save_dir, 'train_data.csv')
    train_neg_data_file = os.path.join(save_dir, 'train_neg.csv')
    test_data_file = os.path.join(save_dir, 'test_data.csv')
    anomalies_data_file = os.path.join(save_dir, 'anomalies_test_data.csv')
    
    train_pos_df = pd.read_csv(
        train_pos_data_file, 
        index_col=None
    )
    
    test_df = pd.read_csv(
        test_data_file, 
        index_col=None
    )
    
    neg_samples_df = pd.read_csv(
        train_neg_data_file, 
        index_col=None
    )
    
    anomalies_df = pd.read_csv(
        anomalies_data_file,
        index_col=None
    )
    
    feature_cols =  list(train_pos_df.columns)
    feature_cols.remove(id_col)
    neg_samples = num_neg_samples * len(feature_cols)
    
    
    test_anomly_idList = list(anomalies_df[id_col])
    test_normal_idList = list(test_df[id_col])
    
    try:
        del test_df[id_col] 
        del anomalies_df[id_col]
    except:
        pass
    
    matrix_test = test_df.values
    matrix_anomaly = anomalies_df.values
    
    num_data_pts = matrix_positive.shape[0]
    num_domains = matrix_positive.shape[-1]
    
    matrix_pos = np.array([num_data_pts, num_domains])
    matrix_neg = np.array([num_data_pts, neg_samples, num_domains])
    
    
    index = 0 
    for i,row in train_pos_df.iterrows():
        _tmp = neg_samples_df.loc[neg_samples_df[id_col]==row[id_col]]
        del _tmp['NegSampleID']
        del _tmp[id_col]
        del row[id_col]
        
        vals = _tmp.values
        
        matrix_neg[index] = _tmp.values
        matrix_pos[index] = row.values
        index += 1   
    
  
    # Save files
    f_path =  os.path.join(save_dir,'matrix_train_positive.pkl')
    with open(f_path,'wb') as fh:
        pickle.dump(
            matrix_pos,
            fh,
            pickle.HIGHEST_PROTOCOL
        )
    f_path =  os.path.join(save_dir,'matrix_trian_negative.pkl')
    with open(f_path,'wb') as fh:
        pickle.dump(
            matrix_neg,
            fh,
            pickle.HIGHEST_PROTOCOL
    )
        
    f_path =  os.path.join(save_dir,'matrix_test_positive.pkl')
    with open(f_path,'wb') as fh:
        pickle.dump(
            matrix_test,
            fh,
            pickle.HIGHEST_PROTOCOL
    )
        
    
    f_path =  os.path.join(save_dir,'matrix_test_anomalies.pkl')
    with open(f_path,'wb') as fh:
        pickle.dump(
            matrix_anomaly,
            fh,
            pickle.HIGHEST_PROTOCOL
    )
        
    f_path =  os.path.join(save_dir,'test_idList.pkl')
    with open(f_path,'wb') as fh:
        pickle.dump(
            [test_anomly_idList, test_normal_idList],
            fh,
            pickle.HIGHEST_PROTOCOL
    )
        
        
    
        

In [None]:
create_model_data()