In [1]:
import os
import sys
import pandas as pd
import numpy as np
import sklearn
import glob
import pickle
import random
from joblib import Parallel, delayed
import yaml
import math


In [39]:
# Dataset sub-directory under the data root; get_regex() also keys its
# train/test filename patterns on this value.
DIR = 'china_import'
# Name of the record-identifier column; it is excluded from the feature columns.
id_col = 'PanjivaRecordID'
# Minimum occurrence count for a feature value to be kept.
# NOTE: remove_low_frequency_values() overwrites this global with its `fr` argument.
freq_bound = 2

In [40]:

def get_regex(_type):
    """Return the glob pattern selecting the CSV files of one data split.

    The pattern depends on the module-level ``DIR`` (the dataset name) and on
    the requested split. Despite the function name, the returned strings are
    shell-style glob patterns (as consumed by ``glob.glob``), not regular
    expressions.

    Parameters
    ----------
    _type : str
        ``'train'`` or ``'test'``. Any other value (e.g. ``'all'``) selects
        every CSV file.

    Returns
    -------
    str
        The glob pattern for ``DIR`` / ``_type``; ``'*.csv'`` when either the
        dataset or the split has no dedicated pattern.
    """
    patterns = {
        'us_import': {
            'train': '*0[1-4]**2015*.csv',
            'test': '*0[5-6]*2015*.csv',
        },
        'china_import': {
            'train': '*0[1-9]*2015*.csv',
            'test': '*0[1-6]*2016*.csv',
        },
        'peru_export': {
            'train': '*201[5-6]*.csv',
            # Was '[7,8]': inside a glob character class the comma is a
            # literal member, so the old pattern also matched '201,'.
            'test': '*201[78]*.csv',
        },
        'china_export': {
            'train': '*0[1-4]*2015*.csv',
            'test': '*0[5-6]*2015*.csv',
        },
    }
    # Unknown dataset or unknown split both fall back to "every CSV file".
    return patterns.get(DIR, {}).get(_type, '*.csv')


In [41]:

def get_files(_type='all'):
    """List the data files of the current dataset for one split, sorted by path.

    The files are searched in ``./../../wwf_data_v1/<DIR>`` using the glob
    pattern that ``get_regex(_type)`` returns. The resulting list is printed
    before being returned.

    Parameters
    ----------
    _type : str, default 'all'
        Split name forwarded to ``get_regex``.

    Returns
    -------
    list of str
        Matching file paths in ascending order.
    """
    pattern = os.path.join('./../../wwf_data_v1', DIR, get_regex(_type))
    files = glob.glob(pattern)
    files.sort()
    print(files)
    return files



In [46]:
# Disabled alternative column set (presumably for a different dataset than
# the one currently selected by DIR -- TODO confirm):
# use_cols = [
#         'PanjivaRecordID',
#         'ShipmentDestination',
#         'hscode_6',
#         'ShipperPanjivaID',
#         'PortOfUnladingUNLOCODE',
#         'CustomsCode',
#         'TransportMethod',
#         'LocationCode'
#      ]

# Columns that collate() reads from every CSV file (passed as `usecols`).
# The list must contain id_col, because collate() removes it by name.
use_cols = [
    'PanjivaRecordID',
    'ConsigneePanjivaID',
    'ShipmentOrigin',
    'Province',
    'CountryOfSale',
    'TransportMethod',
    'AdminRegion',
    'TradeType',
    'hscode_6',
]

In [47]:
def collate(file_list):
    """Read several CSV files and stack them into a single DataFrame.

    Only the columns named in the module-level ``use_cols`` are read. Rows
    with a missing value in any of those columns are dropped per file. In the
    result the id column (module-level ``id_col``) comes first, followed by
    the remaining columns in alphabetical order. Both the requested columns
    and the final column order are printed.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the CSV files to read.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows with a fresh 0..n-1 index. An empty
        ``file_list`` yields an empty frame with the same column layout
        instead of failing.

    Raises
    ------
    ValueError
        If ``id_col`` is not among the columns that were read.
    """
    print(use_cols)

    # Read everything first and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and appending inside the loop re-copies the accumulated
    # frame on every iteration.
    frames = [
        pd.read_csv(file, low_memory=False, usecols=use_cols).dropna()
        for file in file_list
    ]
    if frames:
        merged = pd.concat(frames, ignore_index=True)
    else:
        # Nothing to read: keep the expected columns so callers still get a frame.
        merged = pd.DataFrame(columns=list(use_cols))

    feature_cols = list(merged.columns)
    feature_cols.remove(id_col)
    all_cols = [id_col] + sorted(feature_cols)
    print(all_cols)
    return merged[all_cols]

In [48]:
def remove_low_frequency_values(_df,fr=5):
    """Drop every row that holds a rarely occurring value in any feature column.

    For each column except the id column (module-level ``id_col``), the values
    occurring fewer than ``fr`` times in the *input* frame are collected.
    Afterwards all rows containing at least one such value are removed.
    Counts are not recomputed after rows have been dropped.

    Side effects: the module-level ``freq_bound`` is overwritten with ``fr``.
    The function also prints the number of rare values per column, the input
    row count, and the number of distinct values left in each feature column.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame containing ``id_col`` plus the feature columns.
    fr : int, default 5
        Minimum number of occurrences a value needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered rows; the original index labels are retained.
    """
    global freq_bound
    from collections import Counter

    # The threshold is published to the module-level setting as well.
    freq_bound = fr

    candidates = [name for name in _df.columns]
    candidates.remove(id_col)

    # Per feature column: the values whose count falls below the threshold.
    rare_values = {
        name: [
            value
            for value, seen in Counter(list(_df[name])).items()
            if seen < freq_bound
        ]
        for name in candidates
    }

    for name, values in rare_values.items():
        print(name, len(values))
    print(len(_df))

    kept = _df
    for name, values in rare_values.items():
        is_rare = kept[name].isin(values)
        kept = kept[~is_rare]

    for name in kept.columns:
        if name != id_col:
            print(name, ':: ', len(set(kept[name])))
    return kept

In [53]:
# Locate the training-split files of the dataset selected by DIR and merge
# them into one frame.
files = get_files(_type='train')
df = collate(files)
print('>>>',len(df))
# Drop the rows that contain any feature value occurring fewer than 3 times.
# NOTE(review): the result is bound to `_`, a name IPython also uses for the
# most recent cell output.
_ = remove_low_frequency_values(
        df,3
    )
# Number of rows left after filtering (shown as the cell's output).
len(_)

['./../../wwf_data_v1/china_import/panjiva_china_imports_01_2015_filtered.csv', './../../wwf_data_v1/china_import/panjiva_china_imports_02_2015_filtered.csv', './../../wwf_data_v1/china_import/panjiva_china_imports_03_2015_filtered.csv', './../../wwf_data_v1/china_import/panjiva_china_imports_04_2015_filtered.csv', './../../wwf_data_v1/china_import/panjiva_china_imports_05_2015_filtered.csv', './../../wwf_data_v1/china_import/panjiva_china_imports_06_2015_filtered.csv', './../../wwf_data_v1/china_import/panjiva_china_imports_07_2015_filtered.csv', './../../wwf_data_v1/china_import/panjiva_china_imports_08_2015_filtered.csv', './../../wwf_data_v1/china_import/panjiva_china_imports_09_2015_filtered.csv']
['PanjivaRecordID', 'ConsigneePanjivaID', 'ShipmentOrigin', 'Province', 'CountryOfSale', 'TransportMethod', 'AdminRegion', 'TradeType', 'hscode_6']
['PanjivaRecordID', 'AdminRegion', 'ConsigneePanjivaID', 'CountryOfSale', 'Province', 'ShipmentOrigin', 'TradeType', 'TransportMethod', 'hsc

81907