In [1]:
import pandas as pd
import os
import numpy as np
import yaml
import sys
import pathlib
from pathlib import Path
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
DIR = 'us_import1'
fpath = os.path.join('./../generated_data_v1/' + DIR + '/serialized_train_data.csv')
OP_DIR = os.path.join('./processed')

_path = Path(str(OP_DIR))
OP_DIR = os.path.join('processed')
_path = Path(str(OP_DIR))
_path.mkdir( parents=True, exist_ok=True)
df = pd.read_csv(fpath,index_col=None)
print(' >> ', df.columns)

 >>  Index(['PanjivaRecordID', 'Carrier', 'ConsigneePanjivaID', 'HSCode',
       'PortOfLading', 'PortOfUnlading', 'ShipmentDestination',
       'ShipmentOrigin', 'ShipperPanjivaID'],
      dtype='object')


In [3]:
entity_types = ['ConsigneePanjivaID','HSCode','PortOfLading','PortOfUnlading','ShipmentDestination','ShipmentOrigin','ShipperPanjivaID']
df = df[entity_types]

In [4]:
domain_dims = {}
for e in entity_types:
    domain_dims[e] = len(set(df[e]))

In [5]:
domain_dims

{'ConsigneePanjivaID': 8665,
 'HSCode': 10,
 'PortOfLading': 308,
 'PortOfUnlading': 78,
 'ShipmentDestination': 149,
 'ShipmentOrigin': 141,
 'ShipperPanjivaID': 9949}

In [6]:
# Convert to contiguos ids
prev_count = 0
res = []
for dn, ds in domain_dims.items():
    for eid in set(df[dn]):
        r = [dn, eid, eid + prev_count]
        res.append(r)
    prev_count += ds

serial_mapping_df = pd.DataFrame(
    data=res,
    columns=['Domain', 'Entity_ID', 'Serial_ID']
)

In [7]:
serial_mapping_df.loc[(serial_mapping_df['Domain'] == 'HSCode')]

Unnamed: 0,Domain,Entity_ID,Serial_ID
8665,HSCode,9415,18080
8666,HSCode,9416,18081
8667,HSCode,9417,18082
8668,HSCode,9418,18083
8669,HSCode,9419,18084
8670,HSCode,9420,18085
8671,HSCode,9421,18086
8672,HSCode,9422,18087
8673,HSCode,9423,18088
8674,HSCode,9424,18089


In [8]:
def convert_to_SerialID(_row, cols):
    global serial_mapping_df
    row = _row.copy()
    for c in cols:
        val = row[c]
        res = list(
            serial_mapping_df.loc[
                (serial_mapping_df['Domain'] == c) &
                (serial_mapping_df['Entity_ID'] == val)]
            ['Serial_ID']
        )
#         print('res > ',c, val, res)
        row[c] = res[0]
    return row

In [9]:
cols = list(domain_dims.keys())
print('Columns ',entity_types)

Columns  ['ConsigneePanjivaID', 'HSCode', 'PortOfLading', 'PortOfUnlading', 'ShipmentDestination', 'ShipmentOrigin', 'ShipperPanjivaID']


In [10]:
df = df.parallel_apply(
    convert_to_SerialID,
    axis=1,
    args= (cols,)
)

In [11]:
# create graph

entity_dict = {}

for e in enity_types:
    print(e , 'Count : ', len(set(df[e])))
    items = list(sorted(set(df[e])))
    n_df = pd.DataFrame({e: items})
    fname = 'nodes_{}.csv'.format(e)
    fpath = os.path.join(OP_DIR, fname)
    print(' Writing to file {}'.format(fpath))
    n_df.to_csv(fpath,index=False)
# ------------
# Edges
# ------------

edge_types = [
    ['ShipperPanjivaID','HSCode'],
    ['ConsigneePanjivaID','HSCode'],
    ['ShipperPanjivaID','ConsigneePanjivaID'],
    ['PortOfLading','ShipperPanjivaID'],
    ['PortOfLading','PortOfUnlading'],
    ['PortOfUnlading','ConsigneePanjivaID'],
    ['ShipmentDestination','PortOfUnlading'],
    ['PortOfLading','ShipmentOrigin'],
    ['ShipmentDestination', 'ConsigneePanjivaID'],
    ['ShipperPanjivaID','ShipmentOrigin'],
    ['PortOfLading','ShipmentOrigin']
]
num_edges = 0

for edge_type in edge_types:
    e_df = df[edge_type]
    e_df = e_df.groupby(edge_type).size().reset_index(name="count")
    num_edges += len(e_df)
    print('E type ', edge_type, ' count : ', len(e_df))
    name = '_'.join(sorted(edge_type))
    cols= list(e_df.columns)
    col1 = cols[0]
    col2 = cols[1]
    e_df = e_df.rename(columns= { col1:'source', col2:'target' })
    # Save file 
    fname = name + '_edges.csv'
    fpath = os.path.join(OP_DIR, fname)
    print('.Writing to {}'.format(fpath))
    e_df.to_csv(fpath,index=False)

# =======================
# Create data for Hin2vec 
# =======================

all_edges_df = None
e_type_idx = 0
for edge_type in edge_types:
    e_df = df[edge_type]
    e_df = e_df.groupby(edge_type).size().reset_index(name="count")
    num_edges += len(e_df)
    print('E type ', edge_type, ' count : ', len(e_df))
    name = '_'.join(sorted(edge_type))
    cols= list(e_df.columns)
    col1 = cols[0]
    col2 = cols[1]
    e_df = e_df.rename(columns= { col1:'source', col2:'target' })
    try:
        del e_df['count']
    except:
        pass
    e_df['type'] = e_type_idx
    e_type_idx += 1
    
    if all_edges_df is None:
        all_edges_df = e_df
    else:
        all_edges_df = all_edges_df.append(e_df,ignore_index=True)
    
fname = name + 'hin2vec_input.txt'
fpath = os.path.join(OP_DIR, fname)
print('.Writing to {}'.format(fpath))
all_edges_df.to_csv(fpath, index = False, header = False, sep = ',')

NameError: name 'enity_types' is not defined

In [None]:

import matplotlib.pyplot as plt
df1 = df[['ConsigneePanjivaID','HSCode']].drop_duplicates().groupby(by=['ConsigneePanjivaID']).size().reset_index(name="count")
plt.figure()
plt.title('How many HSCodes does each company (Consignee) trade in ?')
df1['count'].plot.hist(color='m')
try:
    plt.show()
except:
    pass

import matplotlib.pyplot as plt
df1 = df[['ShipperPanjivaID','HSCode']].drop_duplicates().groupby(by=['ShipperPanjivaID']).size().reset_index(name="count")
plt.figure()
plt.title('How many HSCodes does each company (Shipper) trade in ?')
df1['count'].plot.hist(color='m')
try:
    plt.show()
except:
    pass

