In [1]:
# Report the interpreter and pandas versions this notebook was run with.
import sys
print("sys version:", sys.version)
import pandas as pd
# NOTE(review): the next line only reads the option and discards the value
# (it is not the cell's last expression, so nothing is displayed either).
pd.options.display.max_colwidth
print("pandas version:", pd.__version__)

sys version: 3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]
pandas version: 0.23.4


# Import the data

In [2]:
# Load both CSV files, reading every column as a string (dtype=str).
# NOTE(review): the importers preview further down shows hs6 values such as
# "10121" (5 characters) while the product codes shown have 6 characters --
# confirm both files format the HS6 code identically before matching on it.
prod = pd.read_csv('data/HS6Description2016.csv', encoding = "ISO-8859-1", dtype=str)
importers = pd.read_csv('data/MajorImportersbyHS62016.csv', encoding = "ISO-8859-1", dtype=str)

In [3]:
# Drop the DESCRIPTION_FRA column, rename the two remaining columns and
# index the table by HS6 code.
prod = prod.drop(columns=['DESCRIPTION_FRA'])
prod.columns = ["hs6", "product"]
prod = prod.set_index('hs6')

In [4]:
prod.tail()

Unnamed: 0_level_0,product
hs6,Unnamed: 1_level_1
970200,"Original Engravings, Prints And Lithographs"
970300,Original Sculptures And Statuary In Any Material
970400,Used Postage Or Revenue Stamps And The Like Or...
970500,"Collections, Collector`S Pieces Of Various Sci..."
970600,Antiques Of An Age Exceeding One Hundred Years


In [5]:
# Single .loc lookup (row mask + column label) rather than chained indexing.
print(prod.loc[prod.index == "970500", 'product'])

hs6
970500    Collections, Collector`S Pieces Of Various Sci...
Name: product, dtype: object


In [6]:
prod.shape

(5121, 1)

In [7]:
# Remove the two unused columns in one call, then name the five that remain.
importers = importers.drop(columns=["PROVINCE_FRA", "DATA_YEAR-ANNÉE_DES_DONNÉES"])
importers.columns = ["hs6", "importer", "city", "province", "postalcode"]
# Upper-case the importer names before they become part of the index.
importers["importer"] = importers["importer"].str.upper()
importers = importers.set_index(["importer", "hs6"])

In [8]:
importers.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,city,province,postalcode
importer,hs6,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9072-9435 QUEBEC INC.,10121,Saint-Marc-sur-Richelieu,Quebec,J0L 2E0
CAMELOT STABLES,10121,Delta,British Columbia,V4K 1S9
COUNTRY LANE FARM,10121,Delta,British Columbia,V4K 3N2
FOXTRAIL FARMS,10121,Okotoks,Alberta,T1S 1A1
HOERDT'S HOT SHOT SERVICE LTD.,10121,Beaumont,Alberta,T4X 1H9


In [9]:
importers.shape

(108900, 3)

# Create similarity matrix

In [10]:
# Long-format (importer, hs6) table, sorted by the index, with a constant
# marker column that the pivot below aggregates on.
sim_matrix = (
    importers
    .sort_index()
    .reset_index()
    .loc[:, ["importer", "hs6"]]
    .assign(_agg=1)
)

In [11]:
print("unique importers:", sim_matrix.importer.nunique())

unique importers: 30530


In [12]:
%%time
sim_matrix = sim_matrix.pivot_table(index="importer", columns="hs6", values="_agg", aggfunc=min)

Wall time: 3.84 s


In [13]:
sim_matrix.iloc[0:5, -5:]

hs6,970200,970300,970400,970500,970600
importer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
#1 ANTIQUE WAREHOUSE FURNISHINGS LTD.,,,,,1.0
#1 APPAREL,,,,,
0 WASTE 2 ENERGY CANADA LTD.,,,,,
0021567 B.C. LTD.,,,,,
0187993 MANITOBA LIMITED,,,,,


In [14]:
# How many importers have HS6 code 970600 flagged?
int((sim_matrix["970600"] == 1).sum())

80

In [15]:
from scipy.spatial.distance import pdist,squareform

In [16]:
def build_matrix(data, metric, labels=None):
    """Compute the dense pairwise distance matrix between the rows of ``data``.

    Parameters
    ----------
    data : array-like
        One observation per row; handed to ``scipy.spatial.distance.pdist``.
    metric : str or callable
        Distance metric understood by ``pdist`` (for example ``'jaccard'``).
    labels : sequence, optional
        Labels applied to both the rows and the columns of the result, e.g.
        ``data.index``. When omitted the result keeps the default 0..n-1
        integer labels, which is the behaviour callers had before this
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        Square n x n frame where entry (i, j) is the distance between row i
        and row j of ``data``.
    """
    # pdist returns the condensed (upper-triangle) form; squareform expands
    # it into the full square matrix.
    condensed = pdist(data, metric=metric)
    square = squareform(condensed)
    return pd.DataFrame(square, index=labels, columns=labels)

In [None]:
%%time
# Jaccard distance between every pair of importer rows. NaN cells are
# replaced by 0 first, so each row is passed to pdist as a 0/1 vector.
# NOTE(review): the result is a dense importers x importers frame -- with the
# 30530 importers counted above that is roughly 7.5 GB if held as float64;
# confirm available memory before running.
dist_matrix = build_matrix(data=sim_matrix.fillna(0), metric='jaccard')

About jaccard distance: https://www.statisticshowto.datasciencecentral.com/jaccard-index/

In [30]:
idx = sim_matrix.index.values

In [78]:
pd.Series(idx).to_pickle("data/importer_ids.pkl")

In [None]:
# dist_matrix.set_index(idx, inplace=True)
# dist_matrix.columns = idx

In [20]:
dist_matrix.loc[:5, :5]

Unnamed: 0,0,1,2,3,4,5
0,0.0,1.0,1.0,1.0,1.0,1.0
1,1.0,0.0,1.0,1.0,0.952381,1.0
2,1.0,1.0,0.0,1.0,1.0,1.0
3,1.0,1.0,1.0,0.0,1.0,1.0
4,1.0,0.952381,1.0,1.0,0.0,1.0
5,1.0,1.0,1.0,1.0,1.0,0.0


In [None]:
# comp = ["#1 ANTIQUE WAREHOUSE FURNISHINGS LTD.", "A & A CONTRACT CUSTOMS BROKERS LTD.", 
#         "ACU PLASMOLD INC.", "CARGILL LIMITED"]

# dist_matrix.loc[comp, comp]

Store the matrix on disk as-is, in the CSV format

In [21]:
dist_matrix.to_csv("data/distance_matrix_with_imp_ids.csv")

Disk space: 3.7 GB.

Now, save a sparse matrix in pickle format:

In [73]:
%%time
# Convert to a sparse frame whose fill value is 1.0, then pickle it.
# NOTE(review): DataFrame.to_sparse is part of the pandas 0.23 API printed at
# the top of this notebook; it was deprecated in later pandas releases --
# confirm the installed version before re-running this cell.
dist_matrix.to_sparse(fill_value=1.0).to_pickle("data/dist_matrix_with_imp_ids_sparse.pkl")

Wall time: 2min 18s


138 MB on disk, which is a ~96% reduction in disk space compared with storing the non-sparse data as CSV (3.7 GB).

In [79]:
%%time
# Remove the names bound to the two large frames.
# NOTE(review): the cells in the "tests" section below read dist_matrix and
# sim_matrix, so they raise NameError when run after this cell.
del dist_matrix
del sim_matrix

Wall time: 0 ns


--------------------

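## tests -- can ignore

Note: these cells read `dist_matrix` and `sim_matrix`, so they must be run before the cleanup cell above deletes both variables.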

In [22]:
def similar_importers(importer, n=None):
    """Rank the other importers by their distance to ``importer``.

    Reads the module-level ``dist_matrix``. ``importer`` must be one of its
    column labels. Rows whose label equals ``importer`` are excluded, and
    only distances strictly below 1 are kept.

    Returns a Series (index: row labels of ``dist_matrix``, values:
    distances) sorted in ascending order. When ``n`` is given, the result
    is cut to the first ``min(n, number of matches)`` entries.
    """
    not_self = dist_matrix.index != importer
    column = dist_matrix.loc[not_self, importer]
    ranked = column[column < 1].sort_values()
    if n is None:
        return ranked
    limit = min(n, ranked.size)
    return ranked.iloc[:limit]

In [46]:
def imp_to_id(importer):
    """Return the integer row position of ``importer`` in ``sim_matrix``.

    Reads the module-level ``sim_matrix`` and compares ``importer`` against
    its index values. The position of the first match is returned as a
    plain ``int``.

    Raises
    ------
    KeyError
        If ``importer`` does not appear in the index of ``sim_matrix``.
    """
    # Search the index directly instead of building a helper Series on every
    # call and relying on [0] falling back to positional access.
    positions = (sim_matrix.index.values == importer).nonzero()[0]
    if positions.size == 0:
        raise KeyError(importer)
    return int(positions[0])

def id_to_imp(imp_id):
    """Return the importer name stored at position ``imp_id``.

    Reads the module-level ``sim_matrix``: its index values are wrapped in a
    Series with default integer labels, and ``imp_id`` is looked up in it.
    """
    return pd.Series(sim_matrix.index.values)[imp_id]

In [58]:
imp_to_id("CARGILL LIMITED")

5400

In [47]:
id_to_imp(5400)

'CARGILL LIMITED'

In [48]:
similar_importers(imp_to_id("CARGILL LIMITED"), 5)

3323     0.830986
4568     0.859375
18865    0.863014
14423    0.879310
16118    0.890244
Name: 5400, dtype: float64

In [54]:
def similar_importers_names(importer_name, n=None):
    """Return the names of the importers closest to ``importer_name``.

    Converts the name to its id with ``imp_to_id``, ranks the neighbours with
    ``similar_importers`` (``n`` is passed through unchanged) and maps each
    resulting id back to a name with ``id_to_imp``. The list keeps the order
    produced by ``similar_importers``.
    """
    neighbour_ids = similar_importers(imp_to_id(importer_name), n).index
    return list(map(id_to_imp, neighbour_ids))

In [56]:
similar_importers_names("CARGILL LIMITED", 5)

['BARRY CALLEBAUT CANADA INC.',
 'BUNGE CANADA',
 'MTC ANIMAL HEALTH DIVISION',
 'JBS FOOD CANADA ULC',
 'LES ALIMENTS MULTIBAR INC.']

In [53]:
sim_matrix.index[sim_matrix.index.str.contains("STARBUC")]

Index(['STARBUCKS COFFEE CANADA, INC.', 'STARBUCKS CORPORATION'], dtype='object', name='importer')

In [57]:
similar_importers_names("STARBUCKS COFFEE CANADA, INC.", 10)

['KEURIG CANADA INC.',
 'MAIDSTONE COFFEE CANADA',
 'DAVIDSTEA INC.',
 'GFS-MILTON',
 'WILTON INDUSTRIES CANADA COMPANY',
 'BED BATH & BEYOND CANADA L.P.',
 'NESTLE CANADA-HEAD OFFICE',
 'BUNZL CANADA INC.',
 'UNILEVER COSMETICS INTERNATIONAL CANADA',
 'KRAFT CANADA DON MILLS']

In [None]:
# dist_matrix is labelled by integer position, so the importer name has to be
# translated to its id before the lookup.
similar_importers(imp_to_id("#1 ANTIQUE WAREHOUSE FURNISHINGS LTD."), 10)

In [None]:
comp = ["#1 ANTIQUE WAREHOUSE FURNISHINGS LTD.", "604612 ONTARIO INC.", 
        "A & A CONTRACT CUSTOMS BROKERS LTD.", "ACU PLASMOLD INC.", "CARGILL LIMITED"]
# dist_matrix is labelled by integer position (the cell that would set the
# importer names as labels is commented out), so select by id and then put
# the names back on the small result for display.
comp_ids = [imp_to_id(name) for name in comp]
comp_dist = dist_matrix.loc[comp_ids, comp_ids]
comp_dist.index = comp
comp_dist.columns = comp
#(1-comp_dist)  # similarity instead of distance
comp_dist

In [None]:
# Flat copy of the importers table with importer and hs6 as ordinary columns.
imp = importers.reset_index()

def get_products(importer):
    """Return the set of hs6 values listed for ``importer`` in ``imp``."""
    matching_rows = imp["importer"] == importer
    return set(imp.loc[matching_rows, "hs6"])

def get_product_names(importer):
    """Return the set of product descriptions for ``importer``.

    Looks up the importer's hs6 codes with ``get_products`` and keeps the
    ``product`` value of every ``prod`` row whose index is one of them.
    NOTE(review): this only matches codes written identically in both tables;
    confirm the hs6 format is the same in ``prod`` and ``importers``.
    """
    codes = get_products(importer)
    in_codes = prod.index.isin(codes)
    return set(prod.loc[in_codes, "product"])

In [None]:
# Print each company in comp followed by its product names in sorted order.
for company in comp:
    print("------>", company)
    sorted_names = pd.Series(list(get_product_names(company))).sort_values()
    for product_name in sorted_names:
        print("   ", product_name)
    print("")

In [None]:
get_product_names("604612 ONTARIO INC.")

In [None]:
get_product_names("2161587 ONTARIO INC.")

In [None]:
get_product_names("2267106 ONTARIO INC.")