In [1]:
import sys

import pandas as pd

# Record the interpreter and pandas versions this notebook was run with.
print("sys version:", sys.version)
print("pandas version:", pd.__version__)

sys version: 3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]
pandas version: 0.23.4


# import

### products

In [2]:
# Read both extracts decoded as ISO-8859-1, with every column kept as text.
prod = pd.read_csv(
    "data/HS6Description2016.csv",
    dtype=str,
    encoding="ISO-8859-1",
)
importers = pd.read_csv(
    "data/MajorImportersbyHS62016.csv",
    dtype=str,
    encoding="ISO-8859-1",
)

In [3]:
# Drop the French description, give the two remaining columns short names
# (assigned by position) and index the table by HS6 code.
# Each step rebinds `prod` instead of mutating it with inplace=True.
prod = prod.drop(columns="DESCRIPTION_FRA")
prod.columns = ["hs6", "product"]
prod = prod.set_index("hs6")

In [4]:
# Preview the last rows of the cleaned product table.
prod.tail()

Unnamed: 0_level_0,product
hs6,Unnamed: 1_level_1
970200,"Original Engravings, Prints And Lithographs"
970300,Original Sculptures And Statuary In Any Material
970400,Used Postage Or Revenue Stamps And The Like Or...
970500,"Collections, Collector`S Pieces Of Various Sci..."
970600,Antiques Of An Age Exceeding One Hundred Years


In [5]:
# Look up the description stored for HS6 code 970500 (printed as a Series).
print(prod.loc[prod.index == "970500", "product"])

hs6
970500    Collections, Collector`S Pieces Of Various Sci...
Name: product, dtype: object


In [6]:
# (rows, columns) of the product table after cleaning.
prod.shape

(5121, 1)

In [7]:
# Persist the cleaned product table (indexed by hs6) as a pickle.
prod.to_pickle("data/products.pkl")

### importers_products

In [8]:
# Remove the French province name and the data-year column in a single call,
# then give the five remaining columns short names (assigned by position).
importers = importers.drop(
    columns=["PROVINCE_FRA", "DATA_YEAR-ANNÉE_DES_DONNÉES"]
)
importers.columns = ["hs6", "importer", "city", "province", "postalcode"]

# Upper-case importer names, then index by (importer, hs6).
# NOTE(review): the preview below shows 5-character hs6 values (e.g. 10121);
# confirm whether they need zero-padding to line up with the product table.
importers = (
    importers
    .assign(importer=importers["importer"].str.upper())
    .set_index(["importer", "hs6"])
)

In [9]:
# Preview the first rows of the (importer, hs6)-indexed table.
importers.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,city,province,postalcode
importer,hs6,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9072-9435 QUEBEC INC.,10121,Saint-Marc-sur-Richelieu,Quebec,J0L 2E0
CAMELOT STABLES,10121,Delta,British Columbia,V4K 1S9
COUNTRY LANE FARM,10121,Delta,British Columbia,V4K 3N2
FOXTRAIL FARMS,10121,Okotoks,Alberta,T1S 1A1
HOERDT'S HOT SHOT SERVICE LTD.,10121,Beaumont,Alberta,T4X 1H9


In [11]:
# (rows, columns) of the importer table; the two index levels are not counted.
importers.shape

(108900, 3)

In [12]:
# Persist the cleaned importer/product table as a pickle.
importers.to_pickle("data/importers_products.pkl")

# Create similarity matrix

In [None]:
# Long-format table of (importer, hs6) pairs with a constant marker column
# that the pivot step aggregates.  The marker is added with .assign() so the
# column is created on a fresh frame rather than by item assignment on a
# column subset, which can raise SettingWithCopyWarning.
sim_matrix = (
    importers
    .sort_index()
    .reset_index()
    .loc[:, ["importer", "hs6"]]
    .assign(_agg=1)
)

In [None]:
# Number of distinct importer names in the long-format table.
print("unique importers:", sim_matrix["importer"].nunique())

In [None]:
%%time
sim_matrix = sim_matrix.pivot_table(index="importer", columns="hs6", values="_agg", aggfunc=min)

In [None]:
# Preview the first five importers against the last five HS6 columns.
sim_matrix.iloc[0:5, -5:]

In [None]:
# How many importers have a 1 in the 970600 column?
int(sim_matrix["970600"].eq(1).sum())

In [None]:
from scipy.spatial.distance import pdist,squareform

In [None]:
def build_matrix(data, metric):
    """Return the square pairwise-distance matrix between the rows of `data`.

    Parameters
    ----------
    data : array-like or DataFrame
        One observation per row; passed unchanged to ``pdist``.
    metric : str or callable
        Distance metric forwarded to ``pdist`` (e.g. ``'jaccard'``).

    Returns
    -------
    pandas.DataFrame
        Symmetric n-by-n matrix with the default integer row and column
        labels; any labels carried by `data` are not propagated.
    """
    condensed = pdist(data, metric=metric)
    square = squareform(condensed)
    return pd.DataFrame(square)

In [None]:
%%time
# Missing importer/product pairs become 0 so every row is a 0/1 vector; the
# Jaccard distance is then computed between every pair of importers (rows).
dist_matrix = build_matrix(data=sim_matrix.fillna(0), metric='jaccard')

About the Jaccard distance: https://www.statisticshowto.datasciencecentral.com/jaccard-index/

In [None]:
# Importer names in pivot row order; row/column i of dist_matrix is idx[i].
idx = sim_matrix.index.values

In [None]:
# Persist the importer names separately; dist_matrix itself only carries
# integer positions.
pd.Series(idx).to_pickle("data/importer_ids.pkl")

In [None]:
# dist_matrix.set_index(idx, inplace=True)
# dist_matrix.columns = idx

In [None]:
# .loc slices by label and includes both endpoints, so with the default
# integer labels this shows rows and columns 0 through 5.
dist_matrix.loc[:5, :5]

In [None]:
# comp = ["#1 ANTIQUE WAREHOUSE FURNISHINGS LTD.", "A & A CONTRACT CUSTOMS BROKERS LTD.", 
#         "ACU PLASMOLD INC.", "CARGILL LIMITED"]

# dist_matrix.loc[comp, comp]

Store the matrix on disk as-is, in CSV format:

In [None]:
# Write the dense matrix as CSV.
# NOTE(review): despite the file name, the relabelling cell above is commented
# out, so the rows/columns written here carry integer labels; the importer
# names are in data/importer_ids.pkl.
dist_matrix.to_csv("data/distance_matrix_with_imp_ids.csv")

Disk space: 3.7 GB.

Now, save a sparse matrix in pickle format:

In [None]:
%%time
# Entries equal to 1.0 (the Jaccard distance of two importers with no product
# in common) are used as the implicit fill value and are not stored.
# NOTE(review): DataFrame.to_sparse was removed in pandas 1.0; this cell relies
# on the pandas 0.23.4 recorded at the top of the notebook.
dist_matrix.to_sparse(fill_value=1.0).to_pickle("data/dist_matrix_with_imp_ids_sparse.pkl")

138 MB on disk, a ~96% reduction in disk space compared with storing the dense matrix as CSV.

In [None]:
%%time
del dist_matrix
del sim_matrix