# <center><span style="color:orange">Correlation Matrix (similarity calculation)</span></center>

In [1]:
## Required libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD

In [2]:
## A function for keeping track of memory usage
def ram_usg(x):
    import inspect
    import sys
    byte_size = sys.getsizeof(x)
    f = f"Occupied memory by this {type(x)} is: "
    s = str(round(byte_size * 9.537 * 10**-7, 2))
    print(f + s + " MB")

- Generate **UTILITY MATRIX** with sparse data types, using transaction table from the last five years containing only active(sold in the last two years) items.

In [3]:
mlb = MultiLabelBinarizer(sparse_output=True)

df = pd.read_csv("majority_active_five_years_table.csv", usecols=['transaction_id', 'item'])
df = df.groupby(['transaction_id']).item.agg(list).reset_index()['item']
df = pd.DataFrame(df)
df = pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(df.pop('item')).transpose(), index=mlb.classes_, columns=df.index)

ram_usg(df)

Occupied memory by this <class 'pandas.core.frame.DataFrame'> is: 2.61 MB


In [4]:
## Save DF index for identify material later in function form

id_arr = np.array(list(df.index))
np.save("correlation_matrix_item_ids.npy", id_arr)

In [5]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999
006I1F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
007S5U,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00BMEG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00M281,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00TRZG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZZ6BLE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZZ7ULB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZZLRX3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZZNQV0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- *Memory Usage of **Sparse** Data Frame is **~2 MB***
- ***Non-sparse** DF occupies **+10 GB** of memory*

---

In [6]:
SVD = TruncatedSVD(n_components=10)
matrix = SVD.fit_transform(df)
ram_usg(matrix)

Occupied memory by this <class 'numpy.ndarray'> is: 0.76 MB


In [7]:
matrix = np.corrcoef(matrix)
ram_usg(matrix)
pd.DataFrame(matrix)

Occupied memory by this <class 'numpy.ndarray'> is: 762.96 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.000000,0.522166,0.505390,0.900150,0.538712,0.534202,0.471767,0.536509,0.596389,0.293204,...,0.499071,0.674443,0.300214,0.478020,0.250210,0.532679,0.834334,0.602004,0.359231,0.714833
1,0.522166,1.000000,0.146873,0.581235,0.659689,0.718811,0.906402,0.803578,0.390424,-0.025997,...,0.247670,0.659747,0.303132,0.716696,0.459488,0.807000,0.596465,0.504804,0.339912,0.340719
2,0.505390,0.146873,1.000000,0.543311,0.385162,0.442750,0.378903,0.366182,0.341810,0.722245,...,0.867290,0.482085,0.210233,0.287112,0.624516,0.349923,0.514323,0.465690,0.543065,0.839845
3,0.900150,0.581235,0.543311,1.000000,0.758260,0.516446,0.591972,0.575546,0.574179,0.469530,...,0.576610,0.864291,0.563022,0.634488,0.413676,0.665690,0.879449,0.531780,0.584188,0.781726
4,0.538712,0.659689,0.385162,0.758260,1.000000,0.703077,0.685651,0.449824,0.700132,0.646286,...,0.562074,0.754486,0.833190,0.836447,0.614067,0.818802,0.809679,0.565408,0.613092,0.605358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.532679,0.807000,0.349923,0.665690,0.818802,0.624817,0.805812,0.696454,0.678493,0.367769,...,0.402678,0.761129,0.545346,0.625425,0.614782,1.000000,0.761711,0.728043,0.624909,0.464549
9996,0.834334,0.596465,0.514323,0.879449,0.809679,0.633246,0.531827,0.467528,0.759332,0.541009,...,0.508959,0.771247,0.482546,0.531986,0.438822,0.761711,1.000000,0.658855,0.380958,0.639236
9997,0.602004,0.504804,0.465690,0.531780,0.565408,0.737327,0.633576,0.672554,0.547728,0.350809,...,0.369080,0.315224,0.355932,0.496655,0.529262,0.728043,0.658855,1.000000,0.564992,0.396416
9998,0.359231,0.339912,0.543065,0.584188,0.613092,0.357427,0.628049,0.574756,0.312716,0.538473,...,0.569881,0.560947,0.682092,0.621537,0.688239,0.624909,0.380958,0.564992,1.000000,0.626019


In [8]:
## Save for later use

np.save("correlation_matrix.npy", matrix)
pd.DataFrame(np.load("correlation_matrix.npy"))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.000000,0.522166,0.505390,0.900150,0.538712,0.534202,0.471767,0.536509,0.596389,0.293204,...,0.499071,0.674443,0.300214,0.478020,0.250210,0.532679,0.834334,0.602004,0.359231,0.714833
1,0.522166,1.000000,0.146873,0.581235,0.659689,0.718811,0.906402,0.803578,0.390424,-0.025997,...,0.247670,0.659747,0.303132,0.716696,0.459488,0.807000,0.596465,0.504804,0.339912,0.340719
2,0.505390,0.146873,1.000000,0.543311,0.385162,0.442750,0.378903,0.366182,0.341810,0.722245,...,0.867290,0.482085,0.210233,0.287112,0.624516,0.349923,0.514323,0.465690,0.543065,0.839845
3,0.900150,0.581235,0.543311,1.000000,0.758260,0.516446,0.591972,0.575546,0.574179,0.469530,...,0.576610,0.864291,0.563022,0.634488,0.413676,0.665690,0.879449,0.531780,0.584188,0.781726
4,0.538712,0.659689,0.385162,0.758260,1.000000,0.703077,0.685651,0.449824,0.700132,0.646286,...,0.562074,0.754486,0.833190,0.836447,0.614067,0.818802,0.809679,0.565408,0.613092,0.605358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.532679,0.807000,0.349923,0.665690,0.818802,0.624817,0.805812,0.696454,0.678493,0.367769,...,0.402678,0.761129,0.545346,0.625425,0.614782,1.000000,0.761711,0.728043,0.624909,0.464549
9996,0.834334,0.596465,0.514323,0.879449,0.809679,0.633246,0.531827,0.467528,0.759332,0.541009,...,0.508959,0.771247,0.482546,0.531986,0.438822,0.761711,1.000000,0.658855,0.380958,0.639236
9997,0.602004,0.504804,0.465690,0.531780,0.565408,0.737327,0.633576,0.672554,0.547728,0.350809,...,0.369080,0.315224,0.355932,0.496655,0.529262,0.728043,0.658855,1.000000,0.564992,0.396416
9998,0.359231,0.339912,0.543065,0.584188,0.613092,0.357427,0.628049,0.574756,0.312716,0.538473,...,0.569881,0.560947,0.682092,0.621537,0.688239,0.624909,0.380958,0.564992,1.000000,0.626019


In [11]:
## Finally, the function and its results

def get_sim(item_no):
    material_list = list(df.index)
    material_ID = material_list.index(item_no)
    correlation_material_IDs = matrix[material_ID]
    recommended_IDs = np.argsort(correlation_material_IDs)[0:10]
    recommended_materials = []
    for x in recommended_IDs:
        recommended_materials.append(df.index[x])
    return recommended_materials

In [12]:
get_sim("ZZ6BLE")

['XC278Y',
 'A7HJTF',
 'MVNUBP',
 'UOMOWO',
 '6JQ2K5',
 'V98QDU',
 'A7GB8W',
 'CLQQZB',
 'DRJY42',
 'K9KX80']