In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [3]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from numpy import zeros, log, array
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import pairwise_distances
import scipy.spatial.distance as dist
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
path = os.path.expanduser("~/Google Drive/CSVs/customer_order.txt")

In [5]:
def read_data(filename):
    """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of artist/user/playcount """
    # read in triples of user/artist/playcount from the input dataset


    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(float),
                       (data['artist'].cat.codes.copy(),
                        data['user'].cat.codes.copy())))

    return data, plays

In [16]:
data = pd.read_table(path,
                     delimiter=";",
                     usecols=[0,1,2,3],
                     header=1,
                     names=["idClient", "idItemOrigin", "OrderNo", "avg_sales"],
                     dtype={"idClient": np.int16, "idItemOrigin": np.str0, "OrderNo": np.str0, "avg_sales": np.float64})

In [18]:
data['OrderNo'] = data['OrderNo'].astype("category")
data['idItemOrigin'] = data['idItemOrigin'].astype("category")

In [17]:
data.head()

Unnamed: 0,idClient,idItemOrigin,OrderNo,avg_sales
0,4,10000500,1157751,10.2
1,4,10000500,1186706,10.2
2,4,10000500,1250937,20.4
3,4,10000500,1252112,20.4
4,4,10000500,1294823,10.2


In [20]:
sales = coo_matrix((data['avg_sales'].astype(float),
                    (data['idItemOrigin'].cat.codes.copy(),
                     data['OrderNo'].cat.codes.copy())))

In [21]:
sales.shape

(267201, 9269715)

In [22]:
def cosine1(matrix):
    normalized = normalize(matrix)
    return normalized.dot(normalized.T)

In [23]:
def bm25_weight(data, K1=100, B=0.8):
    """ Weighs each row of the matrix data by BM25 weighting """
    # calculate idf per term (user)
    N = float(data.shape[0])
    idf = np.log(N / (1 + np.bincount(data.col)))

    # calculate length_norm per document (artist)
    row_sums = np.squeeze(np.asarray(data.sum(1)))
    average_length = row_sums.sum() / N
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    ret = coo_matrix(data)
    ret.data = ret.data * (K1 + 1.0) / (K1 * length_norm[ret.row] + ret.data) * idf[ret.col]
    return ret

In [24]:
def bm25(matrix):
    plays = bm25_weight(matrix)
    return plays.dot(matrix.T)

In [25]:
#similarity = cosine1(sales)
similarity = bm25(sales)

In [26]:
items = dict(enumerate(data['idItemOrigin'].cat.categories))
order_count = data.groupby('idItemOrigin').size()
to_generate = sorted(list(items), key=lambda x: -order_count[x])

In [27]:
def get_largest(row, N=10):
    if N >= row.nnz:
        best = zip(row.data, row.indices)
    else:
        ind = np.argpartition(row.data, -N)[-N:]
        best = zip(row.data[ind], row.indices[ind])
    return sorted(best, reverse=True)


def calculate_similar_items(similarity, item, idItem):
    neighbours = similarity[idItem]
    top = get_largest(neighbours)
    return [(item[other], score, i) for i, (score, other) in enumerate(top)]

In [28]:
l = []
for item in to_generate:
    name = items[item]
    for other, score, rank in calculate_similar_items(similarity, items, item):
            l.append([name, other, score, rank])

In [29]:
similarity_DF = pd.DataFrame(l, columns=['name', 'other', 'score', 'rank'])

In [30]:
similarity_DF = similarity_DF[similarity_DF['rank'] != 0]

In [31]:
#similarity_DF.to_html("test.html")

In [33]:
similarity_DF.to_json("cosine_distance_order_based.json")
similarity_DF.to_csv("cosine_distance_order_based.csv", sep = ";")

In [34]:
similarity_DF.head()

Unnamed: 0,name,other,score,rank
1,1632908,1534339,35725350.0,1
2,1632908,1672993,35343690.0,2
3,1632908,1730395,27258040.0,3
4,1632908,1631322,21190040.0,4
5,1632908,1632908,20742620.0,5
