In [1]:
import numpy
import pandas
from scipy import sparse
import itertools

In [2]:
def cal_sparsity(array):
    """Print and return the fraction of entries in ``array`` that are zero.

    The total entry count is obtained from ``total_elems`` and the number of
    zero entries from ``zero_elems``; their ratio is written to stdout and
    handed back to the caller.
    """
    total = total_elems(array)
    zeros = zero_elems(array, total)
    ratio = zeros / total
    print("Sparsity of matrix is = {}".format(ratio))
    return ratio


def zero_elems(array, num_total):
    """Return how many of the ``num_total`` entries of ``array`` are zero.

    The count is derived by subtracting the number of non-zero entries
    (``numpy.count_nonzero``) from the supplied total.
    """
    return num_total - numpy.count_nonzero(array)


def total_elems(array):
    """Return the total number of entries of ``array``.

    The count is the product of every dimension in ``array.shape``, so the
    function works for any number of axes (the previous ``shape[0]*shape[1]``
    raised ``IndexError`` for 1-D input and ignored trailing axes for 3-D+
    input). For 2-D input the result is unchanged.
    """
    count = 1
    for dim in array.shape:
        count *= dim
    return count

In [3]:
import json
import pickle

In [4]:
# Load the package-name -> id mapping (the ids are used as column indices of
# the rating matrix). JSON text is UTF-8 by specification, so the encoding is
# pinned instead of relying on the platform's default locale encoding.
with open('package-to-id-dict-without-trans.json', 'r', encoding='utf-8') as f:
    pack_to_id = json.load(f)

In [5]:
# Load the manifest -> id mapping; the ids are used as row indices of the
# rating matrix and the keys are iterated as collections of package names.
# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only load this file if it comes from a trusted source.
with open('manifest-to-id-without-trans.pickle', 'rb') as f:
    man_to_id = pickle.load(f)

In [6]:
# Sanity check: look up the id of the manifest that consists only of 'django'.
man_to_id.get(frozenset(['django']))

16101

In [7]:
# Number of manifests; each manifest plays the role of a "user" (matrix row).
users = len(man_to_id)

In [8]:
# Number of packages; each package plays the role of an "item" (matrix column).
# Assumes the ids in pack_to_id run from 0 to len(pack_to_id) - 1 -- TODO confirm.
items = len(pack_to_id)

In [9]:
# Display the manifest ("user") count.
users

66018

In [10]:
# Display the package ("item") count.
items

18796

In [11]:
# Dense manifest-by-package matrix, initialised to all zeros.
# NOTE: with the counts shown above (66018 x 18796) a float64 array needs
# roughly 10 GB of RAM; consider building a scipy.sparse matrix directly.
rating_matrix = numpy.zeros(shape=(users, items), dtype=numpy.float64)

In [12]:
# Mark every (manifest, package) pair that occurs in the data with a 1.
for item_list, user in man_to_id.items():
    for item in item_list:
        item_id = pack_to_id.get(item)
        # A package missing from pack_to_id yields None, and indexing a row
        # with None means numpy.newaxis: `row[None] = 1` would silently set
        # the ENTIRE manifest row to 1. Skip unknown packages instead, the
        # same way map_input_to_package_ids does.
        if item_id is None:
            continue
        rating_matrix[user, item_id] = 1

In [12]:
# Report the fraction of zero entries in the dense rating matrix.
cal_sparsity(rating_matrix)

Sparsity of matrix is = 0.9996093746247605


0.9996093746247605

In [13]:
# Peek at the first ten rows (manifests) of the dense matrix.
rating_matrix[:10]

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [14]:
# Convert the dense manifest-by-package matrix into a SciPy CSR sparse matrix.
sparse_rating_matrix = sparse.csr_matrix(rating_matrix)

In [14]:
# Transpose so that rows are packages and columns are manifests.
# The transposed matrix is derived from the dense source rather than from
# `sparse_rating_matrix` itself: the previous self-referential assignment
# flipped the orientation back on every re-run of this cell, whereas this
# form always yields the same result.
sparse_rating_matrix = sparse.csr_matrix(rating_matrix).transpose()

In [15]:
# Peek at the first ten rows of the sparse matrix.
# NOTE(review): the saved output reports a 10x18796 slice, i.e. 18796 columns
# (the package count), which matches the matrix BEFORE the transpose in the
# preceding cell -- the output looks stale; re-run on a fresh kernel to confirm.
sparse_rating_matrix[:10]

<10x18796 sparse matrix of type '<class 'numpy.float64'>'
	with 36 stored elements in Compressed Sparse Row format>

In [16]:
import bayessets

In [17]:
# Build the Bernoulli Bayesian Sets model from the sparse rating matrix.
# The model is later queried with package ids, so presumably rows must be
# packages (i.e. the transposed matrix) -- TODO confirm against bayessets.
model = bayessets.BernoulliBayesianSet(sparse_rating_matrix)

In [17]:
# Persist the model to disk so it can be reloaded without rebuilding it.
with open('Bayesian_Sets.pkl', 'wb') as f:
    pickle.dump(model, f)

In [18]:
# Reload the pickled model, replacing the in-memory `model` object.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file
# that this notebook (or another trusted source) produced.
with open('Bayesian_Sets.pkl', 'rb') as f:
    model = pickle.load(f)

In [19]:
import random
from math import ceil

In [20]:
# Example query: the package names of the stack to get recommendations for.
input_stack = [
    'flask',
    'codecov',
    'pytest',
    'flake8',
    'tensorflow',
    'theano',
    'scipy',
    'numpy',
]

In [24]:
# Replace the package names with their integer ids (unknown names are dropped).
# NOTE(review): this overwrites `input_stack`, so re-running the cell feeds the
# ids back in instead of the names (presumably producing an empty list).
# `map_input_to_package_ids` is also defined in a later cell, so a
# top-to-bottom run on a fresh kernel fails here with NameError.
input_stack = map_input_to_package_ids(input_stack)

In [25]:
# Display the package ids of the query stack.
input_stack

[0, 182, 11, 7, 788, 526, 30, 384]

In [26]:
# Load the id -> package-name mapping; its keys are looked up as strings
# (see get_packages_from_id). JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's default.
with open('id-to-package-dict-without-trans.json', 'r', encoding='utf-8') as f:
    id_to_pack = json.load(f)

In [23]:
def get_packages_from_id(package_ids):
    """Translate package ids into package names via ``id_to_pack``.

    Every id is converted with ``str`` before the lookup. An id that is not
    present in ``id_to_pack`` yields ``None`` at its position, so the returned
    list always has exactly one entry per input id, in input order.
    """
    return [id_to_pack.get(str(package_id)) for package_id in package_ids]

In [22]:
def map_input_to_package_ids(input_stack):
    """Translate package names into their ids via ``pack_to_id``.

    Names that are not present in ``pack_to_id`` are dropped, so the result
    may be shorter than ``input_stack``; the order of the remaining ids
    follows the input order.
    """
    looked_up = (pack_to_id.get(name) for name in input_stack)
    return [package_id for package_id in looked_up if package_id is not None]

In [27]:
# Round-trip check: map the query ids back to package names.
get_packages_from_id(input_stack)

['flask',
 'codecov',
 'pytest',
 'flake8',
 'tensorflow',
 'theano',
 'scipy',
 'numpy']

In [34]:
# Query the model with the input package ids; the result is treated as one
# score per index, and the indices are later mapped back to package names.
scores = model.query(list(input_stack))
# Indices ordered from highest to lowest score.
ranking = numpy.argsort(scores)[::-1]
top30 = ranking[:30]
# Drop candidates that are already part of the query, so fewer than 30
# recommendations may remain.
recommendations = numpy.array(
    [candidate for candidate in top30 if candidate not in input_stack]
)

In [43]:
# Display the recommended ids, best score first.
recommendations

array([ 103,    8,   13,  145, 1110,  307,  282,  244,   85,  878, 1259,
         64,   21,  275,  456,  100,  350,   59,  220,  453,  224,  779])

In [44]:
# Translate the recommended ids back to package names.
get_packages_from_id(recommendations)

['matplotlib',
 'gunicorn',
 'pandas',
 'cython',
 'keras',
 'pyparsing',
 'pep8',
 'nose',
 'pytz',
 'cycler',
 'sklearn',
 'pymongo',
 'coverage',
 'flask-restful',
 'nltk',
 'h5py',
 'pytest-cov',
 'redis',
 'sqlalchemy',
 'scikit-learn',
 'tqdm',
 'sympy']