In [9]:
import pandas as pd

### Load the dataset and explore it.

For the purposes of this tutorial we will be working with a dump of the NPM registry itself. It was collected using a "feed-watcher" for the NPM registry.

In [10]:
# lines=True makes pandas parse the dump as newline-delimited JSON (one object per line).
registry_dump_path = "node-package-details-clean.json"
node_details = pd.read_json(registry_dump_path, lines=True)

### Preprocessing the data

First we drop all packages that have fewer than 5 dependencies of their own. Otherwise we end up with a very sparse matrix (>99.8% sparsity), which leads to garbage weights.

In [3]:
# Normalise package names to lower case. Only the 'name' column is cast to
# str; the previous version cast the *entire* frame to strings (list columns
# included) on a throw-away copy just to read this one column back.
node_details['name'] = node_details['name'].astype(str).str.lower()

# Manifests: packages declaring at least MIN_DEPENDENCIES dependencies.
# .copy() yields an independent frame, so adding columns to manifest_rows
# later does not raise SettingWithCopyWarning.
MIN_DEPENDENCIES = 5
manifest_rows = node_details[node_details['dependencies'].notnull()]
manifest_rows = manifest_rows[manifest_rows['dependencies'].str.len() >= MIN_DEPENDENCIES].copy()

# Packages that carry at least one keyword.
packages_with_keywords = node_details[node_details['keywords'].notnull()]
packages_with_keywords = packages_with_keywords[packages_with_keywords['keywords'].str.len() > 0]

In [11]:
# Number of manifests that survived the dependency-count filter.
len(manifest_rows.index)

105818

In [12]:
# Preview the first five manifests.
manifest_rows.head(n=5)

Unnamed: 0,allDependencies,dependencies,description,devDependencies,keywords,name,version,id
14,"[async, connect-redis, express, nodeunit, redi...","[async, connect-redis, express, nodeunit, redi...",awesome framework,[],,awesome,0.0.7,14
22,"[ejs, express, oauth, socket.io, underscore]","[ejs, express, oauth, socket.io, underscore]",IRC-like chatroom + Twitter live-stream,[],"[real-time, webapp, ExpressJS, socket.io, jQue...",socket-twitchat,0.7.15,22
56,"[async, coffee-script, coloured-log, growl, op...","[async, coffee-script, coloured-log, growl, op...",Kontinuos Integrated Testing Koffee Applicatio...,[],"[integration, testing, framework, expresso, ja...",kitkat,0.3.0,56
98,"[extx-reference-slot, extx-shotenjin, johnny-m...","[extx-reference-slot, extx-shotenjin, johnny-m...",Framework for web-sites running on the client,[],,symbie,0.0.2,98
109,"[bufferjs, html5, jsdom, sharedjs, underscore]","[bufferjs, html5, jsdom, sharedjs, underscore]",Easy way to write a spider.,[],"[spider, scrap]",scrap,1.0.1,109


### Filtration

Some fancy preprocessing using the Python multiprocessing package. Here we add a new "all_deps_resolved" column that indicates, for a given package (manifest), whether all the packages it depends on have tags against them. Recall from the presentation that the representation of a single dependency requires collecting all its tags and the tags of its own dependencies. We could also accept cases where not all of the dependencies have tags on them, but for simplicity's sake we require all of them here.

In [13]:
# assign() returns a new frame, so this is safe even when manifest_rows is a
# filtered slice of node_details (plain column assignment on such a slice
# triggers SettingWithCopyWarning).
manifest_rows = manifest_rows.assign(id=manifest_rows.index)
try:
    # Not used in the cells visible here. pandas.util.testing was deprecated
    # in pandas 1.0 and removed in 2.0, so do not let it abort the cell.
    from pandas.util.testing import isiterable
except ImportError:
    isiterable = None
import numpy as np
import datetime

i=1
row_count = manifest_rows.shape[0]

# Timestamp marking the start of the (slow) parallel filtering step.
print(datetime.datetime.utcnow())

from multiprocessing import Pool

num_partitions = 8 #number of partitions to split dataframe
num_cores = 8 #number of cores on your machine

def parallelize_dataframe(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    ``df`` is cut into ``num_partitions`` consecutive chunks, ``func`` is
    mapped over the chunks by a pool of ``num_cores`` processes, and the
    per-chunk results are concatenated in chunk order.

    Args:
        df: DataFrame to process.
        func: Callable taking one DataFrame chunk and returning a DataFrame.
            It is sent to worker processes, so it must be picklable.

    Returns:
        The concatenation of the per-chunk results.
    """
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which newer pandas deprecates.
    chunk_positions = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[positions] for positions in chunk_positions]
    # The context manager releases the workers even when func raises; the
    # previous close()/join() pair was skipped in that case and leaked the pool.
    with Pool(num_cores) as pool:
        results = pool.map(func, df_split)
    return pd.concat(results)

def check_deps_resolved(data):
    """Add an 'all_deps_resolved' column to ``data`` (in place) and return it.

    The column holds the result of add_dependencies_resolved_column for each
    row's 'dependencies' value.
    """
    resolved_flags = data['dependencies'].apply(add_dependencies_resolved_column)
    data['all_deps_resolved'] = resolved_flags
    return data

def add_dependencies_resolved_column(dependencies):
    """Return 1 if every dependency name is a package with keywords, else 0.

    Names are compared lower-cased against packages_with_keywords['name'].
    An input for which no name matches at all (including an empty input)
    yields 0.
    """
    wanted = {dep.lower() for dep in dependencies}
    name_matches = packages_with_keywords['name'].isin(list(wanted))
    tagged = set(packages_with_keywords.loc[name_matches, 'name'])

    # Resolved only when something matched and nothing is left unmatched.
    return 1 if tagged and wanted <= tagged else 0

# Flag each manifest, in parallel, with whether all of its dependencies are tagged.
manifest_rows = parallelize_dataframe(df=manifest_rows, func=check_deps_resolved)
manifest_rows.head(n=5)

2019-08-07 23:19:30.431626


Unnamed: 0,allDependencies,dependencies,description,devDependencies,keywords,name,version,id,all_deps_resolved
14,"[async, connect-redis, express, nodeunit, redi...","[async, connect-redis, express, nodeunit, redi...",awesome framework,[],,awesome,0.0.7,14,0
22,"[ejs, express, oauth, socket.io, underscore]","[ejs, express, oauth, socket.io, underscore]",IRC-like chatroom + Twitter live-stream,[],"[real-time, webapp, ExpressJS, socket.io, jQue...",socket-twitchat,0.7.15,22,0
56,"[async, coffee-script, coloured-log, growl, op...","[async, coffee-script, coloured-log, growl, op...",Kontinuos Integrated Testing Koffee Applicatio...,[],"[integration, testing, framework, expresso, ja...",kitkat,0.3.0,56,0
98,"[extx-reference-slot, extx-shotenjin, johnny-m...","[extx-reference-slot, extx-shotenjin, johnny-m...",Framework for web-sites running on the client,[],,symbie,0.0.2,98,0
109,"[bufferjs, html5, jsdom, sharedjs, underscore]","[bufferjs, html5, jsdom, sharedjs, underscore]",Easy way to write a spider.,[],"[spider, scrap]",scrap,1.0.1,109,0


In [14]:
# Keep only the manifests flagged as fully resolved.
fully_resolved = manifest_rows['all_deps_resolved'].eq(1)
manifest_rows = manifest_rows.loc[fully_resolved]

In [15]:
# Row count after filtering, followed by a preview of the first five rows.
print(manifest_rows.shape[0])
manifest_rows.head(n=5)

27936


Unnamed: 0,allDependencies,dependencies,description,devDependencies,keywords,name,version,id,all_deps_resolved
111,"[async, chai, config, debug, piper, supercomfy...","[async, config, debug, piper, supercomfy, unde...",Simple mapping library for CouchDB,[chai],,flatpack,0.1.2,111,1
149,"[backbone, markdown-js, mustache, promised-htt...","[backbone, markdown-js, mustache, promised-htt...",Teleport dashboard,[],"[teleport, dashboard, packages, dependencies]",teleport-dashboard,0.0.5,149,1
436,"[async, coffee-script, colors, nock, pkginfo, ...","[async, colors, pkginfo, request, underscore]",Official node.js API client for scottyapp.com....,"[coffee-script, nock, should, vows]","[scottyapp, api, rest, restful, client]",scottyapp-api-client,0.0.4,436,1
492,"[async, colors, eventemitter2, request, util]","[async, colors, eventemitter2, request, util]","a decentralized, distributed, anonymous database",[],,hnet,0.0.1,492,1
506,"[async, coffee-script, coffeekup, commander, c...","[async, commander, connect, express, lingo, mi...",Full Stack Web Framework for Node.js,"[coffee-script, coffeekup, design.io, jade, ma...","[framework, rails, node]",coach,0.3.0,506,1


In [16]:
# Persist the filtered manifests as a single JSON document.
manifest_json = manifest_rows.to_json()
with open('manifest_dataset.json', 'w') as manifest_out:
    manifest_out.write(manifest_json)

### Vocabulary and package list

Here we create a list of all the items (dependencies) present in our dataset. We also create a vocabulary based on the tags found against these packages.

In [17]:
def clean_keywords(raw_keywords):
    """Normalise raw keyword strings into a set of tags.

    Each keyword is stripped of surrounding whitespace, lower-cased, and has
    its inner spaces replaced by hyphens. Keywords that are empty after
    stripping are dropped.

    Args:
        raw_keywords: Iterable of keyword strings.

    Returns:
        set[str] of normalised keywords.
    """
    stripped_keywords = (keyword.strip() for keyword in raw_keywords)
    # Truthiness test instead of `is not ''`: identity comparison against a
    # literal depends on interpreter string interning and raises a
    # SyntaxWarning on Python 3.8+.
    return {keyword.lower().replace(' ', '-') for keyword in stripped_keywords if keyword}


def clean_dependencies(raw_keywords):
    """Return the distinct names in ``raw_keywords``, lower-cased, as a set."""
    return {name.lower() for name in raw_keywords}

all_packages = set()
all_keywords = set()
list_of_manifest_list = []

# Walk the 'dependencies' column directly (no per-row Series from iterrows)
# and grow all_packages in place; rebuilding the set with union() for every
# row was accidentally quadratic. The names are lower-cased once per manifest
# via clean_dependencies instead of twice with duplicated logic.
for dependencies in manifest_rows['dependencies']:
    manifest_packages = clean_dependencies(dependencies)
    all_packages.update(manifest_packages)
    list_of_manifest_list.append(list(manifest_packages))

vocabulary = set()

package_tag_map = {}

print("Length of all packages list (no. of items): {}".format(len(all_packages)))

Length of all packages list (no. of items): 29526


In [18]:
# First take care of all the direct keywords
print("Creating direct tag mapping")
i=0

keywords_df = packages_with_keywords.loc[packages_with_keywords['name'].isin(all_packages), ['name','keywords']]

package_tag_map = {}
for k,g in keywords_df.groupby("name"):
    # When a name occurs on several rows only the first row's keywords are
    # used. The map is empty at this point and groupby keys are unique, so
    # the former package_tag_map.get(k, set()) seed was always empty.
    package_tag_map[k] = clean_keywords(g["keywords"].iloc[0])
    # In-place update; rebuilding the vocabulary with union() per package was quadratic.
    vocabulary.update(package_tag_map[k])
# Spot check. .get() so a dump that lacks this package does not abort the cell.
print(package_tag_map.get('cli-color'))

Creating direct tag mapping
{'terminal', 'log', 'logging', 'xterm', 'cli', 'ansi', 'color', 'shell', 'console'}


In [19]:
print("Processing first level of dependencies")
# Now the dependencies: the same package selection as for the keywords,
# but keeping each package's own dependency list.
in_manifest_packages = packages_with_keywords['name'].isin(all_packages)
dependencies_df = packages_with_keywords.loc[in_manifest_packages, ['name','dependencies']]

package_dep_map = dict()
all_first_lv_deps = set()

def clean_dependencies(raw_keywords):
    """Return the distinct names in ``raw_keywords``, lower-cased, as a set.

    NOTE(review): this repeats, with the same body, the clean_dependencies
    defined in an earlier cell of this notebook.
    """
    return {name.lower() for name in raw_keywords}

for k,g in dependencies_df.groupby("name"):
    # Only the first row per package name is consulted. package_dep_map is
    # empty before this loop and groupby keys are unique, so the former
    # package_dep_map.get(k, set()) seed was always empty.
    package_dep_map[k] = clean_dependencies(g["dependencies"].iloc[0])
    # In-place update instead of rebuilding the set with union() each time.
    all_first_lv_deps.update(package_dep_map[k])

keywords_df_deps = packages_with_keywords.loc[packages_with_keywords['name'].isin(all_first_lv_deps), ['name','keywords']]

extended_ptm = {}

for k,g in keywords_df_deps.groupby("name"):
    # NOTE(review): the tag set is seeded with package_dep_map[k], i.e. the
    # *dependency names* of k, so those names enter the vocabulary as tags.
    # This looks like a copy-paste of the loop above. It is kept unchanged
    # because the vocabulary size printed later and the model dimensions
    # used downstream depend on it; confirm before fixing.
    extended_ptm[k] = clean_keywords(package_dep_map.get(k, set()).union(set(g["keywords"].tolist()[0])))

for package_name in package_tag_map.keys():
    # Gather the tags of every first-level dependency of this package.
    more_keywords = set()
    for dependency in package_dep_map[package_name]:
        more_keywords.update(extended_ptm.get(dependency, ()))
    package_tag_map[package_name] = package_tag_map[package_name].union(more_keywords)
    vocabulary.update(more_keywords)

Processing first level of dependencies


In [20]:
print("Now making the maps")
# Package name <-> row index, numbered in the set's iteration order.
pkg_idx_map = {pkg: position for position, pkg in enumerate(all_packages)}
idx_pkg_map = dict(enumerate(all_packages))

# Tag <-> column index, numbered in the vocabulary's iteration order.
tag_idx_map = {tag: position for position, tag in enumerate(vocabulary)}
idx_tag_map = dict(enumerate(vocabulary))

Now making the maps


In [21]:
# Content matrix: one row per package, one column per tag; a cell is 1 when
# the package carries that tag.
content_matrix = np.zeros((len(all_packages), len(vocabulary)))

for row, pkg_name in enumerate(all_packages):
    tag_columns = [tag_idx_map[tag] for tag in package_tag_map[pkg_name]]
    if row == 0:
        # Show the column indices set for the first package as a sanity check.
        print("Setting to 1: {}".format(tag_columns))
    content_matrix[row, tag_columns] = 1

Setting to 1: [14211, 5772, 1082, 26460, 7738, 24695, 32528, 18816]


In [22]:
# Sizes, in order: tag vocabulary, package list, manifest list.
for collection in (vocabulary, all_packages, list_of_manifest_list):
    print(len(collection))

33347
29526
27936


### Save the "content matrix" that maps all of our packages to the "tag vocabulary" that we have created.

In [23]:
# np.save accepts a path directly and handles opening and closing the file.
np.save('content_matrix.npy', content_matrix)

### Save the "rating matrix". This contains our user-item i.e. stack-package pairs for the collaborative (probabilistic matrix factorization) part of the model.

In [24]:
# Rating matrix construction

with open('manifest_user_data.dat', 'w') as mud:
    for manifest in list_of_manifest_list:
        this_user_items = [pkg_idx_map[pkg] for pkg in manifest]
        mud.write("{} {}\n".format(len(this_user_items), " ".join(str(x) for x in this_user_items)))

### Data preparation to feed forward into our network: break the data up into a training and a test set.

In [27]:
import numpy as np
pairs_train = []
pairs_test = []
num_train_per_user = 5
user_id = 0
np.random.seed(123)

# Context manager so the handle is closed; iterating over a bare open()
# left it open until garbage collection.
with open("manifest_user_data.dat") as manifest_file:
    for line in manifest_file:
        arr = line.strip().split()
        # The first field is the item count; the rest are item indices.
        arr = np.asarray([int(x) for x in arr[1:]])
        n = len(arr)
        idx = np.random.permutation(n)
        # The first num_train_per_user shuffled items (or all of them, if the
        # user has fewer) go to training; whatever is left goes to testing.
        for i in range(min(num_train_per_user, n)):
            pairs_train.append((user_id, arr[idx[i]]))
        for i in range(num_train_per_user, n):
            pairs_test.append((user_id, arr[idx[i]]))
        user_id += 1
num_users = user_id
# reshape(-1, 2) keeps both arrays two-dimensional even when a split is
# empty; previously an empty test split made pairs_test[:, 1] fail.
pairs_train = np.asarray(pairs_train, dtype=np.int64).reshape(-1, 2)
pairs_test = np.asarray(pairs_test, dtype=np.int64).reshape(-1, 2)
num_items = np.concatenate([pairs_train[:, 1], pairs_test[:, 1]]).max() + 1
print("num_users=%d, num_items=%d" % (num_users, num_items))


def _write_adjacency_file(path, pairs, key_col, num_rows):
    """Write one "<count> <id> <id> ..." line per key in range(num_rows).

    Args:
        path: Output file path.
        pairs: 2-D integer array of (user, item) pairs.
        key_col: Column of ``pairs`` that selects the output row
            (0 = users, 1 = items); the other column supplies the ids.
        num_rows: Number of lines to write; keys without pairs get "0 ".
    """
    # Bucket the pairs in a single pass instead of building a boolean mask
    # over all pairs for every row. Ids keep their order of appearance in
    # ``pairs``, which is what the masked selection produced.
    grouped = [[] for _ in range(num_rows)]
    keys = pairs[:, key_col].tolist()
    values = pairs[:, 1 - key_col].tolist()
    for key, value in zip(keys, values):
        grouped[key].append(value)
    with open(path, "w") as fid:
        print(fid.name)
        for row_values in grouped:
            ids_str = " ".join(str(x) for x in row_values)
            fid.write("%d %s\n" % (len(row_values), ids_str))


# For each split: users file (row = user, ids = items), then items file
# (row = item, ids = users).
for split_name, split_pairs in (("train", pairs_train), ("test", pairs_test)):
    _write_adjacency_file("packagedata-" + split_name + "-" + str(num_train_per_user) + "-users.dat",
                          split_pairs, 0, num_users)
    _write_adjacency_file("packagedata-" + split_name + "-" + str(num_train_per_user) + "-items.dat",
                          split_pairs, 1, num_items)


num_users=27936, num_items=29526
packagedata-train-5-users.dat
packagedata-train-5-items.dat
packagedata-test-5-users.dat
packagedata-test-5-items.dat


### Pretraining the network in stacked denoising autoencoder fashion

- Pretrain the hidden layers
- Pretrain the latent layer
- Pretrain the network

In [40]:
# %load test_vae_package_data.py
%autoreload 2
import numpy as np
import tensorflow as tf
import logging
from aelib.vae import VariationalAutoEncoder
from aelib.utils import *
from scipy import sparse

if __name__ == '__main__':
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True, gpu_options=gpu_options))
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

    np.random.seed(0)
    tf.set_random_seed(0)
    init_logging("vae_pretrain_package_data.log")

    logging.info('loading data')

    # Context manager so the file is closed even if np.load raises.
    with open('content_matrix.npy', 'rb') as content_matrix_file:
        data = np.load(content_matrix_file)
    # csr_sparse = sparse.load_npz('content_matrix.npz')
    # data = csr_sparse.toarray()

    # Random ~80/20 row split into training and validation sets.
    idx = np.random.rand(data.shape[0]) < 0.8
    train_X = data[idx]
    test_X = data[~idx]
    logging.info('initializing sdae model')
    # input_dim is read from the loaded matrix. The previous literal 37617
    # matched neither the vocabulary size printed earlier in this notebook
    # (33347) nor the input_dim used by the CVAE training cell.
    # NOTE(review): assumes input_dim is the number of content columns; confirm
    # against aelib.vae.
    model = VariationalAutoEncoder(input_dim=data.shape[1], dims=[200, 100], z_dim=50,
                                   activations=['sigmoid', 'sigmoid'], epoch=[1, 1],
                                   noise='mask-0.3', loss='cross-entropy', lr=0.01, batch_size=1024, print_step=1)
    logging.info('fitting data starts...')
    model.fit(train_X, test_X)
# feat = model.transform(data)
# scipy.io.savemat('feat-dae.mat',{'feat': feat})
# np.savez("sdae-weights.npz", en_weights=model.weights, en_biases=model.biases,
# 	de_weights=model.de_weights, de_biases=model.de_biases)


2019-08-08 07:50:51,156 [3357] INFO     root: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:50:51,156 [INFO]: loading data
2019-08-08 07:51:15,219 [3357] INFO     root: initializing sdae model
2019-08-08 07:51:15,219 [INFO]: initializing sdae model
2019-08-08 07:51:15,219 [INFO]: initializing sdae model
2019-08-08 07:51:15,219 [INFO]: initializing sdae model
2019-08-08 07:51:15,219 [INFO]: initializing sdae model
2019-08-08 07:51:15,219 [INFO]: initializing sdae model
2019-08-08 07:51:15,219 [INFO]: initializing sdae m

2019-08-08 07:54:41,693 [INFO]: epoch 0: batch loss = 211.0083770751953, gen_loss=177.51190185546875, latent_loss=33.49647903442383, valid_loss=178.22448120117187
2019-08-08 07:54:41,693 [INFO]: epoch 0: batch loss = 211.0083770751953, gen_loss=177.51190185546875, latent_loss=33.49647903442383, valid_loss=178.22448120117187
2019-08-08 07:54:41,693 [INFO]: epoch 0: batch loss = 211.0083770751953, gen_loss=177.51190185546875, latent_loss=33.49647903442383, valid_loss=178.22448120117187
2019-08-08 07:54:41,693 [INFO]: epoch 0: batch loss = 211.0083770751953, gen_loss=177.51190185546875, latent_loss=33.49647903442383, valid_loss=178.22448120117187
2019-08-08 07:54:42,426 [3357] INFO     root: Weights saved at model/pretrain
2019-08-08 07:54:42,426 [INFO]: Weights saved at model/pretrain
2019-08-08 07:54:42,426 [INFO]: Weights saved at model/pretrain
2019-08-08 07:54:42,426 [INFO]: Weights saved at model/pretrain
2019-08-08 07:54:42,426 [INFO]: Weights saved at model/pretrain
2019-08-08 07:

## Training

We train the variational autoencoder and the PMF.

In [1]:
# %load test_cvae-packagedata5.py
import sys
sys.path.append("..")
from aelib.cvae import *
import numpy as np
import tensorflow as tf
import scipy.io
from scipy.sparse import load_npz
from aelib.utils import *

# Seed NumPy and TensorFlow with the same value, then set up logging.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
init_logging("cvae-packagedata-5.log")

def load_cvae_data():
  """Load the content matrix and the four rating files.

  Returns:
    dict with the key "content" (array loaded from content_matrix.npy) and
    the keys "train_users", "train_items", "test_users" and "test_items",
    each holding the result of load_rating on the matching
    packagedata-*-5-*.dat file under "./".
  """
  data_dir = "./"
  content = np.load('content_matrix.npy')
  return {
    "content": content,
    "train_users": load_rating(data_dir + "packagedata-train-5-users.dat"),
    "train_items": load_rating(data_dir + "packagedata-train-5-items.dat"),
    "test_users": load_rating(data_dir + "packagedata-test-5-users.dat"),
    "test_items": load_rating(data_dir + "packagedata-test-5-items.dat"),
  }

def load_rating(path):
  """Read a rating file into a list of integer id lists.

  Each line is expected to look like ``<count> <id> <id> ...``. The leading
  count is skipped and the remaining fields are parsed as ints. A line with
  no ids (count 0) or a blank line yields an empty list, so the result has
  exactly one entry per line of the file.

  Args:
    path: Path of the rating file.

  Returns:
    list[list[int]] with one inner list per line.

  Raises:
    ValueError: if an id field is not an integer.
  """
  arr = []
  # `with` closes the file; the original iterated over a bare open().
  with open(path) as rating_file:
    for line in rating_file:
      fields = line.split()
      # The original special-cased `a[0]==0`, which compared a str with an
      # int and was never true. Slicing already gives [] when there are no
      # ids, and it also tolerates a blank line (which used to raise
      # IndexError on a[0]).
      arr.append([int(x) for x in fields[1:]])
  return arr

# Hyper-parameters handed to the CVAE model.
params = Params()
params.lambda_u = 0.1
params.lambda_v = 10
params.lambda_r = 1
params.a = 1
params.b = 0.01
params.M = 300
params.n_epochs = 1
params.max_iter = 1

data = load_cvae_data()
num_factors = 50
# Sizes are derived from the loaded data instead of being hard-coded
# (previously 27936 / 29526 / 33347), so the cell stays correct when the
# dataset is regenerated. The rating files hold one line per user / item and
# the content matrix has one column per vocabulary tag.
# NOTE(review): assumes CVAE's input_dim is the content column count; confirm
# against aelib.cvae.
num_users = len(data["train_users"])
num_items = len(data["train_items"])
input_dim = data["content"].shape[1]
model = CVAE(num_users=num_users, num_items=num_items, num_factors=num_factors, params=params,
    input_dim=input_dim, dims=[200, 100], n_z=num_factors, activations=['sigmoid', 'sigmoid'],
    loss_type='cross-entropy', lr=0.001, random_seed=0, print_step=10, verbose=False)
model.load_model(weight_path="model/pretrain")

model.run(data["train_users"], data["train_items"], data["test_users"], data["test_items"],
   data["content"], params)
model.save_model(weight_path="weights/train/cvae-packagedata", pmf_path="weights/train/pmf-packagedata")

  return f(*args, **kwds)
  return f(*args, **kwds)


only initializing x_recon
confirmed


2019-08-08 11:58:43,836 [8153] INFO     root: Loading weights from model/pretrain
2019-08-08 11:58:43,836 [INFO]: Loading weights from model/pretrain


INFO:tensorflow:Restoring parameters from model/pretrain


2019-08-08 11:58:43,837 [8153] INFO     tensorflow: Restoring parameters from model/pretrain
2019-08-08 11:58:43,837 [INFO]: Restoring parameters from model/pretrain
2019-08-08 12:00:55,309 [8153] INFO     root: [#epoch=000000], loss=1733259.76306, neg_likelihood=41497.92745, gen_loss=114.59472
2019-08-08 12:00:55,309 [INFO]: [#epoch=000000], loss=1733259.76306, neg_likelihood=41497.92745, gen_loss=114.59472
2019-08-08 12:00:55,659 [8153] INFO     root: Weights saved at /Users/avgupta/weights/train/cvae-packagedata
2019-08-08 12:00:55,659 [INFO]: Weights saved at /Users/avgupta/weights/train/cvae-packagedata
2019-08-08 12:00:55,792 [8153] INFO     root: Matrices saved at /Users/avgupta/weights/train/pmf-packagedata
2019-08-08 12:00:55,792 [INFO]: Matrices saved at /Users/avgupta/weights/train/pmf-packagedata
