In [1]:
import os
import subprocess
from os.path import isdir

import numpy as np
import pandas as pd
from github import Github
from scipy import linalg

# from commonjs_package_tokenizer import tokenize_package
import commonjs_package_tokenizer as t
import npm_crawler as npm
import packages_term_indexer as pti_mod

In [2]:
%%time

# spider npm's "most starred repositories" page for
# package name and a github url.
packages = npm.get_most_starred_packages(max_pages=10)

# providing a github login means we can exceed the public API request limit of 60 requests per hour.
# We need to look up approximately 400 packages.
gh_user = os.environ.get('GITHUB_API_USER')
gh_token = os.environ.get('GITHUB_API_TOKEN')
gh = Github(gh_user, gh_token)

# for each of those packages, perform a shallow git clone
for name, gh_url in packages:
    clone_url = 'git://github.com/{}.git'.format(gh_url)
    clone_dir = '.cache/packages/{}'.format(name)
    
    # use github API to determine if this repo is predominantly javascript.
    # we want to exclude coffeescript/typescript repos because their tokens
    # are significantly different.
    is_javascript = gh.is_repo_javascript(gh_url)
    
    if is_javascript and not isdir(clone_dir):
        # git clone the repo. assign output to an unused variable to supress noise
        _ = !git clone $clone_url $clone_dir --depth=1

CPU times: user 487 ms, sys: 18.2 ms, total: 505 ms
Wall time: 505 ms


In [3]:
%%time
reload(t)
reload(pti_mod)

# build a document-term matrix for all the packages we downloaded
pti = pti_mod.PackagesTermIndexer()

i = 0
for dir_name in os.listdir('.cache/packages'):
    if dir_name == '.gitkeep': continue
    i += 1
    if i > 50: break
    pkg = pti_mod.Package(dir_name)
    for token in t.tokenize_package(dir_name):
        pkg.register_term(token)
    pti.append(pkg)

print "{} packages indexed".format(i)

51 packages indexed
CPU times: user 10 s, sys: 148 ms, total: 10.1 s
Wall time: 10.4 s


In [4]:
%%time
# Word-frequency matrix over the indexed packages. A later cell labels its
# rows with pti.term_indices_ and its columns with pti.package_names_.
wf_matrix = pti.word_frequency_matrix()

CPU times: user 368 ms, sys: 47.2 ms, total: 416 ms
Wall time: 414 ms


In [5]:
%%time
# tf-idf matrix from the same indexer; a later cell labels it with the same
# term rows / package columns as wf_matrix. The exact weighting is defined in
# packages_term_indexer and is not visible here.
tfidf_matrix = pti.tfidf_matrix()

CPU times: user 2.33 s, sys: 70.4 ms, total: 2.4 s
Wall time: 2.62 s


In [6]:
%%time
# pti.svd() returns three factors. Later cells index the columns of the third
# one by package position. Presumably these are the term matrix, the singular
# values and the transposed package matrix -- TODO confirm against
# packages_term_indexer.
T,sigma,D_trans = pti.svd()

CPU times: user 937 ms, sys: 116 ms, total: 1.05 s
Wall time: 1.09 s


In [7]:
# Label the word-frequency matrix (terms down the side, packages across the
# top) and preview the first rows.
df = pd.DataFrame(
    data=wf_matrix,
    index=pti.term_indices_,
    columns=pti.package_names_
)
df.head(n=5)

Unnamed: 0,agenda,angular,apn,async,babelify,backbone,bcryptjs,bearcat,benchmark,bl,...,cron,d3,debug,dist,dnode,dotenv,download,ecstatic,ejs,elasticsearch
humanInterval,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
all,1,51,1,0,0,1,4,2,0,0,...,0,3,0,0,0,0,0,0,2,14
code,3,25,0,1,1,0,3,0,0,0,...,0,2,0,0,1,0,0,0,0,7
computeFromInterval(),1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
this._defaultConcurrency,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Label the tf-idf matrix (terms down the side, packages across the top)
# and preview the first rows.
df = pd.DataFrame(
    data=tfidf_matrix,
    index=pti.term_indices_,
    columns=pti.package_names_
)
df.head(n=5)

Unnamed: 0,agenda,angular,apn,async,babelify,backbone,bcryptjs,bearcat,benchmark,bl,...,cron,d3,debug,dist,dnode,dotenv,download,ecstatic,ejs,elasticsearch
humanInterval,1.901551,0.0,0,0,0,0,0.0,0.0,0,0,...,0,0.0,0,0,0,0,0,0,0.0,0.0
all,1.0,5.70044,1,0,0,1,2.321928,1.584963,0,0,...,0,2.0,0,0,0,0,0,0,1.584963,3.906891
code,2.0,4.70044,0,1,1,0,2.0,0.0,0,0,...,0,1.584963,0,0,1,0,0,0,0.0,3.0
computeFromInterval(),1.0,0.0,0,0,0,0,0.0,0.0,0,0,...,0,0.0,0,0,0,0,0,0,0.0,0.0
this._defaultConcurrency,1.0,0.0,0,0,0,0,0.0,0.0,0,0,...,0,0.0,0,0,0,0,0,0,0.0,0.0


In [9]:
%%time
reload(t)
other = pti_mod.Package('other-pkg')
for token in t.tokenize_package('../../../other'):
    other.register_term(token)

CPU times: user 288 ms, sys: 40.9 ms, total: 329 ms
Wall time: 383 ms


In [10]:
%%time

# folded_wfm = pti.fold_wfm(other)
# folded_tfidf = pti.fold_tfidf(other)
folded_svd = pti.fold_svd(other)
print folded_svd.shape

(1, 50)
CPU times: user 83.4 ms, sys: 2.36 ms, total: 85.8 ms
Wall time: 117 ms


In [11]:
# Inspect the shape of the first SVD factor only; the other two are discarded.
# Note: binding `_` here shadows IPython's "last output" variable.
T,_,_ = pti.svd()
T.shape

(88885, 50)

In [12]:
def cosine_sim(a, b):
    """Return the cosine of the angle between two vectors.

    Both arguments are flattened first, so any mix of 1-D arrays, row
    vectors of shape (1, n), column vectors of shape (n, 1), ``np.matrix``
    objects or plain sequences is accepted, as long as both hold the same
    number of elements.

    Parameters
    ----------
    a, b : array_like
        The vectors to compare.

    Returns
    -------
    float
        ``dot(a, b) / (|a| * |b|)``, or NaN when either vector has zero
        norm (the cosine is undefined in that case).

    Raises
    ------
    ValueError
        If the flattened vectors differ in length (raised by ``np.dot``).
    """
    # Flatten so the result no longer depends on np.dot happening to return
    # a 2-D (1, 1) result, which is only the case for some input shapes.
    u = np.asarray(a).ravel()
    v = np.asarray(b).ravel()

    norm_u = linalg.norm(u)
    norm_v = linalg.norm(v)
    if norm_u == 0 or norm_v == 0:
        # Undefined for a zero vector; return NaN explicitly instead of
        # dividing by zero and emitting a runtime warning.
        return np.nan

    return np.dot(u, v) / norm_u / norm_v

In [13]:
%%time

other_doc_svd = pti.fold_svd(other)

cosines = []
for idx, pkg in enumerate(pti.package_names_):
    pkg_svd_vector = pti.svd()[2][:,idx]
    cos_svd = cosine_sim(other_doc_svd, pkg_svd_vector.T)
    cosines.append(cos_svd)

CPU times: user 70.4 ms, sys: 1.21 ms, total: 71.6 ms
Wall time: 86.7 ms


In [14]:
# Fold the out-of-corpus package into each of the four representations.
other_doc_wfm = pti.fold_wfm(other)
other_doc_tfidf = pti.fold_tfidf(other)
other_doc_svd = pti.fold_svd(other)
other_doc_svd_w = pti.fold_svd_wfm(other)

# Fetch every corpus-side matrix once, up front. None of them depends on the
# loop variable, so requesting them per package is wasted work.
corpus_wfm = pti.word_frequency_matrix()
corpus_tfidf = pti.tfidf_matrix()
_, _, D_t = pti.svd()
_, _, Dw_t = pti.svd_wfm()

# NOTE(review): in the recorded output both SVD cosine columns are ~1e-15 for
# every package except 'benchmark' (exactly -1.0 / 1.0), whose wfm and tfidf
# cosines are NaN. That pattern suggests an all-zero 'benchmark' column
# dominating the folded SVD vector -- confirm before trusting the SVD columns.

# One row per indexed package: [wfm, tfidf, svd (tfidf), svd (wfm)] cosine.
cosines = []
for idx in range(len(pti.package_names_)):
    cos_wfm = cosine_sim(other_doc_wfm, corpus_wfm[:, idx])
    cos_tfidf = cosine_sim(other_doc_tfidf, corpus_tfidf[:, idx])
    cos_svd = cosine_sim(other_doc_svd, D_t[:, idx])
    cos_svd2 = cosine_sim(other_doc_svd_w, Dw_t[:, idx])
    cosines.append([cos_wfm, cos_tfidf, cos_svd, cos_svd2])

In [15]:
# One row per indexed package, one column per similarity measure.
df = pd.DataFrame(
    data=cosines,
    index=pti.package_names_,
    columns=[
        'wfm cosine',
        'tfidf cosine',
        'svd cosine (tfidf)',
        'svd cosine (wfm)'
    ]
)
df.head(n=20)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
agenda,0.868696,0.233013,-2.632613e-15,5.345094e-14
angular,0.557249,0.171729,7.524648e-16,5.108036e-15
apn,0.862723,0.245765,1.291488e-15,-2.823808e-14
async,0.783686,0.282547,6.512151e-15,-4.290806e-14
babelify,0.773764,0.20055,4.781219e-16,6.422344e-14
backbone,0.534042,0.183004,1.266778e-15,-2.24596e-14
bcryptjs,0.632546,0.183535,-6.000272e-16,-1.682457e-15
bearcat,0.479265,0.192318,5.8962e-16,1.527741e-14
benchmark,,,-1.0,1.0
bl,0.782591,0.225968,2.11891e-15,2.47486e-14


In [16]:
# Ten packages with the highest word-frequency cosine.
(df.sort_values('wfm cosine', ascending=False)
   .head(n=10))

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
ecstatic,0.88266,0.238679,1.311272e-15,-1.940243e-14
cheerio,0.870821,0.243453,-3.538905e-16,-1.938607e-14
agenda,0.868696,0.233013,-2.632613e-15,5.345094e-14
bower,0.862955,0.248647,2.70978e-15,2.611987e-14
apn,0.862723,0.245765,1.291488e-15,-2.823808e-14
bluebird,0.86255,0.230062,1.477434e-15,4.028391e-15
d3,0.85776,0.175521,5.934607e-16,2.165427e-14
blessed,0.856502,0.195492,5.710506e-16,-1.474026e-15
browserify,0.850246,0.260168,5.019812e-15,2.587634e-14
bootstrap,0.83804,0.218035,1.197561e-15,1.231516e-14


In [17]:
# Ten packages with the highest tf-idf cosine.
(df.sort_values('tfidf cosine', ascending=False)
   .head(n=10))

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
connect-mongo,0.832192,0.319445,2.72573e-14,4.549013e-14
bookshelf,0.737584,0.291285,1.086508e-14,2.775806e-14
async,0.783686,0.282547,6.512151e-15,-4.290806e-14
elasticsearch,0.773149,0.277964,3.653581e-15,1.849218e-14
bunyan,0.780084,0.264472,3.282722e-15,-4.179543e-15
dnode,0.836333,0.263376,5.204565e-15,-1.304658e-14
browserify,0.850246,0.260168,5.019812e-15,2.587634e-14
bower,0.862955,0.248647,2.70978e-15,2.611987e-14
apn,0.862723,0.245765,1.291488e-15,-2.823808e-14
concat-stream,0.784157,0.243463,7.05743e-15,-5.592155e-14


In [18]:
# Ten packages with the highest cosine in the tf-idf based SVD space.
(df.sort_values('svd cosine (tfidf)', ascending=False)
   .head(n=10))

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
connect-mongo,0.832192,0.319445,2.72573e-14,4.549013e-14
cron,0.727569,0.217225,1.336354e-14,-4.522303e-14
bookshelf,0.737584,0.291285,1.086508e-14,2.775806e-14
connect-redis,0.608567,0.200791,8.802491e-15,5.19575e-15
concat-stream,0.784157,0.243463,7.05743e-15,-5.592155e-14
async,0.783686,0.282547,6.512151e-15,-4.290806e-14
dnode,0.836333,0.263376,5.204565e-15,-1.304658e-14
browserify,0.850246,0.260168,5.019812e-15,2.587634e-14
elasticsearch,0.773149,0.277964,3.653581e-15,1.849218e-14
download,0.684489,0.207123,3.460313e-15,5.230514e-14


In [19]:
# Ten packages with the highest cosine in the word-frequency based SVD space.
(df.sort_values('svd cosine (wfm)', ascending=False)
   .head(n=10))

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
benchmark,,,-1.0,1.0
cli,0.343351,0.065453,-2.205567e-14,1.003686e-12
crawler,0.581122,0.145958,-1.612223e-14,1.476196e-13
babelify,0.773764,0.20055,4.781219e-16,6.422344e-14
agenda,0.868696,0.233013,-2.632613e-15,5.345094e-14
download,0.684489,0.207123,3.460313e-15,5.230514e-14
connect-mongo,0.832192,0.319445,2.72573e-14,4.549013e-14
chai,0.304579,0.221114,-5.134105e-16,4.339271e-14
cors,0.696256,0.222179,-3.718539e-15,3.792304e-14
classnames,0.752609,0.22993,-3.819556e-15,3.243648e-14
