In [1]:
from github import Github
from scipy import linalg
from os.path import isdir
# from commonjs_package_tokenizer import tokenize_package
import npm_crawler as npm
import os
import commonjs_package_tokenizer as t
import numpy as np
import packages_term_indexer as pti_mod
import pandas as pd
import pickle

In [2]:
%%time

# spider npm's "most starred repositories" page for
# package name and a github url.
packages = npm.get_most_starred_packages(max_pages=10)

# providing a github login means we can exceed the public API request limit of 60 requests per hour.
# We need to look up approximately 400 packages.
gh_user = os.environ.get('GITHUB_API_USER')
gh_token = os.environ.get('GITHUB_API_TOKEN')
gh = Github(gh_user, gh_token)

# for each of those packages, perform a shallow git clone
for name, gh_url in packages:
    clone_url = 'git://github.com/{}.git'.format(gh_url)
    clone_dir = '.cache/packages/{}'.format(name)
    
    # use github API to determine if this repo is predominantly javascript.
    # we want to exclude coffeescript/typescript repos because their tokens
    # are significantly different.
    is_javascript = gh.is_repo_javascript(gh_url)
    
    if is_javascript and not isdir(clone_dir):
        # git clone the repo. assign output to an unused variable to supress noise
        _ = !git clone $clone_url $clone_dir --depth=1

CPU times: user 590 ms, sys: 55.6 ms, total: 646 ms
Wall time: 976 ms


In [3]:
%%time
reload(t)
reload(pti_mod)

# build a document-term matrix for all the packages we downloaded
pti = pti_mod.PackagesTermIndexer()

i = 0
for dir_name in os.listdir('.cache/packages'):
    if dir_name == '.gitkeep': continue
    i += 1
#     if i > 50: break
    pkg = pti_mod.Package(dir_name)
    for token in t.tokenize_package(dir_name):
        pkg.register_term(token)
    pti.append(pkg)

print "{} packages indexed".format(i)
doer = pti.fit_trim()

267 packages indexed
CPU times: user 1min 3s, sys: 2.59 s, total: 1min 5s
Wall time: 1min 18s


In [4]:
%%time
wf_matrix = doer.word_frequency_matrix()

CPU times: user 276 ms, sys: 26.4 ms, total: 303 ms
Wall time: 284 ms


In [5]:
%%time
tfidf_matrix = doer.tfidf_matrix()

CPU times: user 146 ms, sys: 12.4 ms, total: 159 ms
Wall time: 154 ms


In [6]:
%%time
T,sigma,D_trans = doer.svd()

CPU times: user 218 ms, sys: 26.2 ms, total: 244 ms
Wall time: 250 ms


In [7]:
print doer._global_term_count.keys()[0:10]
print doer.term_indices_[0:10]

['=', '{', '*', 'var', '}', 'return', 'if', 'the', 'function', '+']
['=', '{', '*', 'var', '}', 'return', 'if', 'the', 'function', '+']


In [8]:
df = pd.DataFrame(wf_matrix, index=doer.term_indices_, columns=doer.package_names_)
df.head()

Unnamed: 0,agenda,angular,apn,async,babelify,backbone,bcryptjs,bearcat,benchmark,bl,...,webtorrent,when,winston,wiredep,ws,x-ray,xlsx,xtend,yargs,yo
=,95,6152,181,159,23,14,260,167,0,31,...,304,420,389,47,769,161,3228,33,213,129
{,59,8981,132,226,10,15,76,143,0,20,...,178,424,244,68,544,121,2630,39,159,86
*,15,4399,44,21,0,0,179,112,0,0,...,65,494,69,29,287,106,163,0,1,25
var,22,4471,75,92,9,6,56,58,0,5,...,155,183,67,28,247,100,1523,25,125,99
},35,3721,88,97,6,8,61,82,0,15,...,114,179,138,35,372,90,909,20,93,36


In [9]:
df = pd.DataFrame(tfidf_matrix, index=doer.term_indices_, columns=doer.package_names_)
df.head(50)

Unnamed: 0,agenda,angular,apn,async,babelify,backbone,bcryptjs,bearcat,benchmark,bl,...,webtorrent,when,winston,wiredep,ws,x-ray,xlsx,xtend,yargs,yo
=,1.772635,3.388369,2.021056,1.971022,1.234246,1.051713,2.161067,1.98997,0,1.345972,...,2.221571,2.346749,2.317045,1.50344,2.581228,1.975846,3.137964,1.369516,2.083959,1.890382
{,1.60759,3.574164,1.920131,2.130035,0.941502,1.08862,1.705538,1.951331,0,1.195391,...,2.036758,2.376273,2.159996,1.662466,2.47392,1.886235,3.092061,1.44839,1.9927,1.75348
*,1.848495,5.593216,2.537915,2.060809,0.0,0.0,3.462163,3.151766,0,0.0,...,2.793258,4.136601,2.832487,2.26759,3.775516,3.115391,3.400099,0.0,0.462124,2.172184
var,1.245399,3.338648,1.72014,1.80032,0.914572,0.772903,1.605874,1.619572,0,0.711676,...,2.005771,2.07134,1.675962,1.337469,2.189899,1.833097,2.91107,1.294095,1.920941,1.829145
},1.324911,3.039873,1.659554,1.69517,0.719449,0.812366,1.525899,1.633749,0,1.025091,...,1.754313,1.919958,1.824391,1.324911,2.189347,1.667771,2.519091,1.125632,1.679763,1.335041
return,1.287467,3.394243,1.570395,1.758708,0.680599,0.464581,1.361198,1.752048,0,1.050817,...,1.836908,2.209863,1.724308,1.172472,1.765264,1.77808,2.778101,0.464581,1.377783,1.287467
if,1.224752,2.923022,1.450554,1.341346,0.858255,0.76009,1.64239,1.612095,0,1.082998,...,1.825248,1.257321,1.853413,1.189217,2.244518,1.630553,1.810366,0.429128,1.599287,1.272641
the,0.79547,3.972847,2.023461,2.288919,0.0,1.425864,2.386409,1.912052,0,0.0,...,1.912052,2.893968,2.386409,1.823599,2.445968,1.425864,2.79304,0.0,1.193204,1.625726
function,0.663378,2.938881,1.358477,2.021854,0.738527,0.285701,1.465442,1.191352,0,1.05722,...,1.800997,2.21558,1.811302,1.116203,1.845467,1.358477,2.584774,0.802065,1.620618,1.510045
+,0.884064,2.878568,0.884064,1.654588,0.0,0.827294,1.695897,1.090476,0,0.46707,...,1.747718,1.151314,1.511539,0.684245,1.931559,0.827294,2.639552,0.0,1.204527,1.41667


In [10]:
%%time
with open('.cache/pickle', 'w') as serialized_fd:
    pickle.dump(doer, serialized_fd)

CPU times: user 988 ms, sys: 73.3 ms, total: 1.06 s
Wall time: 1.06 s


In [11]:
%%time
reload(t)
other = pti_mod.Package('other-pkg')
for token in t.tokenize_package('../../../cinch-api'):
    other.register_term(token)

CPU times: user 280 ms, sys: 54.6 ms, total: 334 ms
Wall time: 461 ms


In [12]:
%%time

# folded_wfm = pti.fold_wfm(other)
# folded_tfidf = pti.fold_tfidf(other)
folded_svd = doer.fold_svd(other)
print folded_svd.shape

(1, 267)
CPU times: user 7.46 ms, sys: 1.44 ms, total: 8.9 ms
Wall time: 4.89 ms


In [13]:
T,_,_ = doer.svd()
T.shape

(1000, 267)

In [14]:
def cosine_sim(a,b):
    ret = np.dot(a, b)/linalg.norm(a)/linalg.norm(b)
#     return linalg.norm(ret)
    return ret[0,0]

In [15]:
%%time

other_doc_svd = doer.fold_svd(other)

cosines = []
for idx, pkg in enumerate(doer.package_names_):
    pkg_svd_vector = doer.svd()[2][:,idx]
    cos_svd = cosine_sim(other_doc_svd, pkg_svd_vector.T)
    cosines.append(cos_svd)

CPU times: user 26.4 ms, sys: 2.9 ms, total: 29.3 ms
Wall time: 24.2 ms


In [16]:
%%time

other_doc_wfm = doer.fold_wfm(other)
other_doc_tfidf = doer.fold_tfidf(other)
other_doc_svd = doer.fold_svd(other)
other_doc_svd_w = doer.fold_svd_wfm(other)

# other_doc_wfm = pti.fold_wfm(pti._packages[0])
# other_doc_tfidf = pti.fold_tfidf(pti._packages[0])
# other_doc_svd = pti.fold_svd(pti._packages[0])


_,_,D_t = doer.svd()
_,_,Dw_t = doer.svd_wfm()
cosines = []
for idx, pkg in enumerate(doer.package_names_):
    pkg_svd_vector = D_t[:,idx]
    pkg_wfm_vector = doer.word_frequency_matrix()[:,idx]
    pkg_tfidf_vector = doer.tfidf_matrix()[:,idx]
    pkg_svd_w_vector = Dw_t[:, idx]
    
    cos_wfm = cosine_sim(other_doc_wfm, pkg_wfm_vector)
    cos_tfidf = cosine_sim(other_doc_tfidf, pkg_tfidf_vector)
    cos_svd = cosine_sim(other_doc_svd, pkg_svd_vector)
    cos_svd2 = cosine_sim(other_doc_svd_w, pkg_svd_w_vector)
    cosines.append([cos_wfm, cos_tfidf, cos_svd, cos_svd2])

In [17]:
df = pd.DataFrame(cosines, index=doer.package_names_, columns=['wfm cosine', 'tfidf cosine', 'svd cosine (tfidf)', 'svd cosine (wfm)'])
df.head(20)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
agenda,0.891875,0.424031,3.129954e-15,4.010271e-15
angular,0.561672,0.250068,8.100300000000001e-17,-7.246252e-16
apn,0.875364,0.447486,2.945451e-16,-5.100641e-14
async,0.803992,0.515599,2.845232e-15,-1.51651e-14
babelify,0.815245,0.285086,-1.771613e-14,-6.970669e-14
backbone,0.597295,0.388325,1.020385e-14,-1.279354e-13
bcryptjs,0.651197,0.253523,8.151142e-16,-1.380266e-14
bearcat,0.516361,0.326057,-1.91732e-15,-4.530438e-15
benchmark,,,-0.2918656,-0.6211962
bl,0.826275,0.359106,-7.005506e-16,-5.089729e-14


In [18]:
df.sort_values(by='wfm cosine', ascending=False).head(10)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
jsdom,0.94119,0.604526,5.013009e-15,4.118695e-15
hapi,0.93766,0.634347,4.492637e-15,-9.088306e-15
ionic,0.917161,0.490678,-1.288113e-15,-5.316401e-16
gulp-useref,0.912775,0.42067,-4.289398e-15,-8.561476e-14
less,0.910267,0.492691,6.299564e-16,-2.59143e-15
ecstatic,0.908024,0.457958,1.53763e-15,-8.848916e-15
formidable,0.906487,0.450944,8.128092e-15,5.435686e-14
joi,0.905993,0.530852,1.939391e-16,-3.621852e-15
request,0.902398,0.423112,-9.323602e-16,1.635333e-14
http-server,0.893932,0.383568,-9.1229e-15,-4.322269e-14


In [19]:
df.sort_values(by='tfidf cosine', ascending=False).head(10)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
hapi,0.93766,0.634347,4.492637e-15,-9.088306e-15
jsdom,0.94119,0.604526,5.013009e-15,4.118695e-15
connect-mongo,0.851852,0.589345,1.411008e-14,2.027189e-15
karma,0.887911,0.563788,2.047253e-15,3.671273e-15
generator-angular-fullstack,0.837581,0.554348,1.936956e-15,-6.39151e-15
joi,0.905993,0.530852,1.939391e-16,-3.621852e-15
react-router,0.663988,0.526875,2.589239e-15,-3.951169e-15
elasticsearch,0.780828,0.524924,7.344983e-16,2.854406e-15
browserify,0.861864,0.521676,1.680218e-15,1.197611e-14
knex,0.834694,0.521626,8.59259e-16,3.047959e-15


In [20]:
df.sort_values(by='svd cosine (tfidf)', ascending=False).head(10)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
i18n,,,0.01658889,-0.7835456
gulp-htmlmin,0.828775,0.348638,2.191758e-14,-1.935931e-13
standard,0.710477,0.246094,1.754527e-14,4.709307e-13
express-jwt,0.788964,0.384628,1.57052e-14,2.33744e-13
gulp-ng-annotate,0.829659,0.422704,1.50432e-14,1.177869e-14
connect-mongo,0.851852,0.589345,1.411008e-14,2.027189e-15
require-dir,0.512151,0.151954,1.320332e-14,1.583596e-13
gulp-replace,0.833987,0.429373,1.265776e-14,3.773577e-14
grunt-contrib-clean,0.702683,0.377749,1.175363e-14,5.907138e-14
npm-check,0.7237,0.435647,1.077576e-14,-2.935436e-15


In [21]:
df.sort_values(by='svd cosine (wfm)', ascending=False).head(10)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
validator,0.0,0.0,3.566608e-15,9.072335e-13
helmet,0.754851,0.287143,-7.085324e-15,8.914736e-13
gulp-coffee,0.779009,0.354627,-6.539492e-15,6.646621e-13
standard,0.710477,0.246094,1.754527e-14,4.709307e-13
gulp-sass,0.79414,0.294872,-3.024566e-15,2.965373e-13
node-schedule,0.698107,0.260996,-3.660942e-15,2.771193e-13
express-jwt,0.788964,0.384628,1.57052e-14,2.33744e-13
gulp-uglify,0.848786,0.37932,-9.421427e-15,2.2384e-13
gulp-uncss,0.78097,0.35041,-7.096757e-15,2.215636e-13
split,0.815371,0.457052,4.463673e-15,2.103327e-13
