In [1]:
import os
import pickle
import numpy as np
import pandas as pd

import npm_crawler as npm
import commonjs_package_tokenizer as t
import packages_term_indexer as pti_mod

from scipy import linalg
from github import Github
from os.path import isdir

In [2]:
%%time

# spider npm's "most starred repositories" page for
# package name and a github url.
packages = list(npm.get_most_starred_packages(max_pages=40))

# providing a github login means we can exceed the public API request limit of 60 requests per hour.
# We need to look up approximately 400 packages.
gh_user = os.environ.get('GITHUB_API_USER')
gh_token = os.environ.get('GITHUB_API_TOKEN')
gh = Github(gh_user, gh_token)

# for each of those packages, perform a shallow git clone
for name, gh_url in packages:
    clone_url = 'git://github.com/{}.git'.format(gh_url)
    clone_dir = '.cache/packages/{}'.format(name)
    
    # use github API to determine if this repo is predominantly javascript.
    # we want to exclude coffeescript/typescript repos because their tokens
    # are significantly different.
    is_javascript = gh.is_repo_javascript(gh_url)
    
    if is_javascript and not isdir(clone_dir):
        # git clone the repo. assign output to an unused variable to supress noise
        _ = !git clone $clone_url $clone_dir --depth=1

CPU times: user 4.3 s, sys: 155 ms, total: 4.46 s
Wall time: 4.9 s


In [3]:
%%time
reload(t)
reload(pti_mod)

# build a document-term matrix for all the packages we downloaded
pti = pti_mod.PackagesTermIndexer()

i = 0
for name, gh_url in packages:
    if not isdir('.cache/packages/{}'.format(name)): continue
    if i >= 800: break
    i += 1

    pkg = pti_mod.Package(name, gh_url)
    for token in t.tokenize_package(name):
        pkg.register_term(token)
    pti.append(pkg)

print "{} packages indexed".format(i)
doer = pti.fit_trim()

800 packages indexed
CPU times: user 4min 33s, sys: 15.9 s, total: 4min 48s
Wall time: 4min 56s


In [4]:
%%time
# Keep the word-frequency matrix returned by the fitted indexer under its own name.
wf_matrix = doer.word_frequency_matrix()

CPU times: user 324 ms, sys: 8.09 ms, total: 332 ms
Wall time: 346 ms


In [5]:
%%time
# Keep the tf-idf matrix returned by the fitted indexer under its own name.
tfidf_matrix = doer.tfidf_matrix()

CPU times: user 214 ms, sys: 4.22 ms, total: 218 ms
Wall time: 229 ms


In [6]:
%%time
# Unpack the three values returned by doer.svd().
# NOTE(review): presumably term factors, singular values and transposed
# document factors -- confirm against packages_term_indexer.
T,sigma,D_trans = doer.svd()

CPU times: user 255 ms, sys: 15 ms, total: 270 ms
Wall time: 280 ms


In [7]:
# Print the first ten keys of the global term count next to the first ten
# entries of term_indices_ so the two orderings can be compared by eye.
print(list(doer._global_term_count.keys())[:10])
print(doer.term_indices_[:10])

['=', '{', '*', 'var', '}', 'if', 'return', 'function', '//', 'the']
['=', '{', '*', 'var', '}', 'if', 'return', 'function', '//', 'the']


In [8]:
# Word-frequency matrix as a labelled frame: rows are labelled with
# term_indices_, columns with package_names_.
df = pd.DataFrame(
    wf_matrix,
    index=doer.term_indices_,
    columns=doer.package_names_,
)
df.head()

Unnamed: 0,gulp,async,request,lodash,browserify,grunt,pm2,commander,mongoose,mocha,...,gsap,gulp-htmlhint,node-readability,updtr,cli-spinner,simple-git,rollup,grunt-html2js,grunt-html2js.1,updtr.1
=,51,159,407,390,321,162,1259,70,1195,1136,...,558,16,52,53,15,168,370,72,72,53
{,40,226,226,202,246,126,991,36,603,676,...,272,6,29,55,6,170,365,70,70,55
*,0,21,0,68,2,83,488,157,1584,1398,...,481,0,12,0,0,208,5,10,10,0
var,36,92,109,153,129,76,663,19,435,455,...,93,7,25,32,9,59,31,50,50,32
},23,97,205,85,126,59,492,18,379,332,...,180,2,17,29,4,61,246,37,37,29


In [9]:
# tf-idf matrix as a labelled frame: rows are labelled with term_indices_,
# columns with package_names_.
df = pd.DataFrame(tfidf_matrix, index=doer.term_indices_, columns=doer.package_names_)
# Only the last expression of a cell is displayed: show every row from the
# top of the frame through the 'const' label (label slices are inclusive).
df.loc[:'const']

Unnamed: 0,gulp,async,request,lodash,browserify,grunt,pm2,commander,mongoose,mocha,...,gsap,gulp-htmlhint,node-readability,updtr,cli-spinner,simple-git,rollup,grunt-html2js,grunt-html2js.1,updtr.1
=,1.153424,1.481515,1.754774,1.742350,1.685673,1.486938,2.083936,1.244336,2.068719,2.053951,...,1.846692,0.827055,1.158984,1.164441,0.809358,1.497490,1.727023,1.252446,1.252446,1.164441
{,1.140725,1.666421,1.666421,1.632096,1.692359,1.488024,2.119438,1.109192,1.967032,2.002080,...,1.723102,0.597739,1.044770,1.236496,0.597739,1.579402,1.813154,1.309398,1.309398,1.236496
*,0.000000,1.568465,0.000000,2.148481,0.557461,2.248296,3.142144,2.568875,3.738862,3.675522,...,3.134828,0.000000,1.301513,0.000000,0.000000,2.710821,0.909179,1.216746,1.216746,0.000000
var,1.116095,1.400977,1.452867,1.556867,1.504502,1.342623,2.008548,0.925948,1.878533,1.892396,...,1.404283,0.642733,1.007042,1.080732,0.711704,1.265517,1.071221,1.215284,1.215284,1.080732
},0.924379,1.333598,1.549683,1.295606,1.408996,1.190894,1.803500,0.856429,1.727777,1.689375,...,1.512051,0.319546,0.840703,0.989283,0.468126,1.200431,1.602478,1.058040,1.058040,0.989283
if,0.880199,1.045745,1.450215,1.165928,1.531909,1.145390,1.908538,0.967806,1.737974,1.657368,...,1.500486,0.422165,0.756723,0.880199,0.334558,1.191320,1.511325,1.035760,1.035760,0.880199
return,0.794723,1.378358,1.426475,1.548692,1.491350,0.992861,2.031321,0.728215,1.653954,1.823255,...,1.412759,0.533408,0.763134,0.644923,0.000000,1.362447,1.563829,0.850088,0.850088,0.644923
function,0.725056,1.710362,1.486303,1.219156,1.495990,0.836094,1.532248,0.561176,1.557165,1.557165,...,0.241685,0.000000,0.241685,1.161867,0.241685,1.622734,1.553134,0.383062,0.383062,1.161867
//,0.985301,1.164347,1.357436,1.066428,0.775682,1.445284,1.605306,0.961095,1.588372,1.211882,...,0.332238,0.332238,0.541857,0.332238,0.000000,0.541857,1.262402,0.905958,0.905958,0.332238
the,0.310670,1.787870,1.442706,1.770955,0.721353,1.664430,1.779492,1.342693,2.379176,1.980533,...,2.211641,0.000000,0.872161,0.000000,0.000000,1.770955,1.149615,0.621340,0.621340,0.000000


In [10]:
%%time
with open('.cache/pickle', 'w') as serialized_fd:
    pickle.dump(doer, serialized_fd)

CPU times: user 711 ms, sys: 45.2 ms, total: 757 ms
Wall time: 765 ms


In [11]:
%%time
# Re-import the tokenizer module before using it.
reload(t)
# Build a Package named 'other-pkg' and register every token produced by
# tokenizing the directory below.
other = pti_mod.Package('other-pkg', 'other/other')
# NOTE(review): the path is relative to the working directory -- confirm it
# exists before running this cell.
for token in t.tokenize_package('../../../other-pkg'):
    other.register_term(token)

CPU times: user 278 ms, sys: 143 ms, total: 421 ms
Wall time: 518 ms


In [12]:
%%time

# folded_wfm = pti.fold_wfm(other)
# folded_tfidf = pti.fold_tfidf(other)
folded_svd = doer.fold_svd(other)
print folded_svd.shape

(1, 200)
CPU times: user 3.66 ms, sys: 825 µs, total: 4.49 ms
Wall time: 2.47 ms


In [13]:
# Keep only the first of the three values returned by doer.svd() and show its shape.
T,_,_ = doer.svd()
T.shape

(200, 200)

In [14]:
def cosine_sim(a,b):
    """Return the cosine similarity of ``a`` and ``b`` as a scalar.

    ``np.dot(a, b)`` must yield a 2-D, single-element result (for example a
    1xN row multiplied by an Nx1 column), because the value is read from
    position ``[0, 0]`` of the normalized product.

    If either argument has zero norm the cosine is undefined; 0.0 is
    returned instead of dividing by zero (which would produce NaN).
    """
    # Compute the product first so incompatible shapes still raise.
    product = np.dot(a, b)
    norm_a = linalg.norm(a)
    norm_b = linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    ret = product / norm_a / norm_b
    return ret[0,0]

In [15]:
%%time

other_doc_svd = doer.fold_svd(other)

cosines = []
for idx, pkg in enumerate(doer.package_names_):
    pkg_svd_vector = doer.svd()[2][:,idx]
    cos_svd = cosine_sim(other_doc_svd, pkg_svd_vector.T)
    cosines.append(cos_svd)

CPU times: user 57.5 ms, sys: 2.74 ms, total: 60.2 ms
Wall time: 58.6 ms


In [16]:
%%time

other_doc_wfm = doer.fold_wfm(other)
other_doc_tfidf = doer.fold_tfidf(other)
other_doc_svd = doer.fold_svd(other)
other_doc_svd_w = doer.fold_svd_wfm(other)

_,_,D_t = doer.svd()
_,_,Dw_t = doer.svd_wfm()
cosines = []
for idx, pkg in enumerate(doer.package_names_):
    pkg_svd_vector = D_t[:,idx]
    pkg_wfm_vector = doer.word_frequency_matrix()[:,idx]
    pkg_tfidf_vector = doer.tfidf_matrix()[:,idx]
    pkg_svd_w_vector = Dw_t[:, idx]

    cos_wfm = cosine_sim(other_doc_wfm, pkg_wfm_vector)
    cos_tfidf = cosine_sim(other_doc_tfidf, pkg_tfidf_vector)
    cos_svd = cosine_sim(other_doc_svd, pkg_svd_vector)
    cos_svd2 = cosine_sim(other_doc_svd_w, pkg_svd_w_vector)
    cosines.append([cos_wfm, cos_tfidf, cos_svd, cos_svd2])

CPU times: user 2min 32s, sys: 457 ms, total: 2min 32s
Wall time: 2min 33s


In [22]:
# Cosine table: rows are labelled with package_github_urls_, one column per
# similarity measure.
df = pd.DataFrame(
    cosines,
    index=doer.package_github_urls_,
    columns=[
        'wfm cosine',
        'tfidf cosine',
        'svd cosine (tfidf)',
        'svd cosine (wfm)',
    ],
)
df.head(20)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
gulpjs/gulp,0.891499,0.554683,0.010539,-0.026279
caolan/async,0.812602,0.638968,0.03109,-0.119499
request/request,0.90751,0.522324,-0.119423,-0.025122
lodash/lodash,0.796841,0.515154,-0.078962,0.110328
substack/node-browserify,0.867803,0.616395,-0.015567,-0.093146
gruntjs/grunt,0.84559,0.564191,0.030017,0.008254
Unitech/pm2,0.816642,0.609625,0.02346,-0.079798
tj/commander.js,0.4617,0.479682,0.052173,0.306596
Automattic/mongoose,0.621841,0.587146,-0.13232,-0.002456
mochajs/mocha,0.622886,0.487286,0.01732,-0.021776


In [23]:
# Ten rows with the highest 'wfm cosine' value.
df.sort_values(by='wfm cosine', ascending=False).head(10)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
typicode/lowdb,0.956122,0.681455,0.193045,-0.122726
tmpvar/jsdom,0.946214,0.785214,0.113753,-0.03216
hapijs/hapi,0.942515,0.813039,0.151185,-0.007051
tkellen/node-liftoff,0.938672,0.645644,0.188227,-0.052415
assaf/zombie,0.9376,0.747826,0.049885,0.045956
flatiron/cradle,0.925962,0.599827,0.174824,-0.06615
achingbrain/pm2-web,0.924032,0.684409,0.170383,-0.108661
driftyco/ionic-cli,0.922947,0.62447,-0.09085,-0.072793
jonkemp/gulp-useref,0.921974,0.542677,-0.104265,0.024166
hapijs/joi,0.916186,0.738691,-0.013973,0.00432


In [24]:
# Ten rows with the highest 'tfidf cosine' value.
df.sort_values(by='tfidf cosine', ascending=False).head(10)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
hapijs/hapi,0.942515,0.813039,0.151185,-0.007051
hapijs/wreck,0.915246,0.789695,0.216174,-0.044098
hapijs/lab,0.915361,0.787593,0.203169,0.042335
tmpvar/jsdom,0.946214,0.785214,0.113753,-0.03216
assaf/zombie,0.9376,0.747826,0.049885,0.045956
kcbanner/connect-mongo,0.858063,0.742989,0.223624,-0.133674
hapijs/joi,0.916186,0.738691,-0.013973,0.00432
TryGhost/Ghost,0.911955,0.722256,0.038463,0.065611
mattdesl/budo,0.891514,0.718931,0.087661,0.118397
DaftMonk/generator-angular-fullstack,0.858493,0.71568,0.085043,0.050577


In [25]:
# Ten rows with the highest 'svd cosine (tfidf)' value.
df.sort_values(by='svd cosine (tfidf)', ascending=False).head(10)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
kcbanner/connect-mongo,0.858063,0.742989,0.223624,-0.133674
hapijs/wreck,0.915246,0.789695,0.216174,-0.044098
hapijs/lab,0.915361,0.787593,0.203169,0.042335
maxogden/websocket-stream,0.828329,0.521541,0.199227,0.028985
typicode/lowdb,0.956122,0.681455,0.193045,-0.122726
substack/hyperquest,0.841908,0.599551,0.189084,0.009773
tkellen/node-liftoff,0.938672,0.645644,0.188227,-0.052415
bahmutov/next-update,0.845586,0.670346,0.183987,0.045248
vojtajina/grunt-bump,0.707589,0.406429,0.179167,0.103634
flatiron/cradle,0.925962,0.599827,0.174824,-0.06615


In [26]:
# Ten rows with the highest 'svd cosine (wfm)' value.
df.sort_values(by='svd cosine (wfm)', ascending=False).head(10)

Unnamed: 0,wfm cosine,tfidf cosine,svd cosine (tfidf),svd cosine (wfm)
nodejitsu/node-http-proxy,0.36965,0.415926,-0.040986,0.375926
caolan/highland,0.826121,0.598436,0.03125,0.341196
tj/commander.js,0.4617,0.479682,0.052173,0.306596
request/request-promise,0.891105,0.524941,-0.063685,0.30422
rackt/redux,0.456012,0.546762,0.028369,0.293482
chjj/marked,0.871399,0.423206,-0.048322,0.293413
aacerox/node-rest-client,0.762054,0.448322,-0.015489,0.276248
flatiron/prompt,0.760433,0.418703,-0.134469,0.260714
Gozala/crypto,0.389367,0.271264,-0.069198,0.246596
visionmedia/supertest,0.519356,0.313592,0.054026,0.236798
