In [5]:
from github import Github
from scipy import linalg
from os.path import isdir
# from commonjs_package_tokenizer import tokenize_package
import npm_crawler as npm
import os
import commonjs_package_tokenizer as t
import packages_term_indexer as pti_mod
import pandas as pd

In [115]:
%%time

# spider npm's "most starred repositories" page for
# package name and a github url.
packages = npm.get_most_starred_packages(max_pages=10)

# providing a github login means we can exceed the public API request limit of 60 requests per hour.
# We need to look up approximately 400 packages.
gh_user = os.environ.get('GITHUB_API_USER')
gh_token = os.environ.get('GITHUB_API_TOKEN')
gh = Github(gh_user, gh_token)

# for each of those packages, perform a shallow git clone
for name, gh_url in packages:
    clone_url = 'git://github.com/{}.git'.format(gh_url)
    clone_dir = '.cache/packages/{}'.format(name)
    
    # use github API to determine if this repo is predominantly javascript.
    # we want to exclude coffeescript/typescript repos because their tokens
    # are significantly different.
    is_javascript = gh.is_repo_javascript(gh_url)
    
    if is_javascript and not isdir(clone_dir):
        # git clone the repo. assign output to an unused variable to supress noise
        _ = !git clone $clone_url $clone_dir --depth=1

CPU times: user 661 ms, sys: 114 ms, total: 775 ms
Wall time: 1.15 s


In [127]:
%%time
reload(t)
reload(pti_mod)

# build a document-term matrix for all the packages we downloaded
pti = pti_mod.PackagesTermIndexer()

i = 0
for dir_name in os.listdir('.cache/packages'):
    if dir_name == '.gitkeep': continue
    i += 1
    if i > 30: break
    pkg = pti_mod.Package(dir_name)
    for token in t.tokenize_package(dir_name):
        pkg.register_term(token)
    pti.append(pkg)

print "{} packages indexed".format(i)

31 packages indexed
CPU times: user 11.9 s, sys: 580 ms, total: 12.4 s
Wall time: 14.8 s


In [117]:
%%time

# Ask the indexer for its word-frequency matrix and its tf-idf matrix.
# Both are wrapped in DataFrames further down, labelled with
# pti.term_indices_ as rows and pti.package_names_ as columns.
# NOTE(review): the two calls are kept in this order in case tfidf_matrix()
# relies on state prepared by word_frequency_matrix() -- confirm.
wf_matrix = pti.word_frequency_matrix()
tfidf_matrix = pti.tfidf_matrix()

CPU times: user 7.88 s, sys: 131 ms, total: 8.01 s
Wall time: 8.09 s


In [121]:
%%time
# Unpack the three values returned by the indexer's svd() call.
# NOTE(review): judging by the names these are the term matrix, the singular
# values and the transposed document matrix -- confirm against
# PackagesTermIndexer.svd().
T,sigma,D_trans = pti.svd()

CPU times: user 158 ms, sys: 49.3 ms, total: 208 ms
Wall time: 192 ms


In [122]:
# Label the word-frequency matrix for inspection: one row per term, one
# column per package, then preview the first few rows.
df = pd.DataFrame(
    data=wf_matrix,
    index=pti.term_indices_,
    columns=pti.package_names_,
)
df.head()

Unnamed: 0,agenda,angular,angular2,apn,async,autoprefixer,aws-sdk,babelify,backbone,bcrypt,...,bunyan,busboy,canvas,chai,chalk,cheerio,chokidar,classnames,clean-css,cli
humanInterval,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
all,1,48,4,1,0,0,18,0,1,0,...,1,2,0,5,0,3,0,0,6,0
code,3,25,0,0,1,0,16,1,0,0,...,0,31,0,0,0,0,0,0,1,0
computeFromInterval(),1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
this._defaultConcurrency,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
# Same preview as for the word-frequency matrix, but for the tf-idf
# weights: terms as rows, packages as columns. Note that this rebinds `df`.
df = pd.DataFrame(
    data=tfidf_matrix,
    index=pti.term_indices_,
    columns=pti.package_names_,
)
df.head()

Unnamed: 0,agenda,angular,angular2,apn,async,autoprefixer,aws-sdk,babelify,backbone,bcrypt,...,bunyan,busboy,canvas,chai,chalk,cheerio,chokidar,classnames,clean-css,cli
humanInterval,1.930794,0.0,0.0,0,0,0,0.0,0,0,0,...,0,0.0,0,0.0,0,0,0,0,0.0,0
all,0.965397,5.61471,2.321928,1,0,0,4.247928,0,1,0,...,1,1.584963,0,2.584963,0,2,0,0,2.807355,0
code,1.930794,4.70044,0.0,0,1,0,4.087463,1,0,0,...,0,5.0,0,0.0,0,0,0,0,1.0,0
computeFromInterval(),0.965397,0.0,0.0,0,0,0,0.0,0,0,0,...,0,0.0,0,0.0,0,0,0,0,0.0,0
this._defaultConcurrency,0.965397,0.0,0.0,0,0,0,0.0,0,0,0,...,0,0.0,0,0.0,0,0,0,0,0.0,0


In [124]:
%%time
reload(t)
other = pti_mod.Package('other-pkg')
for token in t.tokenize_package('../../../other-pkg'):
    other.register_term(token)

CPU times: user 302 ms, sys: 173 ms, total: 474 ms
Wall time: 813 ms


In [132]:
# Pass the separately tokenized package `other` to the indexer's fold_wfm()
# and fold_tfidf() methods.
# NOTE(review): presumably these express `other` in the index's existing
# term space for the word-frequency and tf-idf weightings -- confirm.
# Neither result is read by the cells visible after this one.
folded_wfm = pti.fold_wfm(other)
folded_tfidf = pti.fold_tfidf(other)

In [133]:
# Print what fold_svd() returns for `other`. The recorded output is a
# 1 x 30 array.
# NOTE(review): presumably the coordinates of `other` in the SVD space
# computed earlier -- confirm against PackagesTermIndexer.fold_svd().
print pti.fold_svd(other)

[[-0.0299616  -0.00423243 -0.04781998  0.02189804 -0.03306221  0.01435506
  -0.02467444  0.00156258 -0.00996914  0.03335881 -0.01402059 -0.02043276
  -0.0025971  -0.00655165 -0.04050136  0.02570397 -0.00255934  0.06531314
   0.02913039 -0.00820125  0.00478438  0.00222559  0.01429583  0.01099879
   0.01802387 -0.04887145  0.04000966 -0.03340513  0.05284112  0.27794328]]
