# Docker Analysis

This notebook analyzes the Docker installation reports and identifies which modules failed to install.

In [1]:
import json
from collections import Counter

import pandas as pd

# Reload the helper module so that edits made to analysis_helpers are picked up
# without restarting the kernel, then import its helpers.
import analysis_helpers, importlib
importlib.reload(analysis_helpers)
from analysis_helpers import var, relative_var, dbmt_relative_var, getitem

from db import connect

## Query

Select the module rows (together with the test flags and the repository popularity columns) of notebooks that are:

- existing

  ```n.skip & 2048 = 0```

- valid
  
  ```NOT (n.kernel = 'no-kernel' AND n.nbformat = '0')
  AND n.processed & 15 = 0 AND n.skip & (1024 + 512) = 0```


- non-duplicated
  
  ```n.skip & (1024 + 512 + 128) = 0```

- executed

  ```n.max_execution_count > -0```
 

- unambiguous

  ```n.processing_cells = 0 AND n.unambiguous = 1```
 

- Python notebooks

  ```n.language = 'python' AND LEFT(n.language_version, 1) IN ('2', '3')```
  


In [2]:
%%time
# Load the notebook_modules rows (m.*) of the notebooks that pass every filter
# listed in the "Query" section, together with has_tests/has_unittests from
# notebooks_bool_aggregates and stargazers/forks/starforks from notebooks.
with connect() as session:
    print("Query Notebooks")
    # The three tables are joined implicitly in the WHERE clause through
    # m.notebook_id = n.id and a.notebook_id = n.id.
    # The n.skip & (1024 + 512) test is also covered by the
    # n.skip & (1024 + 512 + 128) test; both are kept to mirror the
    # "valid" and "non-duplicated" filters one-to-one.
    modules = pd.read_sql("""
        SELECT m.*, a.has_tests, a.has_unittests, n.stargazers, n.forks, n.starforks
        FROM notebooks n, notebook_modules m, notebooks_bool_aggregates a
        WHERE NOT (n.kernel = 'no-kernel' AND n.nbformat = '0')
        AND n.processed & 15 = 0
        AND n.skip & 2048 = 0
        AND n.skip & (1024 + 512) = 0
        AND n.skip & (1024 + 512 + 128) = 0
        AND n.language = 'python'
        AND LEFT(n.language_version, 1) IN ('2', '3')
        AND m.notebook_id = n.id
        AND a.notebook_id = n.id
        AND n.processing_cells = 0 AND n.unambiguous = 1
        AND n.max_execution_count > -0
        
    """, session.connection())

Query Notebooks
CPU times: user 7.59 s, sys: 571 ms, total: 8.16 s
Wall time: 19.4 s


In [3]:
# Import-list columns of `modules`: {any, local, external} x
# {any, import_from, import, load_ext}.
columns = [
    "any_any", "local_any", "external_any",
    "any_import_from", "local_import_from", "external_import_from",
    "any_import", "local_import", "external_import",
    "any_load_ext", "local_load_ext", "external_load_ext",
]
count_columns = [c + "_count" for c in columns]


def _as_name_set(value):
    """Return the module names held in ``value`` as a set.

    ``value`` is normally a comma-separated string; empty fragments are
    dropped. A value that is already a set is returned unchanged, so this cell
    can be re-run after the column has been converted. Any other non-string
    value (e.g. a missing entry) yields an empty set instead of raising.
    """
    if isinstance(value, (set, frozenset)):
        return value
    if not isinstance(value, str):
        return set()
    return {name for name in value.split(",") if name}


for column in columns:
    # Replace the raw column by the parsed set of imported names.
    modules[column] = modules[column].apply(_as_name_set)
    # Reduce dotted names to their first component, e.g. "matplotlib.pyplot"
    # -> "matplotlib". NOTE(review): getitem is presumably "item at index 0,
    # falling back to x" -- confirm in analysis_helpers.
    modules["toplevel_" + column] = modules[column].apply(lambda imports: {
        getitem(x.split("."), 0, x) for x in imports
    })
    modules["toplevel_" + column + "_count"] = modules["toplevel_" + column].apply(len)

modules.head(5)

Unnamed: 0,id,repository_id,notebook_id,index,index_count,any_any,any_any_count,local_any,local_any_count,external_any,...,toplevel_local_import,toplevel_local_import_count,toplevel_external_import,toplevel_external_import_count,toplevel_any_load_ext,toplevel_any_load_ext_count,toplevel_local_load_ext,toplevel_local_load_ext_count,toplevel_external_load_ext,toplevel_external_load_ext_count
0,445493,53320,383583,1.0,1,"{matplotlib.pyplot, pandas, numpy}",3,{},0,"{matplotlib.pyplot, pandas, numpy}",...,{},0,"{matplotlib, pandas, numpy}",3,{},0,{},0,{},0
1,118930,36449,284179,157843.0,5,"{weakref, os, sys}",3,{},0,"{weakref, os, sys}",...,{},0,"{weakref, os, sys}",3,{},0,{},0,{},0
2,526212,177716,1031182,,0,{},0,{},0,{},...,{},0,{},0,{},0,{},0,{},0
3,1020920,127862,796666,1.0,1,{collections},1,{},0,{collections},...,{},0,{},0,{},0,{},0,{},0
4,962139,65891,460855,4.0,2,"{scipy, numpy}",2,{},0,"{scipy, numpy}",...,{},0,{numpy},1,{},0,{},0,{},0


In [4]:
# One Counter per Python version is appended to each of these lists.
failed_to_install_counters = []
failed_after_install_counters = []
ignored_counters = []
imported_counters = []
pip_installable_counters = []
# Number of report entries over all categories / over the two failure
# categories, summed across all Python versions.
total = 0
failed = 0

# Report key -> list that collects the per-version Counter for that key.
counters_by_category = {
    "ignored": ignored_counters,
    "imported": imported_counters,
    "pip_installable": pip_installable_counters,
    "failed_to_install": failed_to_install_counters,
    "failed_after_install": failed_after_install_counters,
}
failure_categories = ("failed_to_install", "failed_after_install")

for name in ["27", "34", "35", "36", "37"]:
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open("../archaeology/modules/report-py{}.json".format(name), "r", encoding="utf-8") as f:
        report = json.load(f)

    for category, counters in counters_by_category.items():
        entries = report[category]
        # Each entry is unpacked as a (module name, count) pair.
        counters.append(Counter({module: count for module, count in entries}))
        total += len(entries)
        if category in failure_categories:
            failed += len(entries)


In [5]:
# Number of report entries summed over all version reports, and how many of
# those entries belong to the two failure categories.
total, failed

(42996, 32808)

In [6]:
def _merge_counters(counters):
    """Add up a list of Counters with ``+``, starting from an empty Counter."""
    merged = Counter()
    for partial in counters:
        merged = merged + partial
    return merged


# Collapse the per-version Counters of every category into a single Counter.
failed_to_install = _merge_counters(failed_to_install_counters)
failed_after_install = _merge_counters(failed_after_install_counters)
ignored = _merge_counters(ignored_counters)
imported = _merge_counters(imported_counters)
pip_installable = _merge_counters(pip_installable_counters)


In [7]:
# Combine both failure kinds into a single Counter keyed by module name.
# NOTE: this rebinds `failed`, which the report-loading cell used as an integer
# count; from here on `failed` is a Counter.
failed = (failed_after_install + failed_to_install)
# Show every failed module, highest count first.
failed.most_common()

[('graphlab', 16407),
 ('caffe', 3781),
 ('ggplot', 1810),
 ('testCases', 1694),
 ('numpy_pack', 1441),
 ('cartopy', 1281),
 ('cs231n', 1273),
 ('pymc', 1246),
 ('MySQLdb', 1232),
 ('ROOT', 1082),
 ('YearPlotter', 976),
 ('Eigen_decomp', 794),
 ('recon_plot', 794),
 ('computeStats', 794),
 ('src', 777),
 ('visuals', 769),
 ('osgeo', 728),
 ('lr_utils', 710),
 ('import_modules', 668),
 ('kid_readout', 666),
 ('imblearn', 666),
 ('util', 656),
 ('planar_utils', 635),
 ('pymc3', 608),
 ('lsst', 606),
 ('spark_PCA', 590),
 ('quantopian', 578),
 ('gdal', 564),
 ('cntk', 561),
 ('testCases_v2', 553),
 ('dnn_utils_v2', 537),
 ('dautil', 531),
 ('pyfits', 527),
 ('Image', 520),
 ('rdkit', 519),
 ('mysql', 496),
 ('cvxpy', 481),
 ('mlp', 479),
 ('edward', 479),
 ('model', 479),
 ('fc_interface', 469),
 ('tf_utils', 467),
 ('pynq', 465),
 ('reg_utils', 456),
 ('init_utils', 451),
 ('dnn_app_utils_v2', 448),
 ('models', 444),
 ('gc_utils', 442),
 ('lucem_illud', 430),
 ('root_numpy', 428),
 ('obs

In [8]:
# Count, for every top-level module name, how many rows of `modules` list it
# in their "toplevel_any_any" collection.
counter = Counter()
for toplevel_names in modules["toplevel_any_any"]:
    counter.update(toplevel_names)

In [9]:
%%time
# For each failed module that also appears in `counter`, keep its import count.
# Iterating in most_common() order gives the same insertion order, and hence
# the same ordering among equal counts, as filling the Counter in a loop.
newcounter = Counter({
    module: counter[module]
    for module, _ in failed.most_common()
    if module in counter
})

newcounter.most_common(15)

CPU times: user 16.3 ms, sys: 9 µs, total: 16.3 ms
Wall time: 16.6 ms


[('graphlab', 11092),
 ('cs231n', 10198),
 ('torch', 7745),
 ('gensim', 7174),
 ('cPickle', 3901),
 ('urllib2', 3455),
 ('torchvision', 3433),
 ('geopandas', 3350),
 ('visuals', 3209),
 ('problem_unittests', 2447),
 ('testCases', 1733),
 ('spacy', 1620),
 ('netCDF4', 1582),
 ('util', 1561),
 ('pymc3', 1533)]

In [10]:
# (printed label, name passed to var(), key looked up in newcounter)
reported_modules = [
    ("Notebooks that import graphlab:", "a7_module_graphlab", "graphlab"),
    ("Notebooks that import pytorch:", "a7_module_torch", "torch"),
    ("Notebooks that import gensim:", "a7_module_gensim", "gensim"),
    ("Notebooks that import torchvision:", "a7_module_torchvision", "torchvision"),
    ("Notebooks that import geopandas:", "a7_module_geopandas", "geopandas"),
]
for label, variable, module in reported_modules:
    print(label, var(variable, newcounter[module], "{:,}"))

Notebooks that import graphlab: 11,092
Notebooks that import pytorch: 7,745
Notebooks that import gensim: 7,174
Notebooks that import torchvision: 3,433
Notebooks that import geopandas: 3,350


In [11]:
# Sum of the per-module counts over both failure kinds.
sum((failed_after_install + failed_to_install).values())

183780

The external modules with the most failures, ranked by the number of notebooks that import them, are graphlab, torch, gensim, torchvision, and geopandas.