# Experiment Template

Duplicate this notebook to have all you need to kickstart a new experiment.

In [2]:
# Import numpy

import numpy as np

In [3]:
# Import from system
import os
import json
import random

In [4]:
# Import scikit-learn (mainly for train/test splits)
import sklearn
from sklearn.metrics import roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.utils.fixes import signature
from sklearn.preprocessing import LabelBinarizer

In [5]:
# Import graph framework
import networkx as nx
from networkx.readwrite import json_graph

In [24]:
# Import and configure plotting library
import matplotlib.pyplot as plt
%matplotlib inline

step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})

## Configuration

In [25]:
# Define paths to data and models
DATA_PATH = 'data/keras-example/graph/train/'
PREFIX = 'keras'

## Load data

In [30]:
var_map = json.load(open(DATA_PATH+PREFIX+"-var_map.json"))

In [27]:
def fix_encoding(d):
    enc_d = {}
    for k,v in d.items():
        enc_k = k.encode("utf-8")
        enc_v = v.encode("utf-8")
        enc_d[enc_k] = enc_v
    return enc_d

In [31]:
var_map

{'39': 'h5py',
 '43': 'model',
 '44': 'input_tensors',
 '71': 'layer_map',
 '74': 'tensor_map',
 '82': 'input_layers',
 '85': 'input_tensors',
 '88': 'layer',
 '89': '_input_layers',
 '92': 'input_tensor',
 '96': 'batch_input_shape',
 '99': 'dtype',
 '102': 'sparse',
 '105': 'name',
 '109': 'append',
 '113': 'newly_created_input_layer',
 '115': '_keras_history',
 '127': '_original',
 '128': '_cloned',
 '131': '_input_layers',
 '141': 'input_tensors',
 '146': '_input_tensors',
 '150': 'i',
 '151': 'x',
 '158': 'is_keras_tensor',
 '162': 'name',
 '163': 'name',
 '165': '_input_layers',
 '170': 'input_tensor',
 '182': 'append',
 '186': 'original_input_layer',
 '188': '_keras_history',
 '193': 'newly_created_input_layer',
 '195': '_keras_history',
 '207': 'append',
 '211': 'input_tensors',
 '215': 'x',
 '216': 'y',
 '219': 'inputs',
 '231': 'depth_keys',
 '235': 'keys',
 '236': '_nodes_by_depth',
 '240': 'sort',
 '245': 'depth',
 '248': 'nodes',
 '250': '_nodes_by_depth',
 '255': 'node',
 

In [32]:
flatten = lambda l: [y for x in l for y in x]

In [33]:
import csv

In [34]:
with open('token_names.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        token_names = row

In [35]:
import re

In [36]:
def camel_case_split(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return [m.group(0) for m in matches]

In [37]:
vocab = list(set(var_map.values()))

In [39]:
split_vocab = [v.split('_') for v in vocab]
split_vocab

[['', 'recurrent', 'dropout', 'mask'],
 ['output', 'mask'],
 ['recurrent', 'constraint'],
 ['string', ''],
 ['rmtree'],
 ['', 'SEQUENCE', 'COUNTER'],
 ['moving', 'mean'],
 ['numdigits'],
 ['cpu', 'merge'],
 ['module', 'objects'],
 ['first', 'layer'],
 ['node'],
 ['element', 'select'],
 ['avg'],
 ['yt'],
 ['metrics'],
 ['total', 'size'],
 ['batch', 'array'],
 ['directory'],
 ['n'],
 ['original', 'node', 'index'],
 ['rnn'],
 ['param', 'values'],
 ['stride', 'd'],
 ['hsplit'],
 ['cell', 'contents'],
 ['input', 'h'],
 ['separable', 'conv2d'],
 ['cols'],
 ['tensor', 'shape'],
 ['tensor', 'connection'],
 ['cuda'],
 ['model', 'weights', 'group'],
 ['state', 'updates'],
 ['seen'],
 ['legacy', 'zeropadding2d', 'support'],
 ['x', 'aggregate'],
 ['u'],
 ['block', 'size'],
 ['shared'],
 ['cache', 'dir'],
 ['TimeseriesGenerator'],
 ['download'],
 ['b', 'carry'],
 ['inbound', 'names'],
 ['slice', 'row'],
 ['sparse', 'tensor', 'to', 'dense'],
 ['subset'],
 ['open'],
 ['max', 'pool3d'],
 ['sha256'],
 

In [40]:
for t in token_names:
    split_vocab.append([t])

In [41]:
split_vocab[2000:]

[['output', 'loss'],
 ['backend'],
 ['non', 'tensors'],
 ['x', 'c'],
 ['', 'callbacks'],
 ['AvgPool3D'],
 ['', '', 'dict', '', ''],
 ['untar', 'fpath'],
 ['files'],
 ['', 'callable', 'fn'],
 ['uid'],
 ['', 'make', 'train', 'function'],
 ['ins'],
 ['batch', 'ids'],
 ['input', 'conv'],
 ['updated', 'log', 'p', 'prev'],
 ['prev', 'total', 'width'],
 ['apply', 'channel', 'shift'],
 ['update'],
 ['distribution'],
 ['skip', 'target', 'weighing', 'indices'],
 ['extend'],
 ['iterations'],
 ['run', 'metadata'],
 ['has', 'seq'],
 ['map'],
 ['insecure'],
 ['permute', 'dimensions'],
 ['MSLE'],
 ['b', 'constraint'],
 ['rankdir'],
 ['embeddings', 'freq'],
 ['deepcopy'],
 ['', 'seen', 'so', 'far'],
 ['non', 'repeats'],
 ['num', 'gpus'],
 ['num', 'static', 'element'],
 ['int', 'shape'],
 ['active', 'skip', 'idxs'],
 ['bool'],
 ['proceed'],
 ['y', 'train'],
 ['', 'call'],
 ['saver'],
 ['add', 'summary'],
 ['new', 'shape'],
 ['rnn', 'constants'],
 ['', 'uses', 'learning', 'phase'],
 ['top', 'paths'],
 [

In [42]:
s_vocab = []
for vs in split_vocab:
    f = []
    for v in vs:
        f.append(camel_case_split(v))
    s_vocab.append(flatten(f))

In [43]:
s_vocab += [['[MASK]']]

In [44]:
s_vocab += [['[END]']]

In [45]:
s_vocab += [['[UNK]']]

In [46]:
s_vocab[2000:]

[['output', 'loss'],
 ['backend'],
 ['non', 'tensors'],
 ['x', 'c'],
 ['callbacks'],
 ['Avg', 'Pool3D'],
 ['dict'],
 ['untar', 'fpath'],
 ['files'],
 ['callable', 'fn'],
 ['uid'],
 ['make', 'train', 'function'],
 ['ins'],
 ['batch', 'ids'],
 ['input', 'conv'],
 ['updated', 'log', 'p', 'prev'],
 ['prev', 'total', 'width'],
 ['apply', 'channel', 'shift'],
 ['update'],
 ['distribution'],
 ['skip', 'target', 'weighing', 'indices'],
 ['extend'],
 ['iterations'],
 ['run', 'metadata'],
 ['has', 'seq'],
 ['map'],
 ['insecure'],
 ['permute', 'dimensions'],
 ['MSLE'],
 ['b', 'constraint'],
 ['rankdir'],
 ['embeddings', 'freq'],
 ['deepcopy'],
 ['seen', 'so', 'far'],
 ['non', 'repeats'],
 ['num', 'gpus'],
 ['num', 'static', 'element'],
 ['int', 'shape'],
 ['active', 'skip', 'idxs'],
 ['bool'],
 ['proceed'],
 ['y', 'train'],
 ['call'],
 ['saver'],
 ['add', 'summary'],
 ['new', 'shape'],
 ['rnn', 'constants'],
 ['uses', 'learning', 'phase'],
 ['top', 'paths'],
 ['weighted', 'metrics'],
 ['kept', 'n

In [47]:
flat_vocab = [y for x in s_vocab for y in x]

In [48]:
flat_vocab[-2]

'[END]'

In [49]:
unique_vocab = list(set(flat_vocab))

In [50]:
vocab_size = len(set(flat_vocab))

In [51]:
vocab_size

1413

In [52]:
s_vocab

[['recurrent', 'dropout', 'mask'],
 ['output', 'mask'],
 ['recurrent', 'constraint'],
 ['string'],
 ['rmtree'],
 ['SEQUENCE', 'COUNTER'],
 ['moving', 'mean'],
 ['numdigits'],
 ['cpu', 'merge'],
 ['module', 'objects'],
 ['first', 'layer'],
 ['node'],
 ['element', 'select'],
 ['avg'],
 ['yt'],
 ['metrics'],
 ['total', 'size'],
 ['batch', 'array'],
 ['directory'],
 ['n'],
 ['original', 'node', 'index'],
 ['rnn'],
 ['param', 'values'],
 ['stride', 'd'],
 ['hsplit'],
 ['cell', 'contents'],
 ['input', 'h'],
 ['separable', 'conv2d'],
 ['cols'],
 ['tensor', 'shape'],
 ['tensor', 'connection'],
 ['cuda'],
 ['model', 'weights', 'group'],
 ['state', 'updates'],
 ['seen'],
 ['legacy', 'zeropadding2d', 'support'],
 ['x', 'aggregate'],
 ['u'],
 ['block', 'size'],
 ['shared'],
 ['cache', 'dir'],
 ['Timeseries', 'Generator'],
 ['download'],
 ['b', 'carry'],
 ['inbound', 'names'],
 ['slice', 'row'],
 ['sparse', 'tensor', 'to', 'dense'],
 ['subset'],
 ['open'],
 ['max', 'pool3d'],
 ['sha256'],
 ['transp

In [53]:
from sklearn.preprocessing import LabelEncoder

In [54]:
lencoder = LabelEncoder()

In [55]:
lencoder.fit_transform(unique_vocab)

array([  45,   27, 1107, ...,  119, 1224, 1331])

In [56]:
max_len = max([len(v) for v in split_vocab])
max_len

7

In [57]:
data = np.zeros((len(s_vocab), max_len), dtype=int)
data.shape

(2505, 7)

In [58]:
for i, sv in enumerate(s_vocab):
    for j,s in enumerate(sv):
        data[i][j] = lencoder.transform([s])
    while j < max_len:
        data[i][j] = unique_vocab.index('[END]')
        j+=1

In [59]:
import sys
!{sys.executable} -m pip install gensim



In [60]:
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [61]:
sentences = s_vocab

In [62]:
# train word2vec on the two sentences
model = gensim.models.Word2Vec(sentences, window=1, size=64, min_count=0)

2018-12-19 17:53:58,529 : INFO : collecting all words and their counts
2018-12-19 17:53:58,531 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-12-19 17:53:58,538 : INFO : collected 1413 word types from a corpus of 4607 raw words and 2505 sentences
2018-12-19 17:53:58,539 : INFO : Loading a fresh vocabulary
2018-12-19 17:53:58,548 : INFO : effective_min_count=0 retains 1413 unique words (100% of original 1413, drops 0)
2018-12-19 17:53:58,549 : INFO : effective_min_count=0 leaves 4607 word corpus (100% of original 4607, drops 0)
2018-12-19 17:53:58,557 : INFO : deleting the raw counts dictionary of 1413 items
2018-12-19 17:53:58,559 : INFO : sample=0.001 downsamples 62 most-common words
2018-12-19 17:53:58,560 : INFO : downsampling leaves estimated 4086 word corpus (88.7% of prior 4607)
2018-12-19 17:53:58,565 : INFO : estimated required memory for 1413 words and 64 dimensions: 1429956 bytes
2018-12-19 17:53:58,567 : INFO : resetting layer weights
2018-12

In [63]:
model.similarity('y', 'train')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


-0.014980327

In [64]:
model.save("token2vec.model")

2018-12-19 17:54:01,364 : INFO : saving Word2Vec object under token2vec.model, separately None
2018-12-19 17:54:01,365 : INFO : not storing attribute vectors_norm
2018-12-19 17:54:01,367 : INFO : not storing attribute cum_table
2018-12-19 17:54:01,444 : INFO : saved token2vec.model


In [79]:
varname = "x_trainNet"

In [80]:
subtoks = flatten([camel_case_split(v) for v in varname.split('_')])

In [83]:
np.mean([model[sk] for sk in subtoks], axis=0).shape

  """Entry point for launching an IPython kernel.


(64,)