In [1]:
# Output root: each section below writes its embeddings to a sub-directory here.
data_dir = 'vector_generation/data'

# Inputs: CSV with the node (document) names, and CSV with the text per node.
lang_path = 'data_preprocess/data/pypi_nodes_lang.csv'
nodes_path = 'data_preprocess/data/pypi_nodes.csv'

## Generate TF-IDF Vectors

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch.utils.tensorboard import SummaryWriter

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Document labels: the 'nodes' column of the nodes CSV.
# na_filter=False turns off pandas' NA detection, so empty cells stay as ''.
nodes_names = pd.read_csv(nodes_path, na_filter=False).loc[:, 'nodes'].values

# Raw text per document: the 'language' column of the language CSV.
node_lang_df = pd.read_csv(lang_path, na_filter=False)
lang_data = node_lang_df.loc[:, 'language'].values

In [6]:
# Parameter reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf_params = dict(
    input='content',       # items passed to fit_transform are the documents themselves
    max_df=0.5,            # drop terms present in more than half of the documents
    max_features=256,      # cap the vocabulary (and vector width) at 256 terms
    min_df=2,              # drop terms present in fewer than two documents
    stop_words='english',
    use_idf=True,
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Sparse matrix with one row per document and one column per kept term.
X = vectorizer.fit_transform(lang_data)

In [7]:
# scikit-learn >= 1.0 provides get_feature_names_out(); the older
# get_feature_names() was removed in 1.2. Prefer the new accessor and fall back
# to the old one so the cell runs on either side of that change.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() yields plain Python strings, matching the old return type.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['00', '10', '2017', '2018', 'access', 'account', 'add', 'address', 'allow', 'alt', 'api', 'app', 'application', 'argument', 'attribute', 'author', 'automatically', 'available', 'badge', 'base', 'bash', 'bin', 'block', 'branch', 'bug', 'build', 'cache', 'case', 'change', 'check', 'class', 'client', 'clone', 'code', 'column', 'command', 'config', 'configuration', 'configure', 'connection', 'contain', 'content', 'context', 'copy', 'coverage', 'create', 'current', 'custom', 'data', 'database', 'date', 'datum', 'def', 'default', 'define', 'delete', 'dependency', 'description', 'dev', 'development', 'different', 'directory', 'display', 'django', 'docker', 'docs', 'document', 'documentation', 'download', 'easy', 'email', 'enable', 'end', 'environment', 'error', 'event', 'example', 'execute', 'exist', 'extension', 'false', 'feature', 'field', 'file', 'filter', 'fix', 'folder', 'follow', 'foo', 'form', 'format', 'function', 'generate', 'git', 'github', 'group', 'help', 'host', 'html', 'http', 

In [5]:
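# Analysis helpers taken from an external source (the URL is missing from this comment; possibly Thomas Buhrmann's "Analyzing tf-idf results in scikit-learn" post - TODO confirm and add the link)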
def top_tfidf_feats(row, features, top_n=25):
    '''Return the highest tf-idf values in `row` with their feature names.

    Args:
        row: 1-D array-like of tf-idf scores, one per feature.
        features: sequence of feature names, indexed in parallel with `row`.
        top_n: maximum number of entries to return.

    Returns:
        DataFrame with columns 'feature' and 'tfidf', ordered from the highest
        score to the lowest. When nothing is selected (empty `row` or
        `top_n=0`) the frame is empty but still has both columns.
    '''
    # argsort is ascending, so reverse it before taking the first top_n indices.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning df.columns afterwards
    # raises ValueError when top_feats is empty (zero columns vs. two names).
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])


def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    '''Rank the tf-idf features of one document.

    Args:
        Xtr: matrix whose rows are documents; `Xtr[row_id]` must offer `.toarray()`.
        features: feature names, one per column of `Xtr`.
        row_id: index of the document (row) to inspect.
        top_n: maximum number of features to return.

    Returns:
        Whatever `top_tfidf_feats` returns for the selected row.
    '''
    dense_row = Xtr[row_id].toarray()
    # Drop the length-1 axes so the scores can be ranked as a flat vector.
    scores = np.squeeze(dense_row)
    return top_tfidf_feats(scores, features, top_n)


def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    '''Return the top n features that are, on average, most important among the
    documents (rows) identified by the indices in `grp_ids`.

    Args:
        Xtr: matrix whose rows are documents; must support row indexing and
            `.toarray()`.
        features: feature names, one per column of `Xtr`.
        grp_ids: row indices to average over (list or numpy array). `None` or
            an empty collection means "all rows".
        min_tfidf: scores below this threshold are counted as 0 in the mean.
        top_n: maximum number of features to return.

    Returns:
        Whatever `top_tfidf_feats` returns for the per-feature mean scores.
    '''
    # Test the selection explicitly: a bare `if grp_ids:` raises ValueError for
    # a multi-element numpy index array and evaluates np.array([0]) as False.
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # Zero out weak scores so they do not contribute to the average.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [6]:
# scikit-learn >= 1.0 provides get_feature_names_out(); the older
# get_feature_names() was removed in 1.2. Prefer the new accessor and fall back
# to the old one so the cell runs on either side of that change.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() yields plain Python strings, matching the old return type.
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

print('Top mean features across ALL documents')
print(top_mean_feats(X, features))

# Save to tensorboard
# The log directory is passed explicitly, so the embeddings land under
# {data_dir}/tfidf (SummaryWriter only defaults to ./runs/ when none is given).
# Every row needs exactly one metadata label; check on the sparse matrix so the
# dense copy is built only once, for the export itself.
assert X.shape[0] == len(nodes_names)

writer = SummaryWriter(f'{data_dir}/tfidf')
try:
    print('Saving Embeddings...')
    writer.add_embedding(
        X.toarray(),
        metadata=nodes_names,
        tag='TFIDF'
    )
finally:
    # Close the writer even if add_embedding fails.
    writer.close()
print('Saved!')

Top mean features across ALL documents
          feature     tfidf
0           image  0.074913
1          target  0.053084
2          django  0.046839
3         install  0.041536
4             api  0.040747
5             alt  0.040086
6   documentation  0.033638
7             add  0.032782
8            code  0.030553
9          module  0.027012
10           file  0.026988
11           pypi  0.025442
12      interface  0.024354
13        package  0.024143
14        license  0.024042
15         plugin  0.023913
16         travis  0.023182
17          datum  0.022962
18            run  0.022431
19         github  0.021816
20           test  0.020548
21        example  0.019522
22        library  0.019413
23            pip  0.017485
24            oca  0.017446
Saving Embeddings...
Saved!


## Generate Doc2Vec Vectors

In [12]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [13]:
# Document labels: the 'nodes' column of the nodes CSV.
# na_filter=False turns off pandas' NA detection, so empty cells stay as ''.
nodes_names = pd.read_csv(nodes_path, na_filter=False).loc[:, 'nodes'].values

# Raw text per document: the 'language' column of the language CSV.
node_lang_df = pd.read_csv(lang_path, na_filter=False)
lang_data = node_lang_df.loc[:, 'language'].values

# One TaggedDocument per row: whitespace-split tokens, tagged with the row's
# position so each learned vector can be looked up by that integer later.
documents = [
    TaggedDocument(words=text.split(), tags=[doc_id])
    for doc_id, text in enumerate(lang_data)
]

In [14]:
# Doc2Vec hyper-parameters, gathered in one place so they are easy to review.
doc2vec_params = dict(
    dm=1,
    dm_concat=1,
    vector_size=128,   # dimensionality of each document vector
    window=5,
    negative=5,
    hs=0,
    min_count=2,
    epochs=50,
    workers=4,         # training threads
)
model = Doc2Vec(**doc2vec_params)

In [15]:
# Build the model's vocabulary from the tagged documents; run this before training.
model.build_vocab(documents)

In [17]:
# Train on every tagged document for the epoch count configured on the model.
# %time reports the cost; the recorded run below took about 1h 26min wall time.
%time model.train(documents, total_examples=len(documents), epochs=model.epochs)

CPU times: user 4h 45min 22s, sys: 9min 1s, total: 4h 54min 23s
Wall time: 1h 26min 15s


In [18]:
# Collect the learned vector of every document, in corpus order.
# Tags were assigned with enumerate(), so they run from 0 to len(documents) - 1.
vector_space = [model.docvecs[doc_id] for doc_id in range(len(documents))]

# Each vector needs a matching node name for the export metadata.
assert len(vector_space) == len(nodes_names)

In [19]:
# Export the Doc2Vec vectors to TensorBoard under {data_dir}/doc2vec,
# labelling each vector with its node name.
writer = SummaryWriter(f'{data_dir}/doc2vec')
try:
    print('Saving Embeddings...')
    writer.add_embedding(
        np.array(vector_space),
        metadata=nodes_names,
        tag='Doc2Vec_0'
    )
finally:
    # Close the writer even if add_embedding fails.
    writer.close()
print('Saved!')

Saving Embeddings...
Saved!


## Generate BERT Vectors

*Using bert as a service*

In [14]:
from bert_serving.client import BertClient
# Client handle used below to encode the documents.
# NOTE(review): presumably this needs an already-running bert-serving server
# reachable with the client's default settings - confirm before running.
bc = BertClient()

TypeError: __init__() got an unexpected keyword argument 'max_seq_len'

In [6]:
# Document labels: the 'nodes' column of the nodes CSV.
# na_filter=False turns off pandas' NA detection, so empty cells stay as ''.
nodes_names = pd.read_csv(nodes_path, na_filter=False).loc[:, 'nodes'].values

# Raw text per document: the 'language' column of the language CSV.
node_lang_df = pd.read_csv(lang_path, na_filter=False)
lang_data = node_lang_df.loc[:, 'language'].values

In [8]:
# Sanity check: number of documents that will be sent for encoding.
lang_data.shape

(199234,)

In [13]:
# Replace blank documents with the placeholder 'None' before encoding
# (presumably because the client rejects empty strings - confirm).
# The check must be explicit: max(_s, 'None') compares strings
# lexicographically, so it also overwrote every non-empty document sorting
# before 'None' (for example text starting with a digit or with 'A'-'M').
lang_data = [_s if _s.strip() else 'None' for _s in lang_data]
vectors = bc.encode(lang_data)

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


KeyboardInterrupt: 