In [1]:
nodes_path = 'data_preprocess/data/pypi_nodes.csv'
lang_path = 'data_preprocess/data/pypi_nodes_lang.csv'
# Where to save the vectors we generate
data_dir = 'vector_generation/data'

## Generate TF-IDF Vectors

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch.utils.tensorboard import SummaryWriter

In [3]:
# Get nodes/document names
nodes_names = pd.read_csv(nodes_path, na_filter=False)['nodes'].values

# Get language data
node_lang_df = pd.read_csv(lang_path, na_filter=False)
lang_data = node_lang_df['language'].values

In [4]:
# See: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
vectorizer = TfidfVectorizer(
    input='content',
    max_df=0.5,
    max_features=256,
    min_df=2,
    stop_words='english',
    use_idf=True,
)

X = vectorizer.fit_transform(lang_data)

In [5]:
# Analysis taken from
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df


def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)


def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [6]:
features = vectorizer.get_feature_names()

print('Top mean features across ALL documents')
print(top_mean_feats(X, features))

# Save to tensorboard
# Writer will output to ./runs/ directory by default
assert len(X.toarray()) == len(nodes_names)

writer = SummaryWriter(f'{data_dir}/tfidf')
print('Saving Embeddings...')
writer.add_embedding(
    X.toarray(),
    metadata=nodes_names,
    tag='TFIDF'
)
writer.close()
print('Saved!')

Top mean features across ALL documents
          feature     tfidf
0           image  0.074913
1          target  0.053084
2          django  0.046839
3         install  0.041536
4             api  0.040747
5             alt  0.040086
6   documentation  0.033638
7             add  0.032782
8            code  0.030553
9          module  0.027012
10           file  0.026988
11           pypi  0.025442
12      interface  0.024354
13        package  0.024143
14        license  0.024042
15         plugin  0.023913
16         travis  0.023182
17          datum  0.022962
18            run  0.022431
19         github  0.021816
20           test  0.020548
21        example  0.019522
22        library  0.019413
23            pip  0.017485
24            oca  0.017446
Saving Embeddings...
Saved!


## Generate Doc2Vec Vectors

In [8]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [10]:
# Get nodes/document names
nodes_names = pd.read_csv(nodes_path, na_filter=False)['nodes'].values

# Get language data
node_lang_df = pd.read_csv(lang_path, na_filter=False)
lang_data = node_lang_df['language'].values

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(lang_data)]

In [12]:
max_epochs = 250

model = Doc2Vec(
    vector_size=256,
    alpha=0.025,
    min_alpha=0.00025,
    min_count=1,
    dm=1,
    max_epochs=max_epochs
)

In [13]:
model.build_vocab(documents)

In [14]:
print('Beginning Training...')
for epoch in range(max_epochs):
    print(f'iteration {epoch+1}/{max_epochs}')
    model.train(documents,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

Beginning Training...
iteration 1/250


  


iteration 2/250
iteration 3/250
iteration 4/250
iteration 5/250
iteration 6/250
iteration 7/250
iteration 8/250
iteration 9/250
iteration 10/250
iteration 11/250
iteration 12/250
iteration 13/250
iteration 14/250
iteration 15/250
iteration 16/250
iteration 17/250
iteration 18/250
iteration 19/250
iteration 20/250
iteration 21/250
iteration 22/250
iteration 23/250
iteration 24/250
iteration 25/250
iteration 26/250
iteration 27/250
iteration 28/250
iteration 29/250
iteration 30/250
iteration 31/250
iteration 32/250
iteration 33/250
iteration 34/250
iteration 35/250
iteration 36/250
iteration 37/250
iteration 38/250
iteration 39/250
iteration 40/250
iteration 41/250
iteration 42/250
iteration 43/250
iteration 44/250
iteration 45/250
iteration 46/250
iteration 47/250
iteration 48/250
iteration 49/250
iteration 50/250
iteration 51/250
iteration 52/250
iteration 53/250
iteration 54/250
iteration 55/250
iteration 56/250
iteration 57/250
iteration 58/250
iteration 59/250
iteration 60/250
itera

In [15]:
# Building Vector Space
vector_space = []
for i in range(len(documents)):
    # 1 indexed
    vector_space.append(model.docvecs[i])

assert len(vector_space) == len(nodes_names)

In [17]:
writer = SummaryWriter(f'{data_dir}/doc2vec')
print('Saving Embeddings...')
writer.add_embedding(
    np.array(vector_space),
    metadata=nodes_names,
    tag='Doc2Vec'
)
writer.close()
print('Saved!')

Saving Embeddings...
Saved!
