## Импорты

In [35]:
%matplotlib inline
import matplotlib.pyplot as plt

import os
from tqdm import tqdm_notebook, trange, tqdm
import pickle
import numpy as np
from IPython.display import clear_output

from collections import defaultdict

import keyword
import tokenize
from io import BytesIO
import time

In [39]:
time.time()

1558273332.773469

In [2]:
base_path = '/Users/dentarasov/Yandex.Disk.localized/current/vkr'
filenames_path = base_path + '/pycodesuggest_py_repos_normalised/train_files.txt'
data_path = base_path + '/pycodesuggest_py_repos_normalised/'

with open(filenames_path, 'r') as f:
    filenames = f.read().split('\n')


In [3]:
filenames[:2]

['tldr-pages/tldr/scripts/send_to_bot.py',
 'tldr-pages/tldr/scripts/pdf/render.py']

In [4]:
def get_keywords_from_file(filename):
    try:
        with open(filename, 'r') as f:
            code = f.read()
    except IsADirectoryError:
        return []
    keywords_list = []
    g = tokenize.tokenize(BytesIO(code.encode("utf-8")).readline)
    for toktype, tokval, start, _, _ in g:
        if toktype == tokenize.NAME and keyword.iskeyword(tokval):
            keywords_list.append(tok2id[tokval])
    return keywords_list


In [15]:
tok2id = dict(zip(keyword.kwlist, range(len(keyword.kwlist))))
tok2id['BLANK'] = len(keyword.kwlist)
id2tok = dict(list(map(lambda x: x[::-1], list(tok2id.items()))))

tok2id

{'False': 0,
 'None': 1,
 'True': 2,
 'and': 3,
 'as': 4,
 'assert': 5,
 'async': 6,
 'await': 7,
 'break': 8,
 'class': 9,
 'continue': 10,
 'def': 11,
 'del': 12,
 'elif': 13,
 'else': 14,
 'except': 15,
 'finally': 16,
 'for': 17,
 'from': 18,
 'global': 19,
 'if': 20,
 'import': 21,
 'in': 22,
 'is': 23,
 'lambda': 24,
 'nonlocal': 25,
 'not': 26,
 'or': 27,
 'pass': 28,
 'raise': 29,
 'return': 30,
 'try': 31,
 'while': 32,
 'with': 33,
 'yield': 34,
 'BLANK': 35}

In [6]:
train = []
for filename in tqdm(filenames):
    train += [get_keywords_from_file(data_path + filename)]

100%|██████████| 51889/51889 [06:53<00:00, 125.58it/s]  


In [7]:
test_filenames_path = base_path + '/pycodesuggest_py_repos_normalised/test_files.txt'

with open(test_filenames_path, 'r') as f:
    test_filenames = f.read().split('\n')



In [8]:
test = []
for filename in tqdm(test_filenames):
    test += [get_keywords_from_file(data_path + filename)]

100%|██████████| 39791/39791 [04:33<00:00, 145.69it/s]


In [16]:
d = {
    'train': train,
    'test': test,
    'tok2id': tok2id,
    'id2tok': id2tok
}

In [17]:
with open(base_path + '/keywords_dataset.pickle', 'wb') as f:
    pickle.dump(d, f)

In [18]:
with open(base_path + '/keywords_dataset.pickle', 'rb') as f:
    d = pickle.load(f)

In [52]:
class Ngram():
    def __init__(self, N, tok2id):
        self.N = N
        self.tok2id = tok2id
        self.ngrams_dict = defaultdict(int)
    
    def process_train(self, train):
        start_time = time.time()
        for seq in train:
            seq = [self.tok2id['BLANK'] for _ in range(self.N)] + seq
            for i in range(len(seq) - self.N):
                self.ngrams_dict[tuple(seq[i:i+self.N])] += 1
#         print(seq, self.ngrams_dict)
        self.filter_most_popular_ngrams()
        print('Time spent {}s'.format(time.time() - start_time))
    
    def filter_most_popular_ngrams(self):
        # ngram_list: [[(1, 231, 12), 5], ...]
        ngrams_list = list(self.ngrams_dict.items())
        def sort_fun(ngram):
            return ngram[0][:self.N-1], ngram[1]
        ngrams_list = sorted(ngrams_list, key=sort_fun, reverse=True)
        prev = None
        self.new_ngrams_dict = {}
        for ngram in tqdm(ngrams_list):
            cur = ngram[0][:self.N-1]
            if prev != cur:
                self.new_ngrams_dict[tuple(cur)] = ngram[0][self.N-1]
            prev = cur
    
    def calc_accuracy(self, test):
        tp = 0
        count = 0
        for seq in tqdm(test):
            for i in range(len(seq)-self.N):
                # to avoid considering empty tokens
                # and non-identifiers
                key = tuple(seq[i:i+self.N-1])
                if key in self.new_ngrams_dict \
                    and seq[i+self.N-1] == self.new_ngrams_dict[key]:
                    tp += 1
                count += 1
        return tp / count


In [57]:
ngram = Ngram(2, d['tok2id'])
ngram.process_train(d['train'])
accuracy = ngram.calc_accuracy(d['test'])
print('''Accuracy: {:.4f}'''.format(accuracy))
# print('''Accuracy: {:.4f}
# Time needed for one prediction is just accessing the dict element,
# so it\'s around ~12 sec / ~96000 = {:.7f} (info from tqdm counter)'''\
# .format(accuracy, 12. / 96000))

100%|██████████| 1077/1077 [00:00<00:00, 916652.88it/s]
  3%|▎         | 1189/39791 [00:00<00:03, 11833.81it/s]

Time spent 2.4231371879577637s


100%|██████████| 39791/39791 [00:02<00:00, 19552.17it/s]

Accuracy: 0.3854





In [55]:
ngram = Ngram(3, d['tok2id'])
ngram.process_train(d['train'])
accuracy = ngram.calc_accuracy(d['test'])
print('''Accuracy: {:.4f}'''.format(accuracy))
# print('''Accuracy: {:.4f}
# Time needed for one prediction is just accessing the dict element,
# so it\'s around ~12 sec / ~96000 = {:.7f} (info from tqdm counter)'''\
# .format(accuracy, 12. / 96000))

100%|██████████| 16905/16905 [00:00<00:00, 1241828.98it/s]
  3%|▎         | 1337/39791 [00:00<00:02, 13366.28it/s]

Time spent 2.4435698986053467s


100%|██████████| 39791/39791 [00:02<00:00, 18969.09it/s]

Accuracy: 0.4486





In [56]:
ngram = Ngram(4, d['tok2id'])
ngram.process_train(d['train'])
accuracy = ngram.calc_accuracy(d['test'])
print('''Accuracy: {:.4f}'''.format(accuracy))
# print('''Accuracy: {:.4f}
# Time needed for one prediction is just accessing the dict element,
# so it\'s around ~12 sec / ~96000 = {:.7f} (info from tqdm counter)'''\
# .format(accuracy, 12. / 96000))

100%|██████████| 106273/106273 [00:00<00:00, 946852.58it/s]
  3%|▎         | 1152/39791 [00:00<00:03, 11519.10it/s]

Time spent 3.037687063217163s


100%|██████████| 39791/39791 [00:02<00:00, 16595.04it/s]

Accuracy: 0.4706





In [5]:
# N = 3

In [6]:
# ngrams_dict = defaultdict(int)

In [7]:
# for seq in tqdm_notebook(train):
#     for i in range(len(seq) - N):
#         ngrams_dict[tuple(seq[i:i+N])] += 1

## Что стоит сделать
* Убрать `<PAD>`

In [8]:
# def filter_most_popular_ngrams(ngrams_dict):
#     # ngram_list: [[(1, 231, 12), 5], ...]
#     ngrams_list = list(ngrams_dict.items())
#     def sort_fun(ngram):
#         return ngram[0][:2], ngram[1]
#     ngrams_list = sorted(ngrams_list, key=sort_fun, reverse=True)
#     prev = None
#     new_ngrams_dict = {}
#     for ngram in tqdm(ngrams_list):
#         cur = ngram[0][:2]
#         if prev != cur:
#             new_ngrams_dict[tuple(cur)] = ngram[0][2]
#         prev = cur
#     return new_ngrams_dict

In [9]:
# new_ngrams_dict = filter_most_popular_ngrams(ngrams_dict)

In [10]:
# with open('./ngrams_dict.pickle', 'wb') as f:
#     pickle.dump(new_ngrams_dict, f)

with open('./ngrams_dict.pickle', 'rb') as f:
    new_ngrams_dict = pickle.load(f)

In [11]:
list(new_ngrams_dict.items())[:10]

[((79583, 14), 3),
 ((79583, 9), 2),
 ((79583, 5), 14),
 ((79583, 3), 2),
 ((79583, 2), 2),
 ((79581, 4), 1),
 ((79581, 3), 42671),
 ((79581, 2), 15),
 ((79580, 14), 7),
 ((79580, 9), 13)]

In [12]:
def build_vocab():
    # load pre-computed vocab
    with open(base_path + '/mapping.map', 'rb') as f:
        word_to_id = pickle.load(f)
    id_to_word = dict([(v, k) for (k, v) in word_to_id.items()])
    return word_to_id, id_to_word

word_to_id, id_to_word = build_vocab()


In [13]:
def calc_quality(ngrams_dict):
    tp = 0
    count = 0
    for seq in tqdm(test):
        for i in range(len(seq)-N):
            # to avoid considering empty tokens
            # and non-identifiers
            if seq[i+1] != 0 and seq[i+2] != 0 and keyword.iskeyword(id_to_word[seq[i+2]]):
                if (seq[i], seq[i+1]) in ngrams_dict \
                        and seq[i+2] == ngrams_dict[(seq[i], seq[i+1])]:
                    tp += 1
                count += 1
    return tp / count
        

In [14]:
accuracy = calc_quality(new_ngrams_dict)
print('''Accuracy: {:.4f}
Time needed for one prediction is just accessing the dict element, so it\'s around ~12 sec / ~96000 = {:.7f} (info from tqdm counter)'''\
.format(accuracy, 12. / 96000))


100%|██████████| 95687/95687 [00:09<00:00, 9885.78it/s] 

Accuracy: 0.3511
Time needed for one prediction is just accessing the dict element, so it's around ~12 sec / ~96000 = 0.0001250 (info from tqdm counter)





In [43]:
test.shape[0] * (test.shape[1] - N)

9281639