## Импорты

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import os
from tqdm import tqdm_notebook, trange, tqdm
import pickle
import numpy as np
from IPython.display import clear_output
import time
import string

# Предобработка

In [5]:
# from google.colab import drive
# drive.mount('/content/drive')

In [6]:
# base_path = '/content/drive/My Drive/vkr'
base_path = '.'

- брать код из обычных репозиториев, а разбивку — из нормализованных
- сначала делаем выборку максимально без изворотов:
    - каждый файл нарезаем на части по n символов в каждом, если в последнем куске не n — выбрасываем
    - сохраняем выборку в файл pickle, пример: `[["import pickl",1], ...]` (потому что дальше идет "е", вероятно)
- преобразовать в удобный для модели формат:
    - вместо строк — массив чисел, где каждое число биективно соответствует символу
    
- модель учится предсказывать n+1 символ, если он eng letter, то это 1, иначе 0

In [17]:
class PreprocessDataset():
    def __init__(
        self,
        n,
        path_to_filenames,
        path_to_repo,
        objects_limit,
        check_if_english=True
    ):
        self.n = n
        self.total_count = 0
        self.pos_count = 0
        with open(path_to_filenames, 'r') as f:
            self.filenames = f.read().split('\n')
        self.path_to_repo = path_to_repo
        self.dataset_dictionary = set()
        self.objects_limit = objects_limit
        self.check_if_english = check_if_english
        self.allowed_letters = set(string.printable)

    def update_dataset_dictionary(self, code):
        self.dataset_dictionary.update(set(code))
        
    def is_english(self, code):
        for char in code:
            if char not in self.allowed_letters:
                return False
        return True

    def process_code(self, code):
        self.update_dataset_dictionary(code)
        chunks, chunk_size = len(code), self.n
        slices = [code[i:i+chunk_size] for i in range(0, chunks, chunk_size)]
        self.total_count += len(slices) - 1
        targets = []
        for i in range(len(slices) - 1):
            if slices[i+1][0].isalpha():
                self.pos_count += 1
                targets.append(True)
            else:
                targets.append(False)
        slices = slices[:-1]
        objects_from_code = list(zip(slices, targets))
        return objects_from_code
        
    def make_dataset(self):
        fidx = 0
        pbar = tqdm(total=self.objects_limit)
        objects_from_code = []
        while self.total_count < self.objects_limit:
            with open(
                self.path_to_repo + self.filenames[fidx],
                'r'
            ) as code_file:
                code = code_file.read()
            fidx += 1
            if self.check_if_english and not self.is_english(code):
                continue
            objects_from_code += self.process_code(code)
            pbar.update(self.total_count)
        pbar.close()
#         print(objects_from_code)

        print('Number of positives samples is {} out of {} ({:.2f}%)'.format(
            self.pos_count,
            self.total_count,
            100. * self.pos_count / self.total_count
        ))
        
#         print(len(self.dataset_dictionary), self.dataset_dictionary)
        
        return objects_from_code


In [21]:
prep_dataset = PreprocessDataset(
    20,
    base_path + '/pycodesuggest_py_repos_normalised/train_files.txt',
    base_path + '/pycodesuggest_py_repos_normalised/',
    objects_limit=5000000
)
objects_from_code = prep_dataset.make_dataset()


31566657837it [00:25, 1221527424.58it/s]                      

Number of positives samples is 2565009 out of 5000289 (51.30%)





In [22]:
objects_from_code[:10]

[('import json\nimport o', True),
 ('s\nimport sys\ntry:\n  ', False),
 ('  import urllib.requ', True),
 ('est as urllib2\nexcep', True),
 ('t ImportError:\n    i', True),
 ('mport urllib2\nvar266', False),
 ("5 = 'https://tldr-bo", True),
 ('t.starbeamrainbowlab', True),
 ("s.com/'\n\n\ndef functi", True),
 ('on2743(arg1358, arg5', False)]

In [26]:
prep_dataset.dataset_dictionary

{'\t',
 '\n',
 '\x0c',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~'}

In [23]:
with open(base_path + '/objects_from_code.pickle', 'wb') as f:
    pickle.dump(objects_from_code, f)


In [29]:
tokens = prep_dataset.dataset_dictionary
tok2id = dict(zip(tokens, range(len(tokens))))
id2tok = dict(list(map(lambda x: x[::-1], list(tok2id.items()))))

In [32]:
tok2id

{'p': 0,
 ' ': 1,
 't': 2,
 '=': 3,
 '6': 4,
 'X': 5,
 'S': 6,
 '+': 7,
 'x': 8,
 's': 9,
 'n': 10,
 '8': 11,
 '`': 12,
 '\\': 13,
 'a': 14,
 'I': 15,
 'l': 16,
 'G': 17,
 '#': 18,
 'q': 19,
 'M': 20,
 '/': 21,
 'D': 22,
 'z': 23,
 'c': 24,
 'R': 25,
 'C': 26,
 'v': 27,
 '.': 28,
 ',': 29,
 'j': 30,
 'L': 31,
 'N': 32,
 ';': 33,
 ')': 34,
 'K': 35,
 'T': 36,
 '\x0c': 37,
 '?': 38,
 'J': 39,
 '9': 40,
 'O': 41,
 '"': 42,
 'A': 43,
 '@': 44,
 '\t': 45,
 '_': 46,
 '}': 47,
 'i': 48,
 'u': 49,
 '4': 50,
 '*': 51,
 'k': 52,
 'y': 53,
 'F': 54,
 'E': 55,
 'g': 56,
 "'": 57,
 '&': 58,
 '-': 59,
 'V': 60,
 '3': 61,
 'W': 62,
 '2': 63,
 'o': 64,
 '\n': 65,
 '7': 66,
 '0': 67,
 '<': 68,
 '5': 69,
 'Y': 70,
 'r': 71,
 '[': 72,
 '1': 73,
 '(': 74,
 '|': 75,
 'm': 76,
 '>': 77,
 '^': 78,
 'h': 79,
 'e': 80,
 '$': 81,
 'd': 82,
 'f': 83,
 '{': 84,
 'U': 85,
 'Q': 86,
 'w': 87,
 'H': 88,
 '~': 89,
 'b': 90,
 '%': 91,
 '!': 92,
 'P': 93,
 'B': 94,
 ']': 95,
 ':': 96,
 'Z': 97}

In [39]:
def convert_for_model(code_samples):
    max_len = len(code_samples[0][0])
    converted_code = np.zeros([len(code_samples), max_len], np.int64)

    for i in range(len(code_samples)):
        obj = list(map(tok2id.get, code_samples[i][0]))
        converted_code[i, :] = obj
    targets = np.array(list(map(lambda pair: pair[1], code_samples)), np.bool)

    return converted_code, targets

In [40]:
# %%time
print(objects_from_code[:10])
print(convert_for_model(objects_from_code[:10]))


[('import json\nimport o', True), ('s\nimport sys\ntry:\n  ', False), ('  import urllib.requ', True), ('est as urllib2\nexcep', True), ('t ImportError:\n    i', True), ('mport urllib2\nvar266', False), ("5 = 'https://tldr-bo", True), ('t.starbeamrainbowlab', True), ("s.com/'\n\n\ndef functi", True), ('on2743(arg1358, arg5', False)]
(array([[48, 76,  0, 64, 71,  2,  1, 30,  9, 64, 10, 65, 48, 76,  0, 64,
        71,  2,  1, 64],
       [ 9, 65, 48, 76,  0, 64, 71,  2,  1,  9, 53,  9, 65,  2, 71, 53,
        96, 65,  1,  1],
       [ 1,  1, 48, 76,  0, 64, 71,  2,  1, 49, 71, 16, 16, 48, 90, 28,
        71, 80, 19, 49],
       [80,  9,  2,  1, 14,  9,  1, 49, 71, 16, 16, 48, 90, 63, 65, 80,
         8, 24, 80,  0],
       [ 2,  1, 15, 76,  0, 64, 71,  2, 55, 71, 71, 64, 71, 96, 65,  1,
         1,  1,  1, 48],
       [76,  0, 64, 71,  2,  1, 49, 71, 16, 16, 48, 90, 63, 65, 27, 14,
        71, 63,  4,  4],
       [69,  1,  3,  1, 57, 79,  2,  2,  0,  9, 96, 21, 21,  2, 16, 82,
        71,

In [41]:
%%time
X_train, y_train = convert_for_model(objects_from_code)

CPU times: user 19.6 s, sys: 505 ms, total: 20.1 s
Wall time: 20.5 s
