In [12]:
'''
Например, можно извлечь дополнительные признаки из текста программы, такие как 
1) частоты конструкций языка,
2) длину символов в названиях, 
3) степени вложенности циклов и условий. 
К полученным признакам можно применить модель классификации (линейные, деревья или любые другие).
'''

'\nНапример, можно извлечь дополнительные признаки из текста программы, такие как \n1) частоты конструкций языка,\n2) длину символов в названиях, \n3) степени вложенности циклов и условий. \nК полученным признакам можно применить модель классификации (линейные, деревья или любые другие).\n'

In [13]:
import ast
import re
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [14]:
count_vect = CountVectorizer()

In [15]:
def to_vector(text): 
    X_train_counts = count_vect.fit_transform(text)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_tf

In [16]:
class FuncLister(ast.NodeVisitor):
    def __init__(self):
        # длина слов в названиях
        self.stats = {
            'FunctionDef': [],
            'ClassDef': [],
            'AsyncFunctionDef': [],
            'String': [],
            'from': [],
            'import': []
        }

    def visit_FunctionDef(self, node):
        self.stats['FunctionDef'].append(node.name)
        self.generic_visit(node)

    def visit_ClassDef(self, node):
        self.stats['ClassDef'].append(node.name)
        self.generic_visit(node)

    def visit_AsyncFunctionDef(self, node):
        self.stats['AsyncFunctionDef'].append(node.name)
        self.generic_visit(node)

    def visit_Str(self, node):
        self.stats['String'].append(node.s)
    
    def node_visitFrom(self, node): 
        for alias in node.names: 
            self.stats['from'].append(alias.name) 
        self.generic_visit(node) 
    
    def node_visit(self, node): 
        for alias in node.names: 
            self.stats['import'].append(alias.name) 
        self.generic_visit(node) 

    def get_stats(self):
        try: 
            for stat in self.stats:
                self.stats[stat] = to_vector(self.stats[stat])
        except ValueError:
            print(self.stats)
            pass
        return self.stats
        
    def report(self):
        print(self.stats)

In [17]:
def distance(a, b):
    '''Алгоритм Левинштейна'''
    n, m = len(a), len(b)
    if n > m:
        # убедимся что n <= m, чтобы использовать минимум памяти O(min(n, m))
        a, b = b, a
        n, m = m, n

    current_row = range(n + 1)  # 0 ряд - просто восходящая последовательность (одни вставки)
    for i in range(1, m + 1):
        previous_row, current_row = current_row, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete, change = previous_row[j] + 1, current_row[j - 1] + 1, previous_row[j - 1]
            if a[j - 1] != b[i - 1]:
                change += 1
            current_row[j] = min(add, delete, change)

    return current_row[n]

In [18]:
def clear_text(text):
    '''парсим и чистим файл'''
    text = re.sub(r'\s+', r' ', text)  # убираем лишние пробелы
    text = text.lower()
    return text

In [19]:
dir_paths = ['./plagiat/files/', './plagiat/plagiat1/', './plagiat/plagiat2/']
keywords = {  # словарь частот конструкций языка
    'False': 0,
    'class': 0,
    'from': 0,
    'or': 0,
    'None': 0,
    'continue': 0,
    'global': 0,
    'pass': 0,
    'True': 0,
    'def': 0,
    'if': 0,
    'raise': 0,
    'and': 0,
    'del': 0,
    'import': 0,
    'return': 0,
    'as': 0,
    'elif': 0,
    'in': 0,
    'try': 0,
    'assert': 0,
    'else': 0,
    'is': 0,
    'while': 0,
    'async': 0,
    'except': 0,
    'lambda': 0,
    'with': 0,
    'await': 0,
    'finally': 0,
    'nonlocal': 0,
    'yield': 0,
    'break': 0,
    'for': 0,
    'not': 0
}
files = {}
distances = {}  # livinstein


In [20]:
for dir_path in dir_paths:
    for f in listdir(dir_path):
        if isfile(join(dir_path, f)):
            files[dir_path + f] = {
                'X_train_tf': None
            }
            files[dir_path + f] |= keywords

In [21]:
for path in files:
    with open(path, 'r') as file:
        text = file.read()
        c_text = clear_text(text)

        if path.split('/')[-1] != '__init__.py':
            files[path]['X_train_tf'] = to_vector([text])

        for keyword in keywords:
            files[path][keyword] += c_text.count(keyword)

        try:
            tree = ast.parse(text, type_comments=True, mode='exec')
            func_lister = FuncLister()
            func_lister.visit(tree)
            files[path] |= func_lister.get_stats()
        except SyntaxError:
            pass
            #print('SyntaxError, file_name =', path)

{'FunctionDef': <3x3 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>, 'ClassDef': <1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>, 'AsyncFunctionDef': [], 'String': ['Implementation of the Multi-similarity loss with custom scorer.\n\n    For details see original paper:\n    https://openaccess.thecvf.com/content_CVPR_2019/papers/Wang_Multi-Similarity_Loss_With_General_Pair_Weighting_for_Deep_Metric_Learning_CVPR_2019_paper.pdf\n\n    Implementation was largely motivited by:\n    https://github.com/msight-tech/research-ms-loss/blob/master/ret_benchmark/losses/multi_similarity_loss.py\n    ', 'Get default config.', 'threshold', 'margin', 'positive_scale', 'negative_scale', 'mean', 'Embeddings and labels shape mismatch', 'margin', 'margin', 'positive_scale', 'positive_scale', 'threshold', 'negative_scale', 'negative_scale', 'threshold', 'none', 'mean', 'Unknown aggregation

In [22]:
df = pd.DataFrame(files)

df.T

Unnamed: 0,X_train_tf,False,class,from,or,None,continue,global,pass,True,...,finally,nonlocal,yield,break,for,not,FunctionDef,ClassDef,AsyncFunctionDef,String
./plagiat/files/multisim.py,"(0, 0)\t0.029160592175990215\n (0, 1)\t0.02...",0,1,[],48,0,2,0,0,0,...,0,0,0,0,4,0,"(0, 2)\t1.0\n (1, 1)\t1.0\n (2, 0)\t1.0","(0, 0)\t1.0",[],[Implementation of the Multi-similarity loss w...
./plagiat/files/test_stages.py,"(0, 0)\t0.03813157118703993\n (0, 1)\t0.019...",0,7,[],32,0,1,0,0,0,...,0,0,0,1,7,5,"(0, 1)\t1.0\n (1, 0)\t1.0\n (2, 4)\t1.0\n ...","(0, 0)\t1.0\n (1, 1)\t1.0",[],"[cmd, data, name, logger, config, train_root, ..."
./plagiat/files/test_log_transform.py,"(0, 0)\t0.12110096839533452\n (0, 1)\t0.121...",0,0,[],61,0,0,0,0,0,...,0,0,0,0,48,1,"(0, 0)\t1.0\n (1, 1)\t1.0\n (2, 8)\t1.0\n ...",[],[],"[Generate dataset with non-positive target., t..."
./plagiat/files/test_catboost.py,"(0, 0)\t0.060314454884943684\n (0, 1)\t0.02...",0,0,[],66,0,0,0,0,0,...,0,0,0,0,43,4,"(0, 6)\t1.0\n (1, 7)\t1.0\n (2, 0)\t1.0\n ...",[],[],"[1d, target, catboostmodel, regressor_exog, fe..."
./plagiat/files/test_base_feature_selection_transform.py,"(0, 0)\t0.021266970502955673\n (0, 1)\t0.04...",0,0,[],74,0,0,0,0,0,...,0,0,0,0,24,0,"(0, 0)\t1.0\n (1, 1)\t1.0\n (2, 3)\t1.0\n ...",[],[],"[features_to_use, expected_features, all, regr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
./plagiat/plagiat2/vmf.py,"(0, 0)\t0.018020374541359026\n (0, 1)\t0.00...",0,3,[],110,0,0,0,0,0,...,0,0,0,0,14,8,"(0, 10)\t1.0\n (1, 8)\t1.0\n (2, 21)\t1.0\...","(0, 2)\t1.0\n (1, 1)\t1.0\n (2, 0)\t1.0",[],"[separate, norm, default, scl, separate, invli..."
./plagiat/plagiat2/test_sampler.py,"(0, 0)\t0.142313613392964\n (0, 1)\t0.03557...",0,8,[],20,0,0,0,0,0,...,0,0,0,0,9,0,"(0, 1)\t1.0\n (2, 0)\t1.0","(0, 1)\t1.0\n (1, 0)\t1.0",[],"[ ς ş , ͔ǃ Ȳ ț , Ñι΅ª̲ Ȅ  , __m..."
./plagiat/plagiat2/test_file_logger.py,"(0, 0)\t0.03207089087717946\n (0, 1)\t0.016...",0,0,[],89,0,0,0,0,0,...,0,0,0,0,27,0,"(0, 5)\t1.0\n (1, 13)\t1.0\n (2, 7)\t1.0\n...",[],[],"[%Y-%m-%dT%H-%M-%S, Test that LIocalFileLogƜge..."
./plagiat/plagiat2/decorators.py,"(0, 0)\t0.09759000729485333\n (0, 1)\t0.390...",0,1,[],6,0,0,0,0,0,...,0,0,0,0,0,0,"(0, 0)\t1.0\n (1, 1)\t1.0",[],[],[ȴAdʸd loǦgʃͳ˅g͕iɢĲnøg fɫͶ¦˺orȅ methoQʎd oɷfǤǖ...
