In [1]:
from subprocess import run, PIPE
from os import mkdir, listdir
from os.path import isdir, exists

import json
from itertools import starmap
from datetime import datetime

from re import sub
from collections import namedtuple

import lzma
from itertools import combinations
from collections import defaultdict

def checkNCD(str_dict, threshold, verbose=False):
    new_dict = dict()
    complen_dict = dict()
    pairs = dict()
    for i in str_dict:
        s = sub(r'\s+', ' ', str_dict[i].lower()).encode()
        new_dict[i] = s
        complen_dict[i] = len(lzma.compress(s))

    all_pairs = list(combinations(new_dict.items(), 2))
    i = 0
    one_percent = len(all_pairs) // 100
    for (i1, s1), (i2, s2) in all_pairs:
        i += 1
        l1 = complen_dict[i1]
        l2 = complen_dict[i2]
        NCDistance = (len(lzma.compress(s1 + s2)) - min(l1, l2)) / max(l1, l2)
        if NCDistance <= threshold:
            pairs[(i1, i2)] = NCDistance
        if verbose and one_percent > 0 and i % one_percent == 0:
            print(str(i / one_percent) + '%')
    return pairs


def extract_function_definition(text, function_name):
    result = ''
    function_started = False
    for line in text.split('\n'):
        if line.startswith('def') and function_name in line:
            function_started = True
        elif function_started:
            if line.startswith(' ') or line.startswith('\t'):
                result += line + '\n'
            else:
                break
    return result

def skip_functions(text, function_names):
    result = ''
    function_started = False
    for line in text.split('\n'):
        if line.startswith('def') and not any(name in line for name in function_names):
            function_started = True
        elif function_started:
            if line.startswith(' ') or line.startswith('\t'):
                result += line + '\n'
            else:
                function_started = False
    return result

In [2]:
learners = '''Белых Евгений		https://github.com/white2302/discrete-optimization-course-homework
Васильев Александр		https://github.com/mizabrik/discrete-optimization-course-homework
Герасимов Артём		https://github.com/gerasartem/discrete-optimization-course-homework
Горелик Александр		https://github.com/alexgorelick/discrete-optimization-course-homework
Гришутин Александр		https://github.com/agrishutin/discrete-optimization-course-homework
Гусарова Дарья		https://github.com/DariaGusarova/discrete-optimization-course-homework
Зуева Надежда		https://github.com/nadezhdazueva/DiscreteOpt
Калиниченко Ольга		https://github.com/madshuttlecock/discrete-optimization-course-homework
Куприянов Артем		https://github.com/ArtemKupriyanov/discrete-optimization-course-homework
Литвинов Станислав		https://github.com/litvinovSA/discrete-optimization-course-homework
Марков Александр		https://github.com/markovalexander/discrete-optimization-course-homework
Мартинсон Михаил		https://github.com/MartinsonMichael/discrete-optimization-course-homework
Муравьев Кирилл		https://github.com/KirillMouraviev/discrete-optimization-course-homework
Мурзин Дмитрий		https://github.com/dima74/discrete-optimization-course-homework
Немычникова Валерия		https://github.com/sooobus/discrete-optimization-course-homework
Нифантова Ирина		https://github.com/NifantovaIrina/discrete-optimization-course-homework
Петров Филипп		https://github.com/yaPhilya/discrete-optimization-course-homework
Проскурин Александр		https://github.com/aleksProsk/discrete-optimization-course-homework
Пыркин Дмитрий		https://github.com/Blacksorld/discrete-optimization-course-homework
Лернер Регина		https://github.com/flagolyub/discrete-optimization-course-homework
Рейдман Павел		https://github.com/preidman/discrete-optimization-course-homework
Ремизова Анастасия		https://github.com/feathernox/discrete-optimization-course-homework
Рязановский Данила		https://github.com/riazanovskiy/discrete-optimization-course-homework
Ткаченко Дмитрий		https://github.com/JerryCh0/discrete-optimization-course-homework
Троцюк Владислав		https://github.com/vladtrotsiuk/discrete-optimization-course-homework
Якушева Софья		https://github.com/stager108/discrete-optimization-course-homework'''

Learner = namedtuple( 'Learner', ['lastname', 'firstname', 'github_username'] )

learners = list(map(
    lambda s: Learner(*s.split()), 
    sub(r'https://[^/]*/([^/]*)/discrete-optimization-course-homework', r'\1', learners.replace('\t\t',' ')).split('\n')
))

main_repo_dir = r'c:\Users\daini\Documents\disco2017-hw-check'

In [3]:
def cloneAllRepos():
    global learners
    print('Going to process {} user repos.'.format(len(learners)))
    
    for learner in learners:
        repo_local_dirname = main_repo_dir + r'\{}'.format(learner.lastname)
        repo_remote_dirname = 'https://github.com/{}/discrete-optimization-course-homework.git'.format(learner.github_username)
        if not isdir(repo_local_dirname):
            print('Directory “{}” doesn’t exist. Creating it.'.format(repo_local_dirname))
            mkdir(repo_local_dirname)
        if listdir(repo_local_dirname) == []:
            print('Executing “git clone {} {}”'.format(repo_remote_dirname, repo_local_dirname), end=' ')
            git_run_result = run([r'C:\Users\daini\AppData\Local\GitHub\PortableGit_f02737a78695063deace08e96d5042710d3e32db\cmd\git.exe', 'clone', repo_remote_dirname, repo_local_dirname], stdout=PIPE, stderr=PIPE)            
        else:
            print('Repo “{}” already exists; trying to update.'.format(repo_local_dirname), end=' ')
            git_run_result = run([r'C:\Users\daini\AppData\Local\GitHub\PortableGit_f02737a78695063deace08e96d5042710d3e32db\cmd\git.exe', 'pull', repo_remote_dirname], stdout=PIPE, stderr=PIPE, cwd=repo_local_dirname)
        if len(git_run_result.stdout) > 0:
                print('\n', git_run_result.stdout.decode("utf-8", "backslashreplace"))
        print('done')
    print('All repositories updated.')

In [3]:
def checkoutToDate(deadline, verbose = False):
    global learners
    if not verbose:
        print('Doing the checkout…', end='')
    for learner in learners:
        if verbose:
            print('Processing {}'.format(learner.lastname))
        repo_local_dirname = main_repo_dir + r'\{}'.format(learner.lastname)
        if not isdir(repo_local_dirname) and verbose:
            print('Directory “{}” doesn’t exist. Skipping.'.format(repo_local_dirname))
            continue
        path = 'git rev-list -n 1 --before="{}" master'.format(deadline)
        if verbose:
            print('Performing checkout of path “{}”…'.format(path))
        git_run_result = run([r'C:\Users\daini\AppData\Local\GitHub\PortableGit_f02737a78695063deace08e96d5042710d3e32db\cmd\git.exe', 'checkout', path], stdout=PIPE, stderr=PIPE, cwd=repo_local_dirname)
        if len(git_run_result.stdout) > 0 and verbose:
            print('\n', git_run_result.stdout.decode("utf-8", "backslashreplace"))
        
        if verbose:
            print('Cleaning the repo…')
        git_run_result = run([r'C:\Users\daini\AppData\Local\GitHub\PortableGit_f02737a78695063deace08e96d5042710d3e32db\cmd\git.exe', 'clean', '-fdxX'], stdout=PIPE, stderr=PIPE, cwd=repo_local_dirname)
        if len(git_run_result.stdout) > 0 and verbose:
            print('\n', git_run_result.stdout.decode("utf-8", "backslashreplace"))
    if not verbose:
        print('done!')

In [5]:
# cloneAllRepos()

In [4]:
def find_plagiarism_in_function(notebook_name, function_name, threshold=0.5):
    all_solutions = defaultdict(lambda: defaultdict(str))
    
    for learner in learners:
        filename = main_repo_dir + r'\{0}\{1}'.format(learner.lastname, notebook_name)
        if not exists(filename):
            continue
        with open( filename, 'r', encoding='utf8' ) as nbfile:
            nb = json.load( nbfile )
            code_cells = []
            for cell in nb['cells']:
                if cell['cell_type'] != 'code':
                    continue
                cell_text = sub(r'#.*', '', ''.join(s for s in cell['source'] if not s.startswith('#')))
                core_text = extract_function_definition(cell_text, function_name)
                if core_text:
                    all_solutions[learner.lastname] = core_text

    no_duplicates = True

    print('Processing…', end='')
    similarPairs = checkNCD(all_solutions, threshold)
    print('done!')

    if len(similarPairs) > 0:
        formatted_output = sorted(
            (similarPairs[pair], '{2}: ({0}, {1})'.format(pair[0], pair[1], round(similarPairs[pair], 2))) 
            for pair in similarPairs
        )
        print('Similar definitions of “{}” in {}:'.format(function_name, notebook_name))
        print('\n'.join(x[1] for x in formatted_output))
    else:
        print('No duplicates')
    return all_solutions

def find_plagiarism_in_code(notebook_name, ignored_function_names, threshold=0.5):
    all_solutions = defaultdict(lambda: defaultdict(str))
    
    for learner in learners:
        filename = main_repo_dir + r'\{0}\{1}'.format(learner.lastname, notebook_name)
        if not exists(filename):
            continue
        with open( filename, 'r', encoding='utf8' ) as nbfile:
            nb = json.load( nbfile )
            code_cells = []
            for cell in nb['cells']:
                if cell['cell_type'] != 'code':
                    continue
                cell_text = sub(r'#.*', '', ''.join(s for s in cell['source'] if not s.startswith('#')))
                core_text = skip_functions(cell_text, ignored_function_names)
                if core_text:
                    all_solutions[learner.lastname] = core_text

    no_duplicates = True

    print('Processing…', end='')
    similarPairs = checkNCD(all_solutions, threshold)
    print('done!')

    if len(similarPairs) > 0:
        formatted_output = sorted(
            (similarPairs[pair], '{2}: ({0}, {1})'.format(pair[0], pair[1], round(similarPairs[pair], 2))) 
            for pair in similarPairs
        )
        print('Similar code in {}:'.format(notebook_name))
        print('\n'.join(x[1] for x in formatted_output))
    else:
        print('No duplicates')
    return all_solutions

In [7]:
# checkoutToDate(datetime(2017,3,19,23,59,59))
# solutions = find_plagiarism_in_function('coding-hometask-1.ipynb', 'solve_bp_search')

# checkoutToDate(datetime(2017,3,27,23,59,59))
solutions = find_plagiarism_in_function('coding-hometask-2-1.ipynb', 'basic_local_search')

# checkoutToDate(datetime(2017,4,10,23,59,59))
solutions = find_plagiarism_in_function('coding-hometask-2-2.ipynb', 'variable_depth_local_search')

# checkoutToDate(datetime(2017,4,2,23,59,59))
solutions = find_plagiarism_in_function('coding-hometask-3-1.ipynb', 'solve_tsp_nearest_neighbour')
solutions = find_plagiarism_in_function('coding-hometask-3-1.ipynb', 'solve_tsp_nearest_insertion')

# checkoutToDate(datetime(2017,4,17,23,59,59))
# solutions = find_plagiarism_in_function('coding-hometask-3-2.ipynb', 'lower_bound_tsp')

# checkoutToDate(datetime(2017,5,8,23,59,59))
solutions = find_plagiarism_in_function('coding-hometask-4-1.ipynb', 'solve_tsp_with_lp')

# checkoutToDate(datetime(2017,5,24,23,59,59))
# solutions = find_plagiarism_in_function('coding-hometask-5-1.ipynb', 'choose_landmarks')
# solutions = find_plagiarism_in_function('coding-hometask-5-1.ipynb', 'precalculate_landmark_distances')
# solutions = find_plagiarism_in_function('coding-hometask-5-1.ipynb', 'a_star_with_landmarks')

# checkoutToDate(datetime(2017,3,27,23,59,59))
# solutions = find_plagiarism_in_code('coding-hometask-2-1.ipynb', ['read_col_file'])

# checkoutToDate(datetime(2017,4,10,23,59,59))
# solutions = find_plagiarism_in_code('coding-hometask-2-2.ipynb', ['read_instance', 'get_quality', 'run_all'])

# checkoutToDate(datetime(2017,4,2,23,59,59))
# solutions = find_plagiarism_in_code('coding-hometask-3-1.ipynb', ['read_tsp_instance', 'euclidean_distance', 'calculate_tour_length', 'run_all'])

# checkoutToDate(datetime(2017,4,17,23,59,59))
# solutions = find_plagiarism_in_code('coding-hometask-3-2.ipynb', ['read_tsp_instance', 'euclidean_distance', 'run_all'])

# checkoutToDate(datetime(2017,5,8,23,59,59))
# solutions = find_plagiarism_in_code('coding-hometask-4-1.ipynb', ['dist15'])

# checkoutToDate(datetime(2017,5,24,23,59,59))
# solutions = find_plagiarism_in_code('coding-hometask-5-1.ipynb', ['read_node_coords', 'read_arcs', 'run_all'])


Processing…done!
Similar definitions of “basic_local_search” in coding-hometask-2-1.ipynb:
0.26: (Муравьев, Троцюк)
0.44: (Марков, Проскурин)
0.44: (Гришутин, Ремизова)
0.46: (Мартинсон, Петров)
0.48: (Гришутин, Мартинсон)
0.48: (Гришутин, Петров)
0.5: (Лернер, Рейдман)
0.5: (Мартинсон, Ремизова)
Processing…done!
Similar definitions of “variable_depth_local_search” in coding-hometask-2-2.ipynb:
0.06: (Горелик, Рязановский)
0.34: (Куприянов, Лернер)
0.35: (Горелик, Мурзин)
0.35: (Мурзин, Рязановский)
0.36: (Куприянов, Марков)
0.38: (Марков, Лернер)
0.4: (Куприянов, Ткаченко)
0.41: (Куприянов, Мартинсон)
0.41: (Лернер, Ткаченко)
0.43: (Мартинсон, Лернер)
0.45: (Марков, Мартинсон)
0.45: (Марков, Ткаченко)
0.47: (Горелик, Мартинсон)
0.47: (Мартинсон, Ткаченко)
0.48: (Мартинсон, Рязановский)
0.49: (Горелик, Куприянов)
0.49: (Куприянов, Рязановский)
0.49: (Горелик, Лернер)
0.49: (Лернер, Рязановский)
Processing…done!
Similar definitions of “solve_tsp_nearest_neighbour” in coding-hometask-3-1

JSONDecodeError: Expecting ':' delimiter: line 26 column 9 (char 1325)