In [2]:
import lzma
from itertools import combinations, product
from collections import defaultdict
from tqdm import tqdm
import re
import os
import sys


class keydefaultdict(defaultdict):
    """A ``defaultdict`` whose factory receives the missing key.

    The regular ``defaultdict`` calls its factory with no arguments; this
    variant calls ``default_factory(key)``, stores the result under ``key``
    and returns it. Without a factory a missing key raises ``KeyError``.
    """

    def __missing__(self, key):
        factory = self.default_factory
        if factory is None:
            raise KeyError(key)

        # Compute once, remember the value, and hand it back to the caller.
        value = factory(key)
        self[key] = value
        return value


# Module-level cache that NCD() indexes by the data being compressed.
# It is created without a factory, so every lookup raises KeyError until
# find_plagiarism() rebinds it to keydefaultdict(compressor(quality)).
compressed_length = keydefaultdict()


def compressor(quality):
    """Build a function that returns the compressed size of a byte string.

    Args:
        quality: LZMA preset passed to the LZMA2 filter.

    Returns:
        A one-argument callable; given bytes-like data it returns the length
        of that data compressed as a raw (header-less) LZMA2 stream.
    """
    filter_chain = [{"id": lzma.FILTER_LZMA2, "preset": quality}]

    def compressed_size(s):
        packed = lzma.compress(s, format=lzma.FORMAT_RAW, filters=filter_chain)
        return len(packed)

    return compressed_size


def NCD(x, y, strategy=None):
    """Compression-based distance between ``x`` and ``y``.

    (NCD presumably stands for "normalized compression distance".) All sizes
    are looked up in the module-level ``compressed_length`` mapping.

    Args:
        x, y: Values that can be concatenated with ``+`` and used as keys of
            ``compressed_length`` (byte strings in this notebook).
        strategy: ``'MINMAX'``, ``'FAST'`` or anything else for the default
            symmetric formula.

    Returns:
        With C(.) the compressed size:
        * ``'MINMAX'``: (min(C(xy), C(yx)) - max(C(x), C(y))) / max(C(x), C(y))
        * ``'FAST'``:   (C(yx) - max(C(x), C(y))) / min(C(x), C(y))
        * default:      (C(xy) + C(yx) - C(x) - C(y)) / (C(x) + C(y))
    """
    size_x = compressed_length[x]
    size_y = compressed_length[y]

    if strategy == 'MINMAX':
        best_joint = min(compressed_length[x + y], compressed_length[y + x])
        larger = max(size_x, size_y)
        return (best_joint - larger) / larger

    if strategy == 'FAST':
        # Only one concatenation order is compressed in this mode.
        smaller = min(size_x, size_y)
        larger = max(size_x, size_y)
        return (compressed_length[y + x] - larger) / smaller

    forward = compressed_length[x + y]
    backward = compressed_length[y + x]
    return (forward + backward - size_x - size_y) / (size_x + size_y)

In [4]:
# Captures the text between \begin{solution} and \end{solution}.
# re.DOTALL lets "." cross line breaks, so the environment may span many lines.
# All three ".*" are greedy: when used with match(), the capture starts after
# the last \begin{solution} that still has an \end{solution} behind it and
# runs up to the last \end{solution} in the text.
rsol = re.compile(r'.*\\begin\{solution\}(.*)\\end\{solution\}.*', re.DOTALL)


def get_solution_text(filename):
    """Read a UTF-8 file and return the normalised body of its solution environment.

    The file content is matched against the module-level ``rsol`` pattern.
    The captured text is stripped, lower-cased, has every space character
    removed and is encoded to bytes.

    Args:
        filename: Path of the file to read.

    Returns:
        The normalised solution as ``bytes`` (possibly empty), or ``None``
        when the pattern does not match.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        found = rsol.match(source.read())

    if not found:
        return None

    body = found.group(1).strip().lower()
    return body.replace(' ', '').encode()


def traverse(dirname):
    """Collect solutions from the ``ds2017 ...`` student folders inside ``dirname``.

    Every directory in ``dirname`` whose name starts with ``ds2017`` is
    treated as one student's folder. Its name must consist of exactly four
    whitespace-separated tokens; the last two are used as surname and name.
    Inside the folder, each ``.tex`` file named "<non-digits><digits>.tex" is
    read with ``get_solution_text``; the digits are the problem id. Files
    whose problem id is below 2 are skipped.

    Args:
        dirname: Directory to scan; a trailing path separator is optional.

    Returns:
        Nested defaultdict ``{problem_id: {"Surname Name": solution_bytes}}``.

    Raises:
        ValueError: if a ``ds2017`` folder name does not split into exactly
            four tokens.
    """
    all_solutions = defaultdict(lambda: defaultdict(str))
    rexp = re.compile(r'\D*(\d+)\.tex')

    for item in os.listdir(dirname):
        if not item.startswith('ds2017'):
            continue

        # os.path.join works with or without a trailing separator on dirname
        # and uses the separator of the current platform.
        student_dir = os.path.join(dirname, item)
        if not os.path.isdir(student_dir):
            # A stray file with a matching name cannot hold solutions.
            continue

        _, _, student_surname, student_name = item.split()
        student = f'{student_surname} {student_name}'

        for file in os.listdir(student_dir):
            if not file.endswith('.tex'):
                continue
            x = rexp.match(file)
            if not x:
                continue

            if not file.startswith('solution'):
                # Point out numbered .tex files that do not follow the
                # "solutionNNN.tex" naming; they are still processed below.
                print(student + '/' + file)

            problem_id = int(x.group(1))
            if problem_id < 2:
                continue

            solution_text = get_solution_text(os.path.join(student_dir, file))
            if solution_text:
                all_solutions[problem_id][student] = solution_text
            else:
                # Covers both "pattern not found" (None) and an empty body.
                print('Failed to find solution in ' + student + '/' + file)
    return all_solutions


def traverse_archive(dirname, year=None):
    """Collect archived solutions from the per-student folders inside ``dirname``.

    Every directory in ``dirname`` is treated as one student's folder whose
    name consists of exactly two whitespace-separated tokens (surname and
    name). Inside it, each ``.tex`` file named "<non-digits><digits>.tex" is
    read with ``get_solution_text``; the digits are the problem id. Files
    whose problem id is below 2 are skipped.

    Args:
        dirname: Directory to scan; a trailing path separator is optional.
        year: Optional label; when truthy it is appended to the student's
            display name as `` (year)``.

    Returns:
        Nested defaultdict ``{problem_id: {display_name: solution_bytes}}``.

    Raises:
        ValueError: if a folder name does not split into exactly two tokens.
    """
    old_solutions = defaultdict(lambda: defaultdict(str))
    rexp = re.compile(r'\D*(\d+)\.tex')

    for item in os.listdir(dirname):
        # os.path.join works with or without a trailing separator on dirname
        # and uses the separator of the current platform.
        student_dir = os.path.join(dirname, item)
        if not os.path.isdir(student_dir):
            # Plain files lying next to the student folders are not solutions.
            continue

        student_surname, student_name = item.split()
        student_display = f'{student_surname} {student_name}'
        if year:
            student_display = f'{student_display} ({year})'

        for file in os.listdir(student_dir):
            if not file.endswith('.tex'):
                continue
            x = rexp.match(file)
            if not x:
                continue

            problem_id = int(x.group(1))
            if problem_id < 2:
                continue

            solution_text = get_solution_text(os.path.join(student_dir, file))
            if solution_text:
                old_solutions[problem_id][student_display] = solution_text
            else:
                # Covers both "pattern not found" (None) and an empty body.
                print('Failed to find solution in ' + item + '/' + file)

    return old_solutions

In [5]:
def find_plagiarism(threshold=0.5, quality=9, problem_ids=None, problem_ids_exclude=None, exclusion_threshold=0.0,
                    solutions_dir=r'C:\Users\dainiak\Dropbox\Apps\ShareLaTeX\\',
                    archive_dirs=(
                        (r'c:\Users\dainiak\Documents\ds_reviews\dumps-2015\\', 2015),
                        (r'c:\Users\dainiak\Documents\ds_reviews\dumps-2016\\', 2016),
                    )):
    """Print pairs of suspiciously similar solutions.

    Current solutions are loaded with ``traverse`` and archived ones with
    ``traverse_archive``. For every selected problem each pair of current
    solutions, and each (current, archived) pair, is scored with ``NCD``;
    pairs whose distance is small enough are printed grouped by problem.
    Prints ``No duplicates`` when nothing is reported.

    Args:
        threshold: Pairs with ``distance <= threshold`` are reported for
            problems that are not listed in ``problem_ids_exclude``.
        quality: LZMA preset used for the compressed-size cache.
        problem_ids: Optional iterable; when non-empty only these problems
            are checked.
        problem_ids_exclude: Optional iterable of problems that are expected
            to have similar solutions.
        exclusion_threshold: When ``<= 0`` (or ``None``), excluded problems are
            not checked at all. When positive, any pair (of any problem) with
            ``distance <= exclusion_threshold`` is reported, which is the only
            way a pair of an excluded problem gets reported.
        solutions_dir: Directory handed to ``traverse``.
        archive_dirs: Iterable of ``(directory, year)`` pairs handed to
            ``traverse_archive``.

    Returns:
        None. Results are written to standard output.
    """
    global compressed_length

    all_solutions = traverse(solutions_dir)

    old_solutions = defaultdict(lambda: defaultdict(str))
    for archive_dir, archive_year in archive_dirs:
        archive = traverse_archive(archive_dir, year=archive_year)
        for problem_id in archive:
            old_solutions[problem_id].update(archive[problem_id])

    # Start from an empty cache: sizes depend on the chosen quality.
    compressed_length = keydefaultdict(compressor(quality))

    if exclusion_threshold is None:
        exclusion_threshold = 0.0
    excluded_ids = set(problem_ids_exclude or ())

    filtered_ids = set(all_solutions)
    if problem_ids:
        filtered_ids.intersection_update(problem_ids)
    if exclusion_threshold <= 0:
        # Without a positive exclusion threshold an excluded problem could
        # never be reported, so do not waste time compressing its solutions.
        # (excluded_ids is empty when no exclusions were given.)
        filtered_ids.difference_update(excluded_ids)

    def is_reportable(problem_id, distance):
        """Tell whether a pair at this distance should be reported."""
        if distance <= exclusion_threshold:
            return True
        return problem_id not in excluded_ids and distance <= threshold

    similar_pairs = defaultdict(dict)

    # Pass 1: every pair of current solutions of the same problem.
    n_pairs_to_check = sum(
        len(all_solutions[problem_id]) * (len(all_solutions[problem_id]) - 1) // 2
        for problem_id in filtered_ids
    )
    with tqdm(total=n_pairs_to_check) as progress_bar:
        for problem_id in filtered_ids:
            for (i1, s1), (i2, s2) in combinations(all_solutions[problem_id].items(), 2):
                distance = NCD(s1, s2)
                progress_bar.update(1)
                if is_reportable(problem_id, distance):
                    similar_pairs[problem_id][(i1, i2)] = distance

    # Pass 2: every current solution against every archived one.
    # The membership test avoids creating empty entries in old_solutions.
    n_pairs_to_check = sum(
        len(old_solutions[problem_id]) * len(all_solutions[problem_id])
        for problem_id in filtered_ids
        if problem_id in old_solutions
    )
    with tqdm(total=n_pairs_to_check) as progress_bar:
        for problem_id in filtered_ids:
            if problem_id not in old_solutions:
                continue
            current_items = all_solutions[problem_id].items()
            archived_items = old_solutions[problem_id].items()
            for (i1, s1), (i2, s2) in product(current_items, archived_items):
                distance = NCD(s1, s2)
                progress_bar.update(1)
                if is_reportable(problem_id, distance):
                    similar_pairs[problem_id][(i1, i2)] = distance

    sys.stdout.flush()
    if similar_pairs:
        for problem_id, pairs in similar_pairs.items():
            print(f'\nSimilar solutions for problem {problem_id}:')
            print(', '.join(
                f'[({first}, {second}), {round(distance, 2)}]'
                for (first, second), distance in pairs.items()
            ))
    else:
        print('No duplicates')

In [6]:
# Run the check with LZMA preset 4. Pairs at distance <= 0.5 are reported,
# except for the problems listed in problem_ids_exclude: their pairs are
# reported only when the distance is <= 0.2.
find_plagiarism(
    threshold=0.5,
    quality=4,
    problem_ids_exclude=[1, 128, 156, 315, 314],
    exclusion_threshold=0.2
)

Failed to find solution in Глушенков Иван/solution109.tex




Failed to find solution in Глушенков Иван/solution156.tex




Failed to find solution in Дахова Елизавета/solution314.tex




  0%|                                                                                          | 0/247 [00:00<?, ?it/s]

  3%|██▎                                                                               | 7/247 [00:00<00:04, 57.34it/s]

  5%|███▉                                                                             | 12/247 [00:00<00:04, 52.82it/s]

  7%|█████▉                                                                           | 18/247 [00:00<00:04, 51.13it/s]

  9%|███████▌                                                                         | 23/247 [00:00<00:04, 50.47it/s]

 11%|████████▊                                                                        | 27/247 [00:00<00:05, 41.02it/s]

 13%|██████████▍                                                                      | 32/247 [00:00<00:05, 40.45it/s]

 15%|████████████▏                                                                    | 37/247 [00:00<00:05, 41.31it/s]

 17%|█████████████▍                                                                   | 41/247 [00:00<00:05, 39.33it/s]

 18%|██████████████▊                                                                  | 45/247 [00:01<00:05, 37.41it/s]

 20%|████████████████                                                                 | 49/247 [00:01<00:05, 37.39it/s]

 22%|█████████████████▋                                                               | 54/247 [00:01<00:04, 40.35it/s]

 26%|████████████████████▋                                                            | 63/247 [00:01<00:03, 48.27it/s]

 30%|███████████████████████▉                                                         | 73/247 [00:01<00:03, 57.03it/s]

 34%|███████████████████████████▏                                                     | 83/247 [00:01<00:02, 65.46it/s]

 38%|██████████████████████████████▍                                                  | 93/247 [00:01<00:02, 72.38it/s]

 42%|█████████████████████████████████▋                                              | 104/247 [00:01<00:01, 79.42it/s]

 46%|████████████████████████████████████▌                                           | 113/247 [00:01<00:01, 80.76it/s]

 50%|████████████████████████████████████████▏                                       | 124/247 [00:02<00:01, 86.50it/s]

 54%|███████████████████████████████████████████▍                                    | 134/247 [00:02<00:01, 82.95it/s]

 58%|██████████████████████████████████████████████▋                                 | 144/247 [00:02<00:01, 85.17it/s]

 62%|█████████████████████████████████████████████████▌                              | 153/247 [00:02<00:01, 85.07it/s]

 66%|█████████████████████████████████████████████████████                           | 164/247 [00:02<00:00, 89.04it/s]

 72%|█████████████████████████████████████████████████████████▎                      | 177/247 [00:02<00:00, 98.10it/s]

 81%|███████████████████████████████████████████████████████████████▋               | 199/247 [00:02<00:00, 116.89it/s]

 87%|█████████████████████████████████████████████████████████████████████          | 216/247 [00:02<00:00, 127.80it/s]

 94%|█████████████████████████████████████████████████████████████████████████▉     | 231/247 [00:02<00:00, 129.21it/s]

100%|██████████████████████████████████████████████████████████████████████████████▋| 246/247 [00:03<00:00, 122.86it/s]

100%|████████████████████████████████████████████████████████████████████████████████| 247/247 [00:03<00:00, 81.26it/s]




  0%|                                                                                          | 0/102 [00:00<?, ?it/s]

  6%|████▊                                                                             | 6/102 [00:00<00:01, 57.10it/s]

 15%|███████████▉                                                                     | 15/102 [00:00<00:01, 63.45it/s]

 25%|███████████████████▊                                                             | 25/102 [00:00<00:01, 69.32it/s]

 31%|█████████████████████████▍                                                       | 32/102 [00:00<00:01, 68.69it/s]

 39%|███████████████████████████████▊                                                 | 40/102 [00:00<00:00, 70.21it/s]

 47%|██████████████████████████████████████                                           | 48/102 [00:00<00:00, 70.93it/s]

 56%|█████████████████████████████████████████████▎                                   | 57/102 [00:00<00:00, 74.60it/s]

 65%|████████████████████████████████████████████████████▍                            | 66/102 [00:00<00:00, 78.62it/s]

 73%|██████████████████████████████████████████████████████████▊                      | 74/102 [00:00<00:00, 73.16it/s]

 80%|█████████████████████████████████████████████████████████████████                | 82/102 [00:01<00:00, 72.22it/s]

 89%|████████████████████████████████████████████████████████████████████████▎        | 91/102 [00:01<00:00, 76.36it/s]

 97%|██████████████████████████████████████████████████████████████████████████████▌  | 99/102 [00:01<00:00, 76.52it/s]

100%|████████████████████████████████████████████████████████████████████████████████| 102/102 [00:01<00:00, 75.78it/s]





Similar solutions for problem 314:




[(Ахметзянов Талгат, Никулов Сергей), 0.15], [(Ахметзянов Талгат, Пономаренко Николай), 0.15], [(Ахметзянов Талгат, Якупова Аделина), 0.15], [(Никулов Сергей, Пономаренко Николай), 0.02], [(Никулов Сергей, Якупова Аделина), 0.02], [(Пономаренко Николай, Якупова Аделина), 0.02]





Similar solutions for problem 156:




[(Сикалов Никита, Сунгатуллин Руслан), 0.16]


