In [2]:
import lzma
from itertools import combinations, product
from collections import defaultdict
from tqdm import tqdm
import re
import os
import sys


class keydefaultdict(defaultdict):
    """A ``defaultdict`` whose factory receives the missing key.

    The stock ``defaultdict`` calls ``default_factory()`` with no arguments;
    this variant calls ``default_factory(key)``, stores the result under
    ``key`` and returns it. That makes it usable as a memoising cache for a
    one-argument function.
    """

    def __missing__(self, key):
        """Build, store and return the value for ``key``.

        Raises:
            KeyError: if no ``default_factory`` was supplied.
        """
        factory = self.default_factory
        if factory is None:
            raise KeyError(key)

        value = factory(key)
        self[key] = value
        return value


# Module-level cache mapping a byte string to its compressed length, read by NCD().
# It is created without a factory, so any lookup raises KeyError until
# find_plagiarism() rebinds it to keydefaultdict(compressor(quality)).
compressed_length = keydefaultdict()


def compressor(quality):
    """Return a function that measures the raw-LZMA2 compressed size of bytes.

    Args:
        quality: value passed as the ``preset`` of the LZMA2 filter.

    Returns:
        A one-argument callable ``s -> int`` giving ``len()`` of ``s``
        compressed as a raw stream (``FORMAT_RAW``), i.e. without the
        container header that the default format would add.
    """
    filter_chain = [{"id": lzma.FILTER_LZMA2, "preset": quality}]

    def compressed_size(s):
        packed = lzma.compress(s, format=lzma.FORMAT_RAW, filters=filter_chain)
        return len(packed)

    return compressed_size


def NCD(x, y, strategy=None):
    """Compression-based distance between two byte strings.

    All sizes are looked up in the module-level ``compressed_length`` mapping,
    so every distinct string (and concatenation) is compressed at most once
    for as long as that mapping is kept.

    Args:
        x, y: the two inputs; they must support ``+`` and be usable as keys
            of ``compressed_length``.
        strategy: selects the formula, with ``C`` denoting compressed size:
            ``'MINMAX'`` -> ``(min(C(xy), C(yx)) - max(C(x), C(y))) / max(C(x), C(y))``
            ``'FAST'``   -> ``(C(yx) - max(C(x), C(y))) / min(C(x), C(y))``
            anything else (including ``None``) ->
            ``(C(xy) + C(yx) - C(x) - C(y)) / (C(x) + C(y))``

    Returns:
        The distance as a float; smaller means more similar.
    """
    size_x = compressed_length[x]
    size_y = compressed_length[y]

    if strategy == 'MINMAX':
        # NOTE(review): the commonly cited NCD subtracts the *smaller* size in
        # the numerator; this variant subtracts the larger one. Confirm intent.
        larger = max(size_x, size_y)
        best_joint = min(compressed_length[x + y], compressed_length[y + x])
        return (best_joint - larger) / larger

    if strategy == 'FAST':
        # Only one concatenation (y + x) is compressed, whichever input is larger.
        smaller, larger = sorted((size_x, size_y))
        return (compressed_length[y + x] - larger) / smaller

    joint_total = compressed_length[x + y] + compressed_length[y + x]
    return (joint_total - size_x - size_y) / (size_x + size_y)

In [4]:
# Matches a whole document and captures what lies between \begin{solution}
# and \end{solution}; DOTALL lets '.' run across line breaks.
rsol = re.compile(r'.*\\begin\{solution\}(.*)\\end\{solution\}.*', re.DOTALL)


def get_solution_text(filename):
    """Read a UTF-8 ``.tex`` file and return its normalised solution body.

    The captured text is stripped of surrounding whitespace, lower-cased and
    has every space character removed (other whitespace inside the body, such
    as newlines, is kept) before being encoded to bytes.

    Args:
        filename: path of the file to read.

    Returns:
        The normalised body as ``bytes`` (possibly empty), or ``None`` when
        the file contains no ``solution`` environment.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        found = rsol.match(source.read())

    if not found:
        return None

    body = found.group(1)
    normalised = body.strip().lower().replace(' ', '')
    return normalised.encode()


def traverse(dirname):
    """Collect the current course's solutions from the folders under ``dirname``.

    Only sub-directories whose name starts with ``ds2017`` are visited. Their
    name is expected to be four or more whitespace-separated tokens, the
    third onwards being the student's surname and given name(s). Inside each
    folder every ``*.tex`` file whose name ends in a number is treated as the
    solution to the problem with that number; problems numbered below 2 are
    ignored.

    Args:
        dirname: directory holding the per-student folders, with or without a
            trailing path separator.

    Returns:
        A ``defaultdict`` mapping ``problem_id`` (int) to a ``defaultdict``
        mapping ``"Surname Name"`` to the solution text returned by
        ``get_solution_text``.

    Side effects:
        Prints a line for folders with an unexpected name, for numbered
        ``.tex`` files that do not start with ``solution``, and for files in
        which no solution text was found.
    """
    all_solutions = defaultdict(lambda: defaultdict(str))
    rexp = re.compile(r'\D*(\d+)\.tex')

    # Sorted so that the result (and every report built from it) does not
    # depend on the order in which the file system lists entries.
    for item in sorted(os.listdir(dirname)):
        if not item.startswith('ds2017'):
            continue

        # os.path.join keeps this portable and tolerant of a missing trailing
        # separator, unlike manual concatenation with a hard-coded backslash.
        student_dir = os.path.join(dirname, item)
        if not os.path.isdir(student_dir):
            continue

        name_parts = item.split()
        if len(name_parts) < 4:
            print('Skipping folder with unexpected name: ' + item)
            continue
        # Derive the display name from the parsed tokens rather than from
        # fixed character offsets into the folder name.
        student = ' '.join(name_parts[2:])

        for file in sorted(os.listdir(student_dir)):
            if not file.endswith('.tex'):
                continue
            x = rexp.match(file)
            if not x:
                continue

            # Numbered file with a non-standard name: report it, but still use it.
            if not file.startswith('solution'):
                print(student + '/' + file)

            problem_id = int(x.group(1))
            if problem_id < 2:
                continue

            solution_text = get_solution_text(os.path.join(student_dir, file))
            if solution_text:
                all_solutions[problem_id][student] = solution_text
            else:
                print('Failed to find solution in ' + student + '/' + file)
    return all_solutions


def traverse_archive(dirname, year=None):
    """Collect solutions from an archive of a previous year.

    Every sub-directory of ``dirname`` is taken to be one student's folder,
    named with the surname followed by the given name(s). Inside each folder
    every ``*.tex`` file whose name ends in a number is treated as the
    solution to the problem with that number; problems numbered below 2 are
    ignored.

    Args:
        dirname: directory holding the per-student folders, with or without a
            trailing path separator.
        year: optional label; when truthy it is appended to the student's
            display name as ``" (year)"``.

    Returns:
        A ``defaultdict`` mapping ``problem_id`` (int) to a ``defaultdict``
        mapping the student's display name to the solution text returned by
        ``get_solution_text``.

    Side effects:
        Prints a line for folders with an unexpected name and for files in
        which no solution text was found.
    """
    old_solutions = defaultdict(lambda: defaultdict(str))
    rexp = re.compile(r'\D*(\d+)\.tex')

    # Sorted so that the result does not depend on file-system listing order.
    for item in sorted(os.listdir(dirname)):
        # os.path.join keeps this portable and tolerant of a missing trailing
        # separator, unlike manual concatenation with a hard-coded backslash.
        student_dir = os.path.join(dirname, item)
        if not os.path.isdir(student_dir):
            continue

        name_parts = item.split()
        if len(name_parts) < 2:
            print('Skipping folder with unexpected name: ' + item)
            continue

        student_display = ' '.join(name_parts)
        if year:
            student_display = f'{student_display} ({year})'

        for file in sorted(os.listdir(student_dir)):
            if not file.endswith('.tex'):
                continue
            x = rexp.match(file)
            if not x:
                continue

            problem_id = int(x.group(1))
            if problem_id < 2:
                continue

            solution_text = get_solution_text(os.path.join(student_dir, file))
            if solution_text:
                old_solutions[problem_id][student_display] = solution_text
            else:
                print('Failed to find solution in ' + item + '/' + file)

    return old_solutions

In [5]:
def find_plagiarism(threshold=0.5, quality=9, problem_ids=None, problem_ids_exclude=None, exclusion_threshold=0.0,
                    solutions_dir=None, archive_dirs=None):
    """Report pairs of suspiciously similar solutions.

    For every selected problem, each pair of current solutions and each pair
    (current solution, archived solution) is compared with ``NCD``; pairs
    whose distance is small enough are printed, grouped by problem.

    Args:
        threshold: a pair is reported when its distance is ``<= threshold``.
        quality: LZMA preset handed to ``compressor``.
        problem_ids: optional iterable; when non-empty, only these problems
            are checked.
        problem_ids_exclude: optional iterable of problems that get special
            treatment. With a positive ``exclusion_threshold`` they are still
            checked but reported only at distance ``<= exclusion_threshold``;
            otherwise they are skipped altogether.
        exclusion_threshold: any pair (on any problem) at or below this
            distance is reported. ``None`` is treated as "no such threshold".
        solutions_dir: directory with the current solutions; defaults to the
            original author's ShareLaTeX folder.
        archive_dirs: iterable of ``(directory, year)`` pairs with earlier
            years' solutions; defaults to the original 2015 and 2016 dumps.

    Returns:
        None. The report (or ``No duplicates``) is printed to stdout.

    Side effects:
        Rebinds the module-level ``compressed_length`` cache to a fresh one
        built for ``quality``.
    """
    global compressed_length

    if solutions_dir is None:
        solutions_dir = r'C:\Users\dainiak\Dropbox\Apps\ShareLaTeX\\'
    if archive_dirs is None:
        archive_dirs = [
            (r'c:\Users\dainiak\Documents\ds_reviews\dumps-2015\\', 2015),
            (r'c:\Users\dainiak\Documents\ds_reviews\dumps-2016\\', 2016),
        ]

    all_solutions = traverse(solutions_dir)

    # Merge all archives into one {problem_id: {student: text}} mapping.
    old_solutions = defaultdict(dict)
    for archive_dir, year in archive_dirs:
        archive = traverse_archive(archive_dir, year=year)
        for problem_id in archive:
            old_solutions[problem_id].update(archive[problem_id])

    # Compressed sizes depend on the preset, so start from an empty cache.
    compressed_length = keydefaultdict(compressor(quality))

    excluded_ids = set(problem_ids_exclude or ())
    has_strict_threshold = exclusion_threshold is not None and exclusion_threshold > 0

    filtered_ids = set(all_solutions)
    if problem_ids:
        filtered_ids.intersection_update(problem_ids)
    # Excluded problems are dropped only when there is no positive strict
    # threshold to judge them by. Grouping the test this way also keeps the
    # default call (no exclusions, threshold 0.0) from touching a None list.
    if excluded_ids and not has_strict_threshold:
        filtered_ids.difference_update(excluded_ids)

    # Fixed order keeps the progress and the report reproducible.
    ordered_ids = sorted(filtered_ids)

    similar_pairs = defaultdict(dict)

    def is_suspicious(problem_id, distance):
        """Apply the strict threshold to everything, the normal one to non-excluded problems."""
        if exclusion_threshold is not None and distance <= exclusion_threshold:
            return True
        if problem_id in excluded_ids:
            return False
        return distance <= threshold

    def check_pairs(problem_id, pairs, progress_bar):
        """Measure every ``((name, text), (name, text))`` pair and record the suspicious ones."""
        for (i1, s1), (i2, s2) in pairs:
            distance = NCD(s1, s2)
            progress_bar.update(1)
            if is_suspicious(problem_id, distance):
                similar_pairs[problem_id][(i1, i2)] = distance

    # Pass 1: current solutions against each other.
    n_pairs_to_check = sum(
        k * (k - 1) // 2
        for k in (len(all_solutions[problem_id]) for problem_id in ordered_ids)
    )
    with tqdm(total=n_pairs_to_check) as progress_bar:
        for problem_id in ordered_ids:
            check_pairs(problem_id, combinations(all_solutions[problem_id].items(), 2), progress_bar)

    # Pass 2: current solutions against archived ones.
    archived_ids = [problem_id for problem_id in ordered_ids if problem_id in old_solutions]
    n_pairs_to_check = sum(
        len(old_solutions[problem_id]) * len(all_solutions[problem_id])
        for problem_id in archived_ids
    )
    with tqdm(total=n_pairs_to_check) as progress_bar:
        for problem_id in archived_ids:
            check_pairs(
                problem_id,
                product(all_solutions[problem_id].items(), old_solutions[problem_id].items()),
                progress_bar
            )

    sys.stdout.flush()
    if not similar_pairs:
        print('No duplicates')
        return

    for problem_id in sorted(similar_pairs):
        print(f'\nSimilar solutions for problem {problem_id}:')
        print(', '.join(
            f'[({first}, {second}), {round(distance, 2)}]'
            for (first, second), distance in similar_pairs[problem_id].items()
        ))

In [9]:
# Run the check with LZMA preset 4. Pairs are reported at distance <= 0.5,
# except on the listed problems, where only pairs at distance <= 0.2 are reported.
find_plagiarism(
    threshold=0.5,
    quality=4,
    problem_ids_exclude=[1, 128, 156, 315, 314],
    exclusion_threshold=0.2
)

Failed to find solution in Глушенков Иван/solution109.tex




Failed to find solution in Глушенков Иван/solution156.tex




Failed to find solution in Дахова Елизавета/solution314.tex




  0%|                                                                                          | 0/282 [00:00<?, ?it/s]

  2%|█▋                                                                                | 6/282 [00:00<00:04, 57.10it/s]

  6%|████▌                                                                            | 16/282 [00:00<00:04, 64.63it/s]

  9%|██████▉                                                                          | 24/282 [00:00<00:03, 68.22it/s]

 12%|█████████▊                                                                       | 34/282 [00:00<00:03, 72.28it/s]

 15%|████████████                                                                     | 42/282 [00:00<00:03, 73.90it/s]

 17%|██████████████                                                                   | 49/282 [00:00<00:03, 69.11it/s]

 21%|████████████████▋                                                                | 58/282 [00:00<00:03, 73.00it/s]

 24%|███████████████████▊                                                             | 69/282 [00:00<00:02, 79.94it/s]

 29%|███████████████████████▊                                                         | 83/282 [00:00<00:02, 90.19it/s]

 34%|███████████████████████████▌                                                     | 96/282 [00:01<00:01, 97.84it/s]

 39%|███████████████████████████████                                                | 111/282 [00:01<00:01, 108.27it/s]

 44%|███████████████████████████████████                                            | 125/282 [00:01<00:01, 113.60it/s]

 49%|██████████████████████████████████████▋                                        | 138/282 [00:01<00:01, 116.77it/s]

 55%|███████████████████████████████████████████▍                                   | 155/282 [00:01<00:00, 127.19it/s]

 60%|███████████████████████████████████████████████▌                               | 170/282 [00:01<00:00, 133.24it/s]

 67%|████████████████████████████████████████████████████▋                          | 188/282 [00:01<00:00, 143.44it/s]

 75%|███████████████████████████████████████████████████████████▍                   | 212/282 [00:01<00:00, 161.78it/s]

 83%|█████████████████████████████████████████████████████████████████▌             | 234/282 [00:01<00:00, 174.03it/s]

 90%|██████████████████████████████████████████████████████████████████████▉        | 253/282 [00:02<00:00, 172.55it/s]

 96%|████████████████████████████████████████████████████████████████████████████▏  | 272/282 [00:02<00:00, 135.04it/s]

100%|███████████████████████████████████████████████████████████████████████████████| 282/282 [00:02<00:00, 121.42it/s]




  0%|                                                                                          | 0/126 [00:00<?, ?it/s]

  6%|█████▏                                                                            | 8/126 [00:00<00:01, 75.00it/s]

 15%|████████████▏                                                                    | 19/126 [00:00<00:01, 82.25it/s]

 23%|██████████████████▋                                                              | 29/126 [00:00<00:01, 86.18it/s]

 29%|███████████████████████▏                                                         | 36/126 [00:00<00:01, 77.88it/s]

 34%|███████████████████████████▋                                                     | 43/126 [00:00<00:01, 74.36it/s]

 40%|████████████████████████████████▊                                                | 51/126 [00:00<00:01, 74.05it/s]

 47%|█████████████████████████████████████▉                                           | 59/126 [00:00<00:00, 75.08it/s]

 56%|█████████████████████████████████████████████                                    | 70/126 [00:00<00:00, 81.68it/s]

 63%|██████████████████████████████████████████████████▊                              | 79/126 [00:00<00:00, 76.68it/s]

 69%|███████████████████████████████████████████████████████▉                         | 87/126 [00:01<00:00, 60.12it/s]

 75%|████████████████████████████████████████████████████████████▍                    | 94/126 [00:01<00:00, 59.71it/s]

 83%|██████████████████████████████████████████████████████████████████▋             | 105/126 [00:01<00:00, 68.62it/s]

 91%|█████████████████████████████████████████████████████████████████████████       | 115/126 [00:01<00:00, 74.02it/s]

 98%|██████████████████████████████████████████████████████████████████████████████▋ | 124/126 [00:01<00:00, 71.46it/s]

100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [00:01<00:00, 73.94it/s]





Similar solutions for problem 314:




[(Ахметзянов Талгат, Кревский Михаил), 0.15], [(Ахметзянов Талгат, Никулов Сергей), 0.15], [(Ахметзянов Талгат, Пономаренко Николай), 0.15], [(Ахметзянов Талгат, Якупова Аделина), 0.15], [(Кревский Михаил, Никулов Сергей), 0.02], [(Кревский Михаил, Пономаренко Николай), 0.02], [(Кревский Михаил, Якупова Аделина), 0.02], [(Никулов Сергей, Пономаренко Николай), 0.02], [(Никулов Сергей, Якупова Аделина), 0.02], [(Пономаренко Николай, Якупова Аделина), 0.02]


