In [1]:
import hashlib
import io
import lzma
import os
import pickle
import re
import sys
from collections import defaultdict
from datetime import datetime
from itertools import combinations, product

import pyppmd
from fuzzywuzzy import fuzz
from tqdm.notebook import tqdm

# Callable mapping a payload to its compressed size. It stays None until a
# later cell assigns the result of build_compressor(...).
compressor = None
# Memo of compressed sizes, filled by get_compressed_length and persisted /
# restored by store_cache / load_cache.
compressed_length_cache = dict()

def build_compressor(quality):
    """Return a one-argument function giving the compressed size of a payload.

    quality == 10 selects pyppmd (max_order=128, mem_size=1<<17); any other
    value is passed as the "preset" of a raw LZMA2 filter chain.
    """
    def ppmd_size(s):
        packed = pyppmd.compress(s, max_order=128, mem_size=1 << 17)
        return len(packed)

    def lzma_size(s):
        lzma2_filter = {"id": lzma.FILTER_LZMA2, "preset": quality}
        packed = lzma.compress(
            s,
            format=lzma.FORMAT_RAW,
            filters=[lzma2_filter],
        )
        return len(packed)

    return ppmd_size if quality == 10 else lzma_size

def get_compressed_length(s):
    """Return compressor(s), memoised in compressed_length_cache.

    The cache key is (16-byte BLAKE2b digest of the payload, payload length).
    A content digest is used instead of the built-in hash() because hash() of
    bytes/str is salted per interpreter process: keys built with it do not
    survive the pickle round trip done by store_cache / load_cache, so a
    reloaded cache would never be hit. Entries stored under the former
    (hash(s), len(s)) keys are harmless; they are simply never looked up.

    Parameters
    ----------
    s : bytes-like or str
        Payload to measure. A str is UTF-8 encoded for key computation only;
        the module-level `compressor` still receives `s` unchanged.

    Returns
    -------
    Whatever the module-level `compressor` callable returns for `s`.
    """
    raw = s.encode() if isinstance(s, str) else s
    cache_key = (hashlib.blake2b(raw, digest_size=16).digest(), len(raw))
    cached = compressed_length_cache.get(cache_key)
    if cached is None:
        cached = compressor(s)
        compressed_length_cache[cache_key] = cached
    return cached

def NCD(x, y, strategy=None):
    """Distance between two payloads; smaller means more similar.

    Returns 1.0 outright when either payload is shorter than 400 items.
    With C(.) = get_compressed_length, the strategies are:

    'LEVENSHTEIN' : 1 - fuzz.ratio(x, y) / 100 (no compression involved)
    'MINMAX'      : (min(C(xy), C(yx)) - max(C(x), C(y))) / max(C(x), C(y))
    'METHOD-1'    : (min(C(xy), C(yx)) - max(C(x), C(y))) / min(C(x), C(y))
    'METHOD-2'    : (2 * min(C(xy), C(yx)) - C(x) - C(y)) / (C(x) + C(y))
    'METHOD-3'    : min((C(xy) - C(x)) / C(y), (C(yx) - C(y)) / C(x))
    'FAST'        : (C(yx) - max(C(x), C(y))) / min(C(x), C(y))
    anything else : (C(xy) + C(yx) - C(x) - C(y)) / (C(x) + C(y))
    """
    if len(x) < 400 or len(y) < 400:
        return 1.0
    if strategy == 'LEVENSHTEIN':
        return 1. - 0.01 * fuzz.ratio(x, y)

    size_x = get_compressed_length(x)
    size_y = get_compressed_length(y)
    smaller, larger = min(size_x, size_y), max(size_x, size_y)

    if strategy == 'FAST':
        # Only one concatenation is compressed here, which is what makes it fast.
        return (get_compressed_length(y + x) - larger) / smaller

    # Every remaining strategy needs both concatenation orders.
    size_xy = get_compressed_length(x + y)
    size_yx = get_compressed_length(y + x)
    best_joint = min(size_xy, size_yx)

    if strategy == 'MINMAX':
        return (best_joint - larger) / larger
    if strategy == 'METHOD-1':
        return (best_joint - larger) / smaller
    if strategy == 'METHOD-2':
        return (2 * best_joint - size_x - size_y) / (size_x + size_y)
    if strategy == 'METHOD-3':
        gain_over_x = (size_xy - size_x) / size_y
        gain_over_y = (size_yx - size_y) / size_x
        return min(gain_over_x, gain_over_y)

    return (size_xy + size_yx - size_x - size_y) / (size_x + size_y)


In [2]:
# Captures the text between \begin{solution} and \end{solution}; DOTALL lets
# the body span several lines. Both the leading `.*` and the group are greedy,
# so with several solution environments in one file the capture starts after
# the last \begin{solution} that is still followed by an \end{solution}.
rsol = re.compile(r'.*\\begin\{solution\}(.*)\\end\{solution\}.*', re.DOTALL)


def get_solution_text(filename):
    """Read a UTF-8 file and return its normalised solution body as bytes.

    The body is whatever `rsol` captures; it is stripped, lower-cased, has all
    space characters removed and is then encoded. Returns None when the file
    does not match `rsol`.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        found = rsol.match(source.read())

    if not found:
        return None

    body = found.group(1)
    normalised = body.strip().lower().replace(' ', '')
    return normalised.encode()


def traverse(dirname, problem_ids=None):
    """Collect solution bodies from the 'ds2021*' sub-directories of `dirname`.

    Each such directory name is expected to split into exactly four
    whitespace-separated tokens, the last two being surname and name; other
    names are reported and skipped instead of aborting the whole scan.
    Every '.tex' file below the directory (any depth) whose file name matches
    '<non-digits><optional zeros><number>.tex' is read with get_solution_text.

    Parameters
    ----------
    dirname : str
        Root directory to scan; a trailing separator is optional.
    problem_ids : container of int, optional
        When given, only these problem numbers are kept.

    Returns
    -------
    defaultdict
        problem_id -> {'<surname> <name>': normalised solution bytes}.
        Problem numbers below 2 are always skipped.
    """
    all_solutions = defaultdict(lambda: defaultdict(str))
    # Applied to the bare file name, so it does not depend on which path
    # separator the platform uses.
    rexp = re.compile(r'[^1-9]*0*(\d+)\.tex')

    for item in os.listdir(dirname):
        if not item.startswith('ds2021'):
            continue

        name_parts = item.split()
        if len(name_parts) != 4:
            print(f'Unexpected directory name, skipped: {item}')
            continue
        student = f'{name_parts[2]} {name_parts[3]}'

        student_root = os.path.join(dirname, item)
        for root, _dirs, files in os.walk(student_root, topdown=False):
            for filename in files:
                if not filename.endswith('.tex'):
                    continue

                rematch = rexp.match(filename)
                if not rematch:
                    continue

                file = os.path.join(root, filename)
                if 'solution-' not in file:
                    print(f'File not starts with “solution-”: {file}')

                problem_id = int(rematch.group(1))
                if problem_id < 2:
                    continue
                if problem_ids is not None and problem_id not in problem_ids:
                    continue

                solution_text = get_solution_text(file)
                if solution_text:
                    all_solutions[problem_id][student] = solution_text
                else:
                    print(f'Failed to find solution in {file}')
    return all_solutions


def traverse_archive(dirname, year=None, problem_ids=None):
    """Collect solution bodies from an archive laid out as '<surname> <name>/*.tex'.

    Only the direct children of each student directory are inspected.
    Entries of `dirname` that are not directories, or whose name does not
    split into exactly two tokens, are skipped instead of aborting the scan.

    Parameters
    ----------
    dirname : str
        Archive root; a trailing separator is optional.
    year : optional
        When truthy, ' (<year>)' is appended to every student label.
    problem_ids : container of int, optional
        When given, only these problem numbers are kept.

    Returns
    -------
    defaultdict
        problem_id -> {student label: normalised solution bytes}.
        Problem numbers below 2 are always skipped.
    """
    old_solutions = defaultdict(lambda: defaultdict(str))
    rexp = re.compile(r'[^1-9]*(\d+)\.tex')

    for item in os.listdir(dirname):
        student_dir = os.path.join(dirname, item)
        if not os.path.isdir(student_dir):
            continue

        name_parts = item.split()
        if len(name_parts) != 2:
            print(f'Unexpected directory name, skipped: {item}')
            continue
        student_display = f'{name_parts[0]} {name_parts[1]}'
        if year:
            student_display = f'{student_display} ({year})'

        for file in os.listdir(student_dir):
            if not file.endswith('.tex'):
                continue
            rematch = rexp.match(file)
            if not rematch:
                continue
            problem_id = int(rematch.group(1))
            if problem_id < 2:
                continue
            if problem_ids is not None and problem_id not in problem_ids:
                continue

            solution_text = get_solution_text(os.path.join(student_dir, file))
            if solution_text:
                old_solutions[problem_id][student_display] = solution_text
            else:
                print('Failed to find solution in ' + item + '/' + file)

    return old_solutions

In [3]:
def find_plagiarism(threshold=0.5,
                    NCD_strategy=None,
                    problem_ids=None,
                    problem_ids_exclude=None,
                    exclusion_threshold=0.0,
                    check_archives=True,
                    exclude_from_output=None,
                    focus_on_user=None,
                    solutions_dir=r'D:\Dropbox\Apps\Overleaf',
                    archive_dir_template=r'd:\Documents\plagiarism-detection\ds\student-solutions-latex-{year}\\',
                    archive_years=range(2015, 2020 + 1)
                   ):
    """Compare every pair of solutions per problem and print the close ones.

    This year's solutions (loaded with traverse) are compared with each other
    and, when `check_archives` is set, with the archived solutions of each
    year in `archive_years` (loaded with traverse_archive). Nothing is
    returned; the report is printed.

    Parameters
    ----------
    threshold : float
        A pair is reported when its distance is <= threshold, unless the
        problem is listed in `problem_ids_exclude`.
    NCD_strategy : str, optional
        Passed to NCD for both comparison phases.
    problem_ids : collection of int, optional
        When non-empty, restricts the check to these problems.
    problem_ids_exclude : collection of int, optional
        Problems for which only `exclusion_threshold` applies. When
        `exclusion_threshold` is not positive, these problems are dropped
        before any comparison is made.
    exclusion_threshold : float
        A pair is always reported when its distance is <= this value.
    check_archives : bool
        Whether to load and compare against previous years.
    exclude_from_output : dict, optional
        problem_id -> collection of (label, label) pairs to hide in the report.
    focus_on_user : str, optional
        When given, only pairs containing this label are compared.
    solutions_dir : str
        Directory handed to traverse.
    archive_dir_template : str
        str.format template with a `{year}` field; each result is handed to
        traverse_archive.
    archive_years : iterable
        Years substituted into `archive_dir_template`.
    """
    if exclude_from_output is None:
        exclude_from_output = dict()

    def is_flagged(problem_id, distance):
        # Pairs within exclusion_threshold are reported for every problem;
        # the ordinary threshold applies only to non-excluded problems.
        if distance <= exclusion_threshold:
            return True
        in_excluded_problem = (problem_ids_exclude is not None
                               and problem_id in problem_ids_exclude)
        return not in_excluded_problem and distance <= threshold

    def in_focus(first_label, second_label):
        return focus_on_user is None or focus_on_user in (first_label, second_label)

    print('Loading this year solutions')
    all_solutions = traverse(solutions_dir)

    print('Loading last years solutions')
    old_solutions = defaultdict(dict)
    if check_archives:
        for year in archive_years:
            archive = traverse_archive(archive_dir_template.format(year=year), year)
            for problem_id in archive:
                old_solutions[problem_id].update(archive[problem_id])

    filtered_ids = set(all_solutions)
    if problem_ids:
        filtered_ids.intersection_update(problem_ids)
    # The parentheses matter: without them `A and B or C` evaluated the right
    # side even when problem_ids_exclude was None and then failed inside
    # difference_update(None).
    if problem_ids_exclude is not None and (
            not exclusion_threshold or exclusion_threshold <= 0):
        filtered_ids.difference_update(problem_ids_exclude)

    similar_pairs = defaultdict(dict)

    # The pair lists are built up front so the progress totals match the
    # number of comparisons really made, also when focus_on_user filters pairs.
    current_pairs = [
        (problem_id, first, second)
        for problem_id in filtered_ids
        for first, second in combinations(all_solutions[problem_id].items(), 2)
        if in_focus(first[0], second[0])
    ]

    print('\nThis year solutions:', end='')
    with tqdm(total=len(current_pairs)) as progress_bar:
        for problem_id, (i1, s1), (i2, s2) in current_pairs:
            distance = NCD(s1, s2, NCD_strategy)
            progress_bar.update(1)
            if is_flagged(problem_id, distance):
                similar_pairs[problem_id][(i1, i2)] = distance

    archive_pairs = [
        (problem_id, first, second)
        for problem_id in filtered_ids
        if problem_id in old_solutions
        for first, second in product(all_solutions[problem_id].items(),
                                     old_solutions[problem_id].items())
        if in_focus(first[0], second[0])
    ]

    print('This year vs. last years solutions:', end='')
    with tqdm(total=len(archive_pairs)) as progress_bar:
        for problem_id, (i1, s1), (i2, s2) in archive_pairs:
            # Same strategy as above so one threshold means the same thing
            # in both phases.
            distance = NCD(s1, s2, NCD_strategy)
            progress_bar.update(1)
            if is_flagged(problem_id, distance):
                similar_pairs[problem_id][(i1, i2)] = distance

    sys.stdout.flush()
    print(f'Time of report: {datetime.now()}')
    no_duplicates = True
    for problem_id in sorted(similar_pairs):
        hidden = exclude_from_output.get(problem_id, {})
        pairs_list = sorted(
            (round(distance, 2), pair[0], pair[1])
            for pair, distance in similar_pairs[problem_id].items()
            if pair not in hidden
        )
        if pairs_list:
            print(f'Similar solutions for problem {problem_id}:\n{pairs_list}')
            no_duplicates = False
    if no_duplicates:
        print('No duplicates')

In [4]:
# Pairs to hide from the find_plagiarism report: problem id -> set of
# (label, label) tuples. find_plagiarism drops a reported pair when that exact
# 2-tuple (same order) is a member of the set for its problem.
# NOTE(review): the entries for problems 88, 276, 316 and 447 include 3-tuples
# that start with 0.5. The lookup uses 2-tuples, so those entries can never
# match — confirm whether the leading number was pasted in by mistake.
exclude_from_output = {
    10: {('Мартьянов Вова', 'Шевляков Антон')},
    27: {('Дилшодзода Равшан', 'Мирзаев Рустам')},
    37: {('Волков Алексей', 'Инденбом Дмитрий (2019)'), 
         ('Ильдаров Адам', 'Пименов Павел (2016)'), 
         ('Ильдаров Адам', 'Левашов Артём (2018)')},
    60: {('Мысов Никита', 'Солостовский Василий'), ('Мысов Никита', 'Яковлев Андрей')},
    68: {('Тихонин Тихон', 'Шляхин Михаил (2020)'), ('Кузнецов Николай', 'Тихонин Тихон')},
    72: {('Смирнов Павел', 'Хузин Тимур')},
    88: {('Завидонова Даниела', 'Беляев Анастасий (2020)'), ('Завидонова Даниела', 'Санников Григорий'), ('Лескин Иван', 'Хузин Тимур'), ('Завидонова Даниела', 'Зиняева Анастасия'), ('Завидонова Даниела', 'Колесникова Ксения'), ('Завидонова Даниела', 'Купцов Дмитрий (2019)'), ('Завидонова Даниела', 'Скороходов Всеволод'), ('Хузин Тимур', 'Беляев Анастасий (2020)'), ('Агафонов Александр', 'Колесникова Ксения'), ('Агафонов Александр', 'Купцов Дмитрий (2019)'), ('Завидонова Даниела', 'Онучин Артем'), (0.5, 'Колесникова Ксения', 'Хузин Тимур')},
    113: {('Хузин Тимур', 'Стебловский Дмитрий (2020)')},
    114: {('Мысов Никита', 'Юрченко Вероника')},
    120: {('Сенаторов Петр', 'Вознюк Юлия (2020)')},
    276: {('Тихонин Тихон', 'Яковлева Алена'), ('Ефимова Юлия', 'Тихонин Тихон'), ('Тихонин Тихон', 'Якупова Аделина (2017)'), (0.5, 'Кутный Данила', 'Антоненко Мария (2018)')},
    316: {('Завидонова Даниела', 'Матвеев Илья'), (0.5, 'Зиняева Екатерина', 'Матвеев Илья')},
    386: {('Зиняева Екатерина', 'Матвеев Илья')},
    445: {('Панов Никита', 'Тихонин Тихон'), ('Казакова Анастасия', 'Тихонин Тихон'), ('Панов Никита', 'Романчина Татьяна'), ('Муромцев Илья', 'Тихонин Тихон'), ('Павлунин Дмитрий', 'Смирнов Павел'), ('Смирнов Павел', 'Тихонин Тихон'), ('Зиганшин Марк', 'Тихонин Тихон'), ('Муромцев Илья', 'Смирнов Павел'), ('Павлунин Дмитрий', 'Тихонин Тихон'), ('Панов Никита', 'Смирнов Павел')},
    447: {(0.5, 'Зернышкина Екатерина', 'Сенаторов Петр'), (0.5, 'Морозов Артемий', 'Сенаторов Петр')}   
}

In [5]:
def find_plagiarism_single(problem_id,
                    solution_text,
                    threshold=0.5,
                    check_archives=True,
                    solutions_dir=r'D:\Dropbox\Apps\Overleaf\\',
                    archive_dir_template=r'd:\Documents\plagiarism-detection\ds\student-solutions-latex-{year}\\',
                    archive_years=range(2015, 2020 + 1)):
    """Compare one solution text against all stored solutions of one problem.

    `solution_text` is normalised like the stored solutions (stripped,
    lower-cased, spaces removed, encoded) and compared with NCD's default
    strategy against this year's solutions and, when `check_archives` is set,
    the archived ones. Nothing is returned; the report is printed, with the
    given text labelled 'CURRENT'.

    Parameters
    ----------
    problem_id : int
        Problem whose stored solutions are loaded.
    solution_text : str
        Raw solution body to check.
    threshold : float
        A stored solution is reported when its distance is <= threshold.
    check_archives : bool
        Whether previous years are included.
    solutions_dir : str
        Directory handed to traverse.
    archive_dir_template : str
        str.format template with a `{year}` field; each result is handed to
        traverse_archive.
    archive_years : iterable
        Years substituted into `archive_dir_template`.
    """
    solution_text = solution_text.strip().lower().replace(' ', '').encode()

    print('Loading this year solutions')
    all_solutions = traverse(solutions_dir, [problem_id])

    print('Loading last years solutions')
    # Copied into a fresh dict so the mapping returned by traverse is not
    # mutated; archive labels are merged after this year's, as before.
    candidates = dict(all_solutions.get(problem_id, {}))
    if check_archives:
        for year in archive_years:
            archive = traverse_archive(
                archive_dir_template.format(year=year),
                year,
                [problem_id]
            )
            candidates.update(archive.get(problem_id, {}))

    similar = {}
    with tqdm(total=len(candidates)) as progress_bar:
        for other_label, other_text in candidates.items():
            distance = NCD(solution_text, other_text)
            progress_bar.update(1)
            if distance <= threshold:
                similar[('CURRENT', other_label)] = distance

    sys.stdout.flush()
    pairs_list = sorted(
        (round(distance, 2), pair[0], pair[1])
        for pair, distance in similar.items()
    )
    if pairs_list:
        print(f'Similar solutions for problem {problem_id}:\n{pairs_list}')
    else:
        print('No duplicates')

In [6]:
def store_cache(quality):
    """Pickle compressed_length_cache to 'compressed_length_cache-q<quality>.pkl'.

    The file is written relative to the current working directory and
    replaces any existing file of the same name.
    """
    target = f'compressed_length_cache-q{quality}.pkl'
    with open(target, 'wb') as sink:
        pickle.dump(compressed_length_cache, sink)

def load_cache(quality):
    """Replace compressed_length_cache with the pickled cache for `quality`.

    Reads 'compressed_length_cache-q<quality>.pkl' from the current working
    directory. When that file does not exist, the cache is reset to an empty
    dict and a note is printed, so a first run needs no manual step.
    """
    global compressed_length_cache
    cache_path = rf'compressed_length_cache-q{quality}.pkl'
    try:
        with open(cache_path, 'rb') as infile:
            # SECURITY: unpickling can execute arbitrary code; only load
            # cache files this notebook wrote itself.
            loaded = pickle.load(infile)
    except FileNotFoundError:
        print(f'No cache file {cache_path}; starting with an empty cache')
        loaded = dict()
    compressed_length_cache = loaded

In [7]:
# Driver cell: restore the cache stored as 'compressed_length_cache-q10.pkl'
# (uncomment the next line instead to start from an empty cache), select the
# quality-10 (pyppmd) compressor and run the full report.
load_cache(10)
# compressed_length_cache = dict()

# get_compressed_length reads this module-level name, so it must be assigned
# before any comparison runs.
compressor = build_compressor(quality=10)
find_plagiarism(
    threshold=0.4,
    # NCD_strategy='METHOD-3',
#     problem_ids=[418],
    problem_ids_exclude=[10,44,51,60,67,68,71,72,79,83,84,87,88, 65, 66, 444],
    exclude_from_output=exclude_from_output,
    # focus_on_user='Вербов Ярослав'
)

Loading this year solutions
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Агафонов Александр\fall\solution-194.tex
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Агафонов Александр\solution-256.tex
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Айтимов Акежан\solution-135.tex
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Айтимов Акежан\solution-372.tex
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Айтимов Акежан\solution-387.tex
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Айтимов Акежан\solution-388.tex
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Айтимов Акежан\solution-435.tex
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Александров Илья\fall\solution-194.tex
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Баранов Виктор\solution-395.tex
Failed to find solution in D:\Dropbox\Apps\Overleaf\ds2021 — Баранов Виктор\solution-401.tex
Failed to find solutio

  0%|          | 0/22617 [00:00<?, ?it/s]

This year vs. last years solutions:

  0%|          | 0/88916 [00:00<?, ?it/s]

Time of report: 2022-05-05 22:17:15.114781
Similar solutions for problem 145:
[(0.17, 'Айтимов Акежан', 'Вербов Ярослав'), (0.18, 'Айтимов Акежан', 'Потяшин Иван (2019)'), (0.19, 'Вербов Ярослав', 'Потяшин Иван (2019)'), (0.31, 'Вербов Ярослав', 'Шевляков Антон (2020)'), (0.33, 'Айтимов Акежан', 'Шевляков Антон (2020)')]
Similar solutions for problem 151:
[(0.37, 'Бодрова Марина', 'Морозов Константин (2018)')]
Similar solutions for problem 158:
[(0.4, 'Морозов Артемий', 'Назарова Анна')]
Similar solutions for problem 159:
[(0.23, 'Вербов Ярослав', 'Пчелинцев Святослав (2019)')]
Similar solutions for problem 162:
[(0.37, 'Бодрова Марина', 'Реутский Даниил (2018)')]
Similar solutions for problem 178:
[(0.31, 'Онучин Артем', 'Пилькевич Антон (2019)')]
Similar solutions for problem 187:
[(0.38, 'Панов Никита', 'Демина Елизавета (2020)')]
Similar solutions for problem 224:
[(0.38, 'Матвеев Илья', 'Хузин Тимур'), (0.38, 'Хузин Тимур', 'Литвинова Екатерина (2019)')]
Similar solutions for prob

In [8]:
# Solution body to check on its own; the commented-out find_plagiarism_single
# call in the next cell passes it as solution_text.
solo = r'''\begin{itemize}\end{itemize}'''

In [9]:
# find_plagiarism_single(
#     problem_id=316,
#     solution_text=solo,
#     threshold=0.45,
#     quality=9,
#     check_archives=True
# )

In [12]:
# store_cache(10)

In [11]:
# len(compressed_length_cache.keys())