## Jupyter Notebook Collusion Detector

&copy; Jeremy Ellman 28/02/2022 (v1). MIT License


In [None]:
# Standard imports
import os
import re
import csv
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Use jupyter nbconvert to convert the notebooks (.ipynb) to Python scripts (.py). Markdown cells are also saved, as comments.
#
!jupyter nbconvert --to script --no-prompt /home/notebookuser/working/Colludo/Sources/*.ipynb

In [66]:
# Set global parameters
#
# source_path is the single source of truth for the working directory; the
# output file is derived from it so that changing the directory cannot leave
# the report pointing at the old location.
# NOTE: the nbconvert command in the cell above hard-codes the same directory;
# keep the two in sync.
verbose = False  # when True, print progress details (including full file names)
similarity_threshold = 0.8  # Value depends on amount of standard code used -- vary up if more standard code is used.
source_path = '/home/notebookuser/working/Colludo/Sources/'
output_file = os.path.join(source_path, 'copies.csv')
file_type = ".py"

In [67]:
# Collect the list of code files to analyse.
#
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the file order (and therefore which file of a pair is reported as
# "first") reproducible between runs. os.path.join() is used so the result is
# correct whether or not source_path ends with a path separator.
code_files = [
    os.path.join(source_path, filename)
    for filename in sorted(os.listdir(source_path))
    if filename.endswith(file_type)
]

if verbose:
    print(f'There are {len(code_files)} files to process')

In [68]:
def remove_irrelevant(line):
    """Strip content that should not count towards similarity from one line.

    Removes, in order: anything from the first '#' to the end of the line,
    surrounding whitespace, whole ``import ...`` / ``from ...`` statements, and
    whole lines starting with ``get_ipython`` (how nbconvert renders magics).

    Parameters
    ----------
    line : str
        A single line of source code (without its line terminator).

    Returns
    -------
    str
        The cleaned line; an empty string if nothing relevant remains.
    """
    # Strip comments first, then trim, so no trailing blanks are left behind
    # where a comment used to be.
    # NOTE: this is a plain regex, so a '#' inside a string literal is also
    # treated as the start of a comment.
    line = re.sub(r"#.*$", "", line).strip()
    # The \b ensures only the *keywords* match: identifiers that merely start
    # with the same letters (e.g. "important = 5", "from_date = x",
    # "import_data(df)") are real code and must be kept.
    line = re.sub(r"^(?:import|from)\b.*", "", line)
    line = re.sub(r"^get_ipython.*", "", line)     # strip cell-magic
    return line

def preprocess(filename):
    """Read a source file and return its relevant content as a single string.

    Every line is passed through ``remove_irrelevant``; lines that end up empty
    are dropped and the survivors are concatenated, each preceded by one space
    (so a non-empty result starts with a space).

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    str
        The concatenated cleaned lines, or "" if nothing relevant remains.
    """
    # Read as UTF-8 explicitly rather than relying on the platform's locale
    # default. errors="replace" keeps one badly encoded submission from
    # aborting the whole batch.
    with open(filename, encoding="utf-8", errors="replace") as f:
        cleaned = (remove_irrelevant(line) for line in f.read().splitlines())
        # only retain non-blank lines after comment stripping
        return "".join(" " + line for line in cleaned if line != "")

# One document per code file: the file's cleaned content as a single string,
# in the same order as code_files.
#
documents = [preprocess(path) for path in code_files]

In [69]:
# Create tf/idf vectors of the documents using sklearn and compute their similarity
# (Thanks to https://stackoverflow.com/questions/8897593/how-to-compute-the-similarity-between-two-text-documents)

tfidf = TfidfVectorizer().fit_transform(documents)

# TfidfVectorizer L2-normalises each row by default, so the product of the
# matrix with its own transpose is the pairwise cosine similarity.
# '@' is used because it always means matrix multiplication, whereas '*' is
# only a matrix product for the legacy scipy.sparse matrix classes.
pairwise_similarity = tfidf @ tfidf.T
results_array = pairwise_similarity.toarray()  # dense square similarity matrix
np.fill_diagonal(results_array, np.nan)  # mask the 1's, which represent the similarity of each document to itself

In [70]:

# Global record of file similarities: {first_file: [second_file, similarity_score]}
copies_dict = dict()

# Work on a copy so that re-running this cell starts from the untouched
# similarity matrix rather than one already masked by a previous run.
remaining = results_array.copy()

cheating_detected = True

# Here we report on pair wise similarities. Since groups often collude we repeat the process
# until a full pass records nothing new.
# NOTE(review): copies_dict is keyed by the first file, so each file is recorded
# as "first" at most once. For groups of four or more mutually similar files,
# not every pair can therefore be reported. Reporting all pairs would need a
# pair-keyed structure (and a matching change to the CSV cell).
#
while cheating_detected:
    cheating_detected = False
    for i, first in enumerate(code_files):
        row = remaining[i]
        # Skip files already recorded. Also skip rows with no candidates left:
        # np.nanargmax raises ValueError on an all-NaN row, which happens once
        # every partner of a file has been paired off (or with a single file).
        if first in copies_dict or np.isnan(row).all():
            continue
        max_sim = np.nanargmax(row)
        similarity_score = row[max_sim]
        if similarity_score > similarity_threshold:
            second = code_files[max_sim]
            copies_dict[first] = [second, similarity_score]
            if verbose:
                print(f"File: {first}\n max_sim: {second}, Score: {similarity_score:.2f} ")
            remaining[i, max_sim] = np.nan  # don't re-find this pair
            remaining[max_sim, i] = np.nan
            cheating_detected = True

In [71]:
## Write the output as csv file
#
# newline='' is required by the csv module: without it the writer's own line
# endings get translated again on some platforms, producing blank rows.
with open(output_file, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["First File", "Second File", "Similarity Score"])
    for key, value in copies_dict.items():
        # Report bare file names rather than full paths.
        src = os.path.basename(key)
        dest = os.path.basename(value[0])
        score = "{:.2f}".format(value[1])
        writer.writerow([src, dest, score])

print('Done!')

Done!
