In [1]:
import json
from collections import Counter

In [2]:
def get_data_ngrams(data, n_grams):
    """Collect the distinct word n-grams found in a collection of texts.

    Every item of ``data`` is split on whitespace, and each run of
    ``n_grams`` consecutive tokens inside that item is joined back together
    with single spaces. N-grams never span two items, and an item with fewer
    than ``n_grams`` tokens contributes nothing.

    Args:
        data: Iterable of strings (one text per item).
        n_grams: Number of tokens in each n-gram.

    Returns:
        A set with one space-joined string per distinct n-gram.
    """
    return {
        " ".join(tokens[start : start + n_grams])
        for tokens in (text.split() for text in data)
        for start in range(len(tokens) - n_grams + 1)
    }

In [3]:
def plagiarism_check(n_grams, training, test):
    """Return the percentage of distinct test n-grams that also occur in training.

    Both corpora are reduced to their sets of distinct word n-grams with
    ``get_data_ngrams``; the score is the share of the test set that is also
    present in the training set, so repeated n-grams are counted once.

    Args:
        n_grams: Number of tokens in each n-gram.
        training: Iterable of training texts (strings).
        test: Iterable of test texts (strings).

    Returns:
        A float between 0 and 100. ``0.0`` is returned when the test corpus
        yields no n-grams at all (for example, it is empty or every text is
        shorter than ``n_grams`` tokens).
    """
    training_grams = get_data_ngrams(training, n_grams)
    test_grams = get_data_ngrams(test, n_grams)
    if not test_grams:
        # Without this guard the division below raises ZeroDivisionError;
        # with nothing to compare, no overlap can be reported.
        return 0.0
    common = training_grams.intersection(test_grams)
    return 100 * (len(common) / len(test_grams))

### Plagiarism Results for ACL Anthology Dataset

| n            | 1    | 2  | 3    | 4    | 5    | 6    |
|--------------|------|----|------|------|------|------|
| Human        | 81.4 | 59 | 31.6 | 13.5 | 5.7  | 2.7  |
| Kevin Knight | 100  | 78 | 47.4 | 21.5 | 8.82 | 3.43 |
| WEPGen       | 100  | 82 | 52   | 24   | 9.5  | 3.68 |

### Plagiarism Results for XMLA Dataset 

| n            | 1   | 2    | 3    | 4    | 5     | 6   |
|--------------|-----|------|------|------|-------|-----|
| Human        | 86  | 71   | 44   | 21.4 | 10    | 5.1 |
| Kevin Knight | 100 | 81.6 | 58.4 | 30   | 13.72 | 6   |
| WEPGen       | 100 | 89   | 66   | 37   | 17.2  | 7.5 |


In [7]:
def load_abstracts(path):
    """Read a JSON Lines file and return the "abstract" field of each record.

    Args:
        path: Path of a text file holding one JSON object per line.

    Returns:
        A list with the value of the "abstract" key of every record, in file
        order.
    """
    abstracts = []
    # Explicit encoding so the result does not depend on the platform's
    # default locale; assumes the dataset files are UTF-8 - TODO confirm.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # A blank line is not a JSON document and would make
                # json.loads raise; skip it instead of aborting the load.
                continue
            abstracts.append(json.loads(line)["abstract"])
    return abstracts


test_data = load_abstracts("arxiv-dataset/arxiv-original.txt")
training_data = load_abstracts("arxiv-dataset/train_arxiv.txt")

# Report the n-gram overlap between test and training abstracts for n = 1..7.
for n_grams in range(1, 8):
    print("n-grams: {}, common: {}%".format(n_grams, plagiarism_check(n_grams, training_data, test_data)))

n-grams: 1, common: 86.0503047471886%
n-grams: 2, common: 70.64496229431833%
n-grams: 3, common: 43.91544687322076%
n-grams: 4, common: 21.42736417755354%
n-grams: 5, common: 10.012041558504427%
n-grams: 6, common: 5.120974530411987%
n-grams: 7, common: 3.1645912453129985%
