# Plagiarism Detection Notebook
## Notebook for the "Textmining" project in WS2020/2021

Sources used for code: 

* https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html

* https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html#corpus-streaming-tutorial

* https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#12buildingthetopicmodel

In [1]:
# imports
import os
import pprint
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
import nltk
from nltk.corpus import stopwords
from smart_open import open 



---

# First experiments

In [3]:
# some example documents. For the actual application we wouldn't load everything at once.

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

## Preprocessing

Prepocess the text

In [1]:
# simple function for text preprocessing. Converts the text to lower case, removes stopwords and words with a minimum length of 2 and maximum length of 15
def preprocessing (corpus):
    
    processed_corpus = []
    
    # load stopwords from NLTK
    stop_words = set(stopwords.words('english'))
    
    # go through each document in the corpus
    for document in corpus:
        
        # step1: convert to lowercase and remove words that do not match the min-max-length
        step1 = gensim.utils.simple_preprocess(document, deacc=False, min_len=2, max_len=15)
        
        #step2: remove stopwords
        step2 = [word for word in step1 if word not in stop_words]
        
        processed_corpus.append(step2)
    return processed_corpus


In [5]:
processed_corpus = preprocessing(documents)
pprint.pprint(processed_corpus)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]


Maybe add a filter for min occurence?

In [8]:
# convert text to vectors using the dictionary function

# define dictionary of our corpus. Contains the word frequency of each token in the whole corpus
dictionary = corpora.Dictionary(processed_corpus)

# transform the corpus to vectors. Each vector consists of a token ID and the token frequency (taken from the dictionary)
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

print(dictionary)
print(dictionary.token2id)
pprint.pprint(bow_corpus)


Dictionary(35 unique tokens: ['abc', 'applications', 'computer', 'human', 'interface']...)
{'abc': 0, 'applications': 1, 'computer': 2, 'human': 3, 'interface': 4, 'lab': 5, 'machine': 6, 'opinion': 7, 'response': 8, 'survey': 9, 'system': 10, 'time': 11, 'user': 12, 'eps': 13, 'management': 14, 'engineering': 15, 'testing': 16, 'error': 17, 'measurement': 18, 'perceived': 19, 'relation': 20, 'binary': 21, 'generation': 22, 'random': 23, 'trees': 24, 'unordered': 25, 'graph': 26, 'intersection': 27, 'paths': 28, 'iv': 29, 'minors': 30, 'ordering': 31, 'quasi': 32, 'well': 33, 'widths': 34}
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(2, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(4, 1), (10, 1), (12, 1), (13, 1), (14, 1)],
 [(3, 1), (10, 2), (13, 1), (15, 1), (16, 1)],
 [(8, 1), (11, 1), (12, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(24, 1), (26, 1), (27, 1), (28, 1)],
 [(24, 1), (26, 1), (29, 1), (30, 1)

Create a TF-IDF model of the corpus:

In [72]:
# train the model
tfidf = models.TfidfModel(bow_corpus)

# transform the corpus
corpus_tfidf = tfidf[bow_corpus]


In [75]:
# every word is now represented by a vector: Token-ID and token-weight
for doc in corpus_tfidf:
    pprint.pprint(doc)

[(0, 0.4301019571350565),
 (1, 0.4301019571350565),
 (2, 0.2944198962221451),
 (3, 0.2944198962221451),
 (4, 0.2944198962221451),
 (5, 0.4301019571350565),
 (6, 0.4301019571350565)]
[(2, 0.3726494271826947),
 (7, 0.5443832091958983),
 (8, 0.3726494271826947),
 (9, 0.3726494271826947),
 (10, 0.27219160459794917),
 (11, 0.3726494271826947),
 (12, 0.27219160459794917)]
[(4, 0.438482464916089),
 (10, 0.32027755044706185),
 (12, 0.32027755044706185),
 (13, 0.438482464916089),
 (14, 0.6405551008941237)]
[(3, 0.3449874408519962),
 (10, 0.5039733231394895),
 (13, 0.3449874408519962),
 (15, 0.5039733231394895),
 (16, 0.5039733231394895)]
[(8, 0.30055933182961736),
 (11, 0.30055933182961736),
 (12, 0.21953536176370683),
 (17, 0.43907072352741366),
 (18, 0.43907072352741366),
 (19, 0.43907072352741366),
 (20, 0.43907072352741366)]
[(21, 0.48507125007266594),
 (22, 0.48507125007266594),
 (23, 0.48507125007266594),
 (24, 0.24253562503633297),
 (25, 0.48507125007266594)]
[(24, 0.31622776601683794),


## Compare Texts

In [76]:
# build an index for the corpus
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

In [79]:
# add a query document
query_document = 'system engineering'.split()

# transform the query document to a vector
query_bow = dictionary.doc2bow(query_document)

# compare the query document to each document in the corpus
sims = index[tfidf[query_bow]]
print(list(enumerate(sims)))

[(0, 0.0), (1, 0.12172779), (2, 0.14323246), (3, 0.67615116), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [78]:
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

3 0.67615116
2 0.14323246
1 0.12172779
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0


---

# Corpus Streaming

Since we do not want to load the whole corpus into memory every time, we need some changes in our approach, so we are able to stream the single documents when needed.
Code is from: https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html#corpus-streaming-tutorial

## Preprocessing

In [11]:
# Set the preprocessing file
preprocessing_file = 'data/corpus/deu_mixed-typical_2011_10K/deu_mixed-typical_2011_10K-sentences.txt'
# preprocessing_file = 'data/corpus/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences.txt'

In [32]:
# Remove IDs from the corpus, since they do not belong in the content of the documents.
temp_input_file = open(preprocessing_file, 'r', encoding = "utf-8")
output_file = open(os.path.splitext(preprocessing_file)[0]+"_only.txt", 'w', encoding = "utf-8")
for line in temp_input_file.readlines():
    output_file.write(" ".join(line.split()[1:])+"\n")

In [39]:
# Try do create an index
temp_input_file = open(preprocessing_file, 'r', encoding = "utf-8")
index_dic = {}
for line in temp_input_file.readlines():
    index_dic[int(line.split()[0])] = str(line.split()[1:])

In [15]:
# Set the input file for further preprocessing
input_file = 'data/corpus/deu_mixed-typical_2011_10K/deu_mixed-typical_2011_10K-sentences_only.txt'
# input_file = 'data/corpus/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences_only.txt'

In [16]:
# Save corpus as a list, so we can print out the document at the end. Not ideal for performance, but currently no other solution.
corpus_list = []
for line in open(input_file, encoding = "utf-8"):
    corpus_list.append(line)

In [17]:
# load stopwords from NLTK
stop_words = set(stopwords.words('german'))

# load all tokens as lowercase text
dictionary = corpora.Dictionary(line.lower().split() for line in open(input_file, encoding="utf-8"))

# find all stop words
stop_ids = [dictionary.token2id[stopword] for stopword in stop_words if stopword in dictionary.token2id]

# find all words that only occur once
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]


# Remove all stopwords and words that only occur once
dictionary.filter_tokens(stop_ids + once_ids)

# Since we filtered out some words, the ID count now has gaps. With .compactify we can remove those gaps
dictionary.compactify()


In [18]:
# Define the corpus as an object

class MyCorpus:
    def __iter__(self):
        # Each line in the corpus file represents one document, each token in a document is seperated by a whitespace
        for line in open(input_file):
            # Transfom the corpus to vectors
            yield dictionary.doc2bow(line.lower().split())

In [19]:
# Initialize the corpus, without loading it into memory
corpus = MyCorpus()

Note: The corpus now exists only as an object, to work with it, the vectors inside have to be called. Only then they will be loaded into memory.
Example: Print the first 10 document vectors:

In [20]:
for index, vector in enumerate(corpus):
    if index <10:
        print(vector)

[(0, 1), (1, 1)]
[(2, 1), (3, 1)]
[(3, 1), (4, 1), (5, 1)]
[(7, 1), (8, 1)]
[(3, 1), (9, 1)]
[(3, 1)]
[(3, 1), (10, 1), (11, 1)]
[(12, 1), (13, 1)]
[(14, 1), (15, 1), (16, 1)]
[(3, 1), (9, 1), (17, 1), (18, 1)]


## Work with our Corpus

Now that we are able to load our corpus memory friendly we can transform the document vectors using a variety of functions.

First we have to initialize a model, which will be used for the transformations:

In [21]:
# TF-IDF Model:
tfidf_model = models.TfidfModel(corpus, normalize=False)

In [22]:
# LSI (Latent Semantic Indexing) Model:
tfidf_corpus = tfidf_model[corpus]
lsi_model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=10)

In [23]:
# LDA Model:
lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=10)

In [24]:
# lda_model.print_topics(num_topics=10, num_words=10)

Index the corpus:

In [25]:
index = similarities.SparseMatrixSimilarity(tfidf_model[corpus], num_features=len(dictionary))

Create an example which should be queried against the corpus:

In [26]:
# Example Sentence:
example = "Die bisherigen Bau-Planungen bezüglich des Klinikums sind sehr verändert."

# Transform example to vector
example_vec = dictionary.doc2bow(example.lower().split())

# Convert it to our current model space
example_lda = tfidf_model[example_vec]

In [27]:
print(example_vec)

[(954, 1), (1241, 1), (1730, 1)]


Query the example:

In [28]:
sim = index[example_vec]

Output the result:

In [41]:
for document_number, score in sorted(enumerate(sim), key=lambda x: x[1], reverse=True):
    if score != 0.0:
        print(index_dic[document_number], score)

['Die', 'Biografie', 'eines', 'Songs".'] 0.40824828
['Eine', 'Übersicht', 'der', 'Kindertageseinrichtungen', 'in', 'Rodgau.'] 0.40824828
['Bei', 'den', 'beteiligten', 'Fahrzeugen', 'wurden', 'jeweils', 'die', 'Außenspiegel', 'beschädigt.'] 0.25779927


In [31]:
for document_number, score in sorted(enumerate(sim), key=lambda x: x[1], reverse=True):
    if score != 0.0:
        print(corpus_list[document_number], score)

Die bisherigen PCMs brachten es nur auf 128 MBit.
 0.40824828
Eine umfassende Chronik der bisherigen Ereignisse!
 0.40824828
Bei den bisherigen WM-Auktionen wurden insgesamt 170 000 Euro erzielt.
 0.25779927


Different results for index_dic vs document list. document_number from gensim is not the same as the index number?

---
---

# Wikipedia Corpus

Corpus from: https://dumps.wikimedia.org/dewiki/20200820/

Sentences for comparison from: https://github.com/t-systems-on-site-services-gmbh/german-wikipedia-text-corpus

In [12]:
#imports
from xml.etree.ElementTree import *
import xml.etree.ElementTree as ET
from collections import Counter
import os
import pprint
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LdaMulticore
import nltk
from nltk.corpus import stopwords
from smart_open import open 
import spacy
import de_core_news_sm

## Preprocessing:

In [2]:
# load the language model from spacy
spacy_data = de_core_news_sm.load()

def preprocess_text(text):
    # load and tokenize text
    prep_text = spacy_data(text)
    # list for tokens
    prep_tokens = []
    # for every token in text
    for token in prep_text:
        # remove stopwords and punctuatiuon
        if token.pos_ != 'PUNCT' and token.is_stop == False:
            # lemmatize and transform to lowercase
            lemma_token = token.lemma_.lower()
            # remove non-alphabetic tokens
            if lemma_token.isalpha() or lemma_token == '-PRON-':
                prep_tokens.append(lemma_token)
    # return preprocessed text 
    return prep_tokens

## Build the corpus

Create a corpus from the text contents of the XML file:

First test:

Print text from \<text>:

In [3]:
xml_file = "data/wiki_corpus/dewiki-20200820-pages-articles-multistream.xml"

In [57]:
index = 0
for event, elem in ET.iterparse(xml_file, events = ("start", "end")):
    if index < 3:
        if event == 'end' and "text" in elem.tag:
            prep_tokens = preprocess_text(elem.text)
            print(prep_tokens)
            elem.clear()
            index += 1    
    else:
        break       


['alan', 'smithee', 'stehen', 'pseudonym', 'fiktiv', 'regisseur', 'film', 'verantworten', 'eigentlich', 'regisseur', 'name', 'werk', 'verbindung', 'bringen', 'directors', 'guild', 'of', 'america', 'dga', 'situation', 'empfehlen', 'seither', 'thoma', 'angeles', 'name', 'of', 'director', 'smithee', 'what', 'it', 'used', 'to', 'be', 'zuletzt', 'prüfen', 'april', 'alan', 'smithee', 'weiterhin', 'gebrauch', 'alternative', 'schreibweise', 'ursprungsvariante', 'smithee', 'alan', 'smithee', 'teilweise', 'asiatisch', 'anmutend', 'schreibweise', 'alan', 'smi', 'thee', 'sumishii', 'aran', 'gehören', 'internet', 'movie', 'database', 'name', 'eintrag', 'alan', 'smithee', 'geschichte', 'entstehung', 'pseudonym', 'entstehen', 'infolge', 'arbeit', 'death', 'of', 'gunfighter', 'deutsch', 'titel', 'frank', 'patch', 'stunde', 'zählen', 'regisseur', 'robert', 'totten', 'hauptdarsteller', 'richard', 'widmark', 'geraten', 'streit', 'woraufhin', 'don', 'siegel', 'neu', 'regisseur', 'einsetzen', 'film', 'trag

Now define the corpus:

In [23]:
# Define the corpus as an object
class MyCorpus:
    def __iter__(self):
        # define the XML tree
        for event, elem in ET.iterparse(xml_file, events = ("start", "end")):            
            # Each document is represented as an object between <text> tags in the xml file
            if event == 'end' and "text" in elem.tag:
                # Transfom the corpus to vectors
                yield dictionary.doc2bow(preprocess_text(elem.text))
                elem.clear()

In [71]:
# Define a smaller corpus, containing only the first 200 documents:
class MyCorpus_small:
    def __iter__(self):
        index = 0
        # define the XML tree
        for event, elem in ET.iterparse(xml_file, events = ("start", "end")):
            if index < 200:
                # Each document is represented as an object between <text> tags in the xml file
                if event == 'end' and "text" in elem.tag:
                    # Transfom the corpus to vectors
                    yield dictionary.doc2bow(preprocess_text(elem.text))
                    index+=1
                    elem.clear()
                else:
                    break

In [24]:
# Initialize the corpus, without loading it into memory
corpus = MyCorpus()

In [72]:
corpus_small = MyCorpus_small()

---

### Some testing

### Somehow we have to save which text matches which ID, so we can later on return the text and not only it's vector representation. To match every text with an id and store it would render the following approaches, to stream the data instead of storing it in memory, obsolete/useless. To counter that i propose we create a dictionary which only contains the title of every article, instead of the full text.
Currently WIP

In [7]:
title_ids = {}
index = 0
for event, elem in ET.iterparse(xml_file, events = ("start", "end")):        
    if index < 200:
        if event == 'end' and "title" in elem.tag:
            title_ids[index]=str(elem.text)
            index += 1    
            elem.clear()
    else:
        break 

In [8]:
print(title_ids)

{0: 'Alan Smithee'}


Search for a specific entry:

In [None]:
%%time
title_ids = {}
index = 0
for event, elem in ET.iterparse(xml_file, events = ("start", "end")):        
    if event == 'end' and "title" in elem.tag:
        if "British Airways i360" in elem.text:
            title_ids[index]=str(elem.text)
            print(elem.text)
            break
        elem.clear()

Get texts from the XML File

In [141]:
%%time
text_ids = {}
index = 0
for event, elem in ET.iterparse(xml_file, events = ("start", "end")):        
    if index < 200:
        if event == 'end' and "text" in elem.tag:
            text_ids[index]=str(elem.text)
            index += 1    
            elem.clear()
    else:
        break


Wall time: 47 ms


In [None]:
print(text_ids[0])

---

## Build the Dictionary

In [11]:
def build_dictionary(xml_file):
    index = 0
    first_elem = True
    for event, elem in ET.iterparse(xml_file, events = ("start", "end")):        
        if index < 200:
            if event == "end" and "text" in elem.tag:
                text = preprocess_text(elem.text)
                if first_elem:
                    dictionary = Dictionary([text])
                    first_elem = False
                    index += 1
                else:
                    dictionary.add_documents([text])
                    index += 1
                elem.clear()
        else:
            break
    return dictionary

In [12]:
%%time
# build the dictionary:
dictionary = build_dictionary(xml_file)

Wall time: 2min 7s


In [13]:
%%time
# remove words that appear only once
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify() 

Wall time: 50 ms


In [14]:
print(dictionary)

Dictionary(20309 unique tokens: ['abc', 'abkehr', 'ablehnen', 'abrufen', 'abschluss']...)


In [15]:
#save the dictionary
dictionary.save('data/wiki_200.dict')

In [6]:
#load the dictionary
dictionary = Dictionary.load('data/wiki_200.dict')

In [7]:
print(dictionary)

Dictionary(20309 unique tokens: ['abc', 'abkehr', 'ablehnen', 'abrufen', 'abschluss']...)


In [None]:
index = 0
for vector in corpus:
    if index <2:
        print(vector)
        index += 1
    else:
        print("finished")
        break

## Similarity with LDA (Latent Dirichlet Allocation)

### Train the LDA model

Parameters:
* corpus: the corpus
* num_topics: topics to be extracted from the training corpus
* id2word: id to word mapping, the dictionary
* workers: number of cpu cores used

In [69]:
%%time
lda = LdaMulticore(corpus_small, num_topics=10, id2word=dictionary)

Wall time: 17 ms


In [98]:
doc_bow = [(0, 1), (1, 1)]
print(lda[doc_bow]) 

[(0, 0.033490263), (1, 0.033490896), (2, 0.03348978), (3, 0.033489995), (4, 0.033489734), (5, 0.0334891), (6, 0.69858706), (7, 0.03348973), (8, 0.03349116), (9, 0.0334923)]


In [106]:
index = similarities.SparseMatrixSimilarity(lda[corpus_small], num_features=len(dictionary))

In [140]:
test_doc_raw = "'''Alan Smithee''' steht als [[Pseudonym]] für einen fiktiven Regisseur, der Filme verantwortet, bei denen der eigentliche [[Regisseur]] seinen Namen nicht mit dem Werk in Verbindung gebracht haben möchte."
test_vec = dictionary.doc2bow(preprocess_text(test_doc_raw))
print(test_vec)
# convert to lda space
test_vec_lda = lda[test_vec]
print(test_vec_lda)

[(8, 1), (43, 1), (65, 1), (84, 1), (85, 1), (157, 1), (182, 1), (189, 2), (217, 1), (244, 1), (245, 1), (256, 1)]
[(3, 0.93544364)]


In [114]:
sims = index[test_vec_lda]
print(sims)

[]
