# Wikipedia Corpus

Corpus from: https://dumps.wikimedia.org/dewiki/20200820/

Sentences for comparison from: https://github.com/t-systems-on-site-services-gmbh/german-wikipedia-text-corpus

In [None]:
#imports
from xml.etree.ElementTree import *
import xml.etree.ElementTree as ET
from collections import Counter
import os
import pprint
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LdaMulticore
import nltk
from nltk.corpus import stopwords
from smart_open import open 
import spacy
import de_core_news_md
import pickle
import numpy as np

from ipywidgets import FileUpload
from IPython.display import display
from IPython.core.display import display, HTML


from functions import *

### Global Variables

In [None]:
# Path to the Wikipedia XML dump that the cells below parse.
xml_file = "data/dewiki-20200820-pages-articles-multistream.xml"

# Upper bound on the number of documents taken from the dump; it is passed to
# get_titles / build_dictionary and checked by the corpus iterators.
num_documents = 200

# Minimum similarity score (compared with `>=` in the results cells) at which
# a corpus document is reported as possible plagiarism.
sim_threshold = 0.3

## Preprocessing

To be able to return the title of a given article later on, we need to store those in a dictionary:

In [None]:
# Mapping used by the results cells to look up an article title by document id
# (via title_ids.get(doc_id)).
# get_titles is not defined in this notebook - presumably it comes from
# `from functions import *`; confirm its exact return type there.
title_ids = get_titles(xml_file, num_documents)

In [None]:
# Save the id -> title mapping so it can be reloaded without re-parsing the dump.
# The context manager closes the file even if pickling fails part-way.
with open("data/title_ids200.pickle", "wb") as pickle_out:
    pickle.dump(title_ids, pickle_out)

In [None]:
# Load the id -> title mapping from disk.
# NOTE: unpickling can execute arbitrary code - only load files that were
# written by this notebook.
with open("data/title_ids200.pickle", "rb") as pickle_in:
    title_ids = pickle.load(pickle_in)

## Build the corpus

Create a corpus from the text contents of the XML file.

1. Corpus is defined as a class object, so it can be called when needed.
2. Loops through the XML-file, searching for closing "text" tags.
3. Returns the text contents from these nodes in preprocessed form.
4. Then clears the current node from memory

Initialize the corpus without loading it into memory. This step is not needed when working with the smaller corpus.

The whole corpus is too big for this experiment and takes too long to parse through. For our proof-of-concept approach we therefore propose a function which only loops through the first i documents (text nodes) in the XML tree:

In [None]:
# Streaming corpus limited to the first `num_documents` documents of the dump:
class MyCorpus_small:
    """Yield one bag-of-words vector per document, parsing the XML file lazily.

    Each call to ``__iter__`` starts a fresh pass over ``xml_file`` and stops
    once ``num_documents`` documents have been produced. Relies on the
    notebook globals ``xml_file``, ``num_documents``, ``dictionary`` and
    ``preprocess_text``.
    """

    def __iter__(self):
        produced = 0
        for event, elem in ET.iterparse(xml_file, events=("start", "end")):
            # enough documents produced -> stop parsing
            if produced >= num_documents:
                break
            # a document is complete only when its <text> element closes
            if event != 'end' or "text" not in elem.tag:
                continue
            # convert the preprocessed text to a bag-of-words vector
            yield dictionary.doc2bow(preprocess_text(elem.text))
            produced += 1
            # drop the node's content once it has been consumed
            elem.clear()

Initialize the smaller corpus, again without loading it into memory:

In [None]:
# Creating the object does not parse anything yet; the XML file is only read
# when the corpus is iterated.
corpus_small = MyCorpus_small()

---

## Build the Dictionary

To further work with the corpus in vector form, we need to build a dictionary. 

This function needs to be called only once, since the dictionary it creates can be saved to disk and loaded again for future use.

__DO NOT RUN THE FOLLOWING CODE IF THE DICTIONARY CAN BE LOADED FROM A FILE__

In [None]:
%%time
# build the dictionary:
# build_dictionary is not defined in this notebook - presumably imported via
# `from functions import *`; it receives the dump path and the document limit.
dictionary = build_dictionary(xml_file, num_documents)

In [None]:
%%time
# remove tokens with a document frequency of 1, i.e. tokens that occur in only
# one document (dictionary.dfs maps token id -> document frequency)
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once_ids)
# remove gaps in the id sequence left behind by the removed tokens
dictionary.compactify()

In [None]:
# save the filtered dictionary so the build step above can be skipped next time
dictionary.save('data/wiki_200_new.dict')

__CONTINUE HERE TO LOAD THE DICTIONARY__

In [None]:
# load the dictionary written by the save cell above
dictionary = Dictionary.load('data/wiki_200_new.dict')

In [None]:
# check that the dictionary has been loaded by printing it
print(dictionary)

---

## Similarity with LDA (Latent Dirichlet Allocation)

### Train the LDA model

Parameters:
* corpus: the corpus
* num_topics: topics to be extracted from the training corpus
* id2word: id to word mapping, the dictionary
* workers: number of cpu cores used

The trained model can be stored and loaded, just like the dictionary before.

In [None]:
%%time
# Train the LDA model on the streamed corpus with 300 topics, using
# `dictionary` as the id -> word mapping.
# `workers` is not passed here, so the library default applies.
lda = LdaMulticore(corpus_small, num_topics=300, id2word=dictionary)

First experiments have shown that a topic number of 10 (default) is too low. 100 resulted in better distinction between the different articles.
__Further fine tuning needed here__

In [None]:
# save the trained model so training can be skipped on later runs
lda.save("data/lda_model_200_t300.txt")

In [None]:
# load the trained model written by the save cell above
lda = LdaModel.load("data/lda_model_200_t300.txt")

Index the corpus with the trained model:

In [None]:
%%time
corpus_index = similarities.MatrixSimilarity(list(lda[corpus_small]), num_features=len(dictionary))

In [None]:
# Save the similarity index.
# The context manager closes the file even if pickling fails part-way.
with open("data/lda_index_200_t300.pickle", "wb") as pickle_out:
    pickle.dump(corpus_index, pickle_out)

In [None]:
# Load the similarity index from disk.
# NOTE: unpickling can execute arbitrary code - only load files that were
# written by this notebook.
with open("data/lda_index_200_t300.pickle", "rb") as pickle_in:
    corpus_index = pickle.load(pickle_in)

## Similarity Check

Now that we have an LDA model and an index, we can check the similarity of an input document against all documents in our corpus.

In [None]:
# Read the document that is checked for similarity against the corpus.
# The context manager closes the file once its contents have been read.
with open('beispieltexte/wikibeispiele.txt', encoding='utf-8') as document_file:
    test_document = document_file.read()

In [None]:
# show the raw text of the document under test
print(test_document)

In [None]:
# transform the preprocessed document to a bag-of-words vector,
# using the same dictionary the corpus was vectorised with
test_vec = dictionary.doc2bow(preprocess_text(test_document))
# convert the bag-of-words vector to LDA (topic) space
test_vec_lda = lda[test_vec]

In [None]:
# query the index with the document's LDA vector; the results cells iterate
# `sims` with enumerate() and use the position as the document id
sims = corpus_index[test_vec_lda]

## Results

In [None]:
# Report every corpus document whose similarity reaches the threshold,
# skipping the "Liste von Autoren" pages, and remember the titles of the hits.
hits = 0
hit_title = []
for doc_id, score in enumerate(sims):
    # below the threshold: not a hit (the title is not looked up in this case)
    if not score >= sim_threshold:
        continue
    title = title_ids.get(doc_id)
    if "Liste von Autoren" in title:
        continue
    hits += 1
    hit_title.append(title)
    print("Similarity Score: ", score, "\n", "Document ID:", doc_id, "\n", "Title:", title, "\n", "------------------------------------")
print(hits, "cases of possible plagiarism detected.")

In [None]:
# Document id -> similarity score for every hit, using the same filter as the
# report above (threshold reached, not a "Liste von Autoren" page).
hit_ids = {
    doc_id: score
    for doc_id, score in enumerate(sims)
    if score >= sim_threshold and "Liste von Autoren" not in title_ids.get(doc_id)
}
hit_ids

## Sentence Similarity

The next step would be to define all documents that were found to have a specific similarity score as a new corpus. Then we can check the similarity score for each sentence from the input document in relation to the sentences from the "new" corpus.

### Build new dictionary

In [None]:
%%time
index = 0
first_elem = True
# loop through all nodes
for event, elem in ET.iterparse(xml_file, events = ("start", "end")):        
    if index < num_documents:
        # check if current node contains a document
        if event == "end" and "text" in elem.tag:
            if index in hit_ids.keys():
                # preprocess the text
                text = preprocess_text(elem.text)
                # if this is the first document found, create a new dictionary with it
                if first_elem:
                    dictionary_hits = Dictionary([text])
                    first_elem = False
                    index += 1
                # all documents after the first one get appended to the dictionary
                else:
                    dictionary_hits.add_documents([text])
                    index += 1
                # clear the node
                elem.clear()
                
            else:
                index += 1
                elem.clear()
    else:
        break

In [None]:
# number of entries in the hit dictionary
len(dictionary_hits)

In [None]:
# Streaming corpus containing only the documents whose id is in `hit_ids`:
class MyCorpus_small_hits:
    """Yield bag-of-words vectors for the hit documents, in document order.

    Each call to ``__iter__`` starts a fresh pass over ``xml_file``. Documents
    are numbered by the order in which their <text> elements close, and only
    those whose number is a key of ``hit_ids`` are yielded. Relies on the
    notebook globals ``xml_file``, ``num_documents``, ``hit_ids``,
    ``dictionary_hits`` and ``preprocess_text``.
    """

    def __iter__(self):
        index = 0
        for event, elem in ET.iterparse(xml_file, events=("start", "end")):
            if index >= num_documents:
                break
            # Only a closing <text> element marks a complete document. Other
            # events must not advance `index`, otherwise it stops matching
            # the document ids stored in `hit_ids`.
            if event != "end" or "text" not in elem.tag:
                continue
            if index in hit_ids:
                # transform the hit document to a bag-of-words vector
                yield dictionary_hits.doc2bow(preprocess_text(elem.text))
            index += 1
            # clear the node
            elem.clear()

In [None]:
# Creating the object does not parse anything yet; the XML file is only read
# when the corpus is iterated.
corpus_small_hits = MyCorpus_small_hits()

In [None]:
%%time
# Train a second LDA model on the hit documents only, with 300 topics and
# the hit dictionary as the id -> word mapping.
hit_lda = LdaMulticore(corpus_small_hits, num_topics=300, id2word=dictionary_hits)

In [None]:
# print the trained hit model as a quick check
print(hit_lda)

In [None]:
%%time
corpus_hit_index = similarities.MatrixSimilarity(list(hit_lda[corpus_small_hits]), num_features=len(dictionary_hits))

In [None]:
# print the hit index as a quick check
print(corpus_hit_index)

In [None]:
# use nltk tokenize to slice sentences
from nltk import tokenize

# Split the test document into sentences once and derive both lists from that
# single result, so raw and preprocessed sentences line up index by index.
# NOTE(review): sent_tokenize runs with its default language setting here,
# while the corpus is German - confirm whether language="german" is intended.
test_doc_raw_sentence = [str(sentence) for sentence in tokenize.sent_tokenize(test_document)]

# preprocessed form of each sentence, used for the similarity queries
test_doc_raw_slice = [preprocess_text(sentence) for sentence in test_doc_raw_sentence]

In [None]:
# Score every sentence of the test document against the hit corpus:
# bag-of-words vector -> LDA space -> similarities to all hit documents.
# Written as a single expression so the document-level `test_vec` and
# `test_vec_lda` from the similarity check are not overwritten with
# per-sentence values.
sim_hits = [
    corpus_hit_index[hit_lda[dictionary_hits.doc2bow(sentence)]]
    for sentence in test_doc_raw_slice
]

In [None]:
# For each sentence, find the best-matching hit document and print a report
# when the best score is above 0.80.
for sentence_no, scores in enumerate(sim_hits):
    best = np.argmax(scores)
    title = hit_title[best]
    best_score = scores[best]

    if best_score > 0.80:
        print(test_doc_raw_sentence[sentence_no])
        print("aus Dokument: ", title)
        print("Übereinstimmung: ", best_score)
        print("  ")
        print("Mehr Infos:")
        # compact the array's string form before printing all scores
        print(str(scores).replace("         ", " ").replace("        ", ""))
        print("max: ", best_score, "position: ", best)
        print("----------------------------------------------")

In [None]:
# creates result tags for html output
from html import escape as _escape_html
from urllib.parse import quote as _quote_url


def _cr_level(score):
    """Return the CSS class for a sentence's best similarity score.

    Every score is mapped to exactly one class; anything below 0.70 is
    "zero" (no highlighting), so no score range is left without a class.
    """
    if score >= 0.99:
        return "high"
    if score >= 0.90:
        return "higher"
    if score >= 0.80:
        return "medium"
    if score >= 0.70:
        return "low"
    return "zero"


# not used in the visible notebook; kept so the name stays defined
hit_vis = []
html_parts = []
for sentence_no, scores in enumerate(sim_hits):
    best = np.argmax(scores)
    cr_level = _cr_level(scores[best])
    # the sentence comes from the input document, so escape it before
    # embedding it in markup
    sentence_html = _escape_html(test_doc_raw_sentence[sentence_no])
    if cr_level == "zero":
        html_parts.append(" <t class='" + cr_level + "'>" + sentence_html + "</t> ")
    else:
        title = hit_title[best]
        # percent-encode the title for the URL, HTML-escape it for display
        link = "https://de.wikipedia.org/wiki/" + _quote_url(title)
        html_parts.append(
            " <t class='" + cr_level + "'>" + sentence_html
            + "<b> <a href='" + link + "'>" + _escape_html(title) + "</a></b></t>"
        )
# join once instead of growing the string inside the loop
hit_result_html = "".join(html_parts)

In [None]:
# html output of all results: a stylesheet with one background colour per
# highlight class, followed by the generated sentence markup
result_page = """
<style>
.high {background-color: #F8E0E0;}
.higher {background-color: #F8ECE0;}
.medium {background-color: #F7F8E0;}
.low {background-color: #E0F8E0;}
.zero {background-color: white;}
</style>

  """ + hit_result_html
display(HTML(result_page))