# Wikipedia Corpus

Corpus from: https://dumps.wikimedia.org/dewiki/20200820/

Sentences for comparison from: https://github.com/t-systems-on-site-services-gmbh/german-wikipedia-text-corpus

In [1]:
#imports
from xml.etree.ElementTree import *
import xml.etree.ElementTree as ET
from collections import Counter
import os
import pprint
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LdaMulticore
import nltk
from nltk.corpus import stopwords
from smart_open import open 
import spacy
import de_core_news_md

### Global Variables

In [2]:
# Path of the XML dump that every parsing loop in this notebook reads
xml_file = "data/wiki_corpus/dewiki-20200820-pages-articles-multistream.xml"

# Upper bound on the number of <text> documents read by MyCorpus_small and build_dictionary
num_documents = 200

## Preprocessing

1. Load the language model from spacy
2. Function preprocess_text(text) transforms text to preprocessed tokens

In [3]:
# Load the spaCy language model once; preprocess_text() reuses it for every document
spacy_data = de_core_news_md.load()

def preprocess_text(text):
    """Turn raw text into a list of lowercased lemmas.

    The text is tokenized with the module-level spaCy pipeline
    ``spacy_data``. Punctuation and stop words are dropped, the remaining
    tokens are lemmatized and lowercased, and only purely alphabetic lemmas
    (plus the pronoun placeholder) are kept.

    Args:
        text: The document text. ``None`` or an empty string is accepted
            and produces an empty token list.

    Returns:
        list: The preprocessed tokens in document order.
    """
    # Callers pass ``elem.text`` straight through, which is None for an
    # XML node without content; return early so the pipeline is never
    # called with a non-string.
    if not text:
        return []
    prep_tokens = []
    for token in spacy_data(text):
        # remove stopwords and punctuation
        if token.pos_ == 'PUNCT' or token.is_stop:
            continue
        # lemmatize and transform to lowercase
        lemma_token = token.lemma_.lower()
        # keep alphabetic lemmas; the pronoun placeholder is compared in
        # lowercase because the lemma has just been lowercased
        if lemma_token.isalpha() or lemma_token == '-pron-':
            prep_tokens.append(lemma_token)
    return prep_tokens

For experimental purposes:

Load the texts into a list

__NOT MEMORY FRIENDLY__

In [13]:
%%time
# Collect the raw text of the first 20 <text> nodes for experiments.
texts = []
index = 0
for event, elem in ET.iterparse(xml_file, events=("start", "end")):
    # stop as soon as enough documents have been collected
    if not index < 20:
        break
    # only a closing tag of a text node carries a complete document
    is_document = event == 'end' and "text" in elem.tag
    if not is_document:
        continue
    texts.append(str(elem.text))
    index += 1
    # free the node once its text has been copied
    elem.clear()


Wall time: 4 ms


To be able to return the title of a given article later on, we need to store those in a dictionary:

In [4]:
# Map document position -> article title so that similarity hits can be
# reported by name.
title_ids = {}
index = 0
# Only "end" events are needed: an element's text is complete only once its
# closing tag has been parsed.
for event, elem in ET.iterparse(xml_file, events=("end",)):
    # Use the same document limit as the small corpus and the dictionary, so
    # every indexed document has a title entry.
    if index >= num_documents:
        break
    if "title" in elem.tag:
        title_ids[index] = str(elem.text)
        index += 1
        # free the node once its title has been copied
        elem.clear()

## Build the corpus

Create a corpus from the text contents of the XML file.

1. Corpus is defined as a class object, so it can be called when needed.
2. Loops through the XML-file, searching for closing "text" tags.
3. Returns the text contents from these nodes in preprocessed form.
4. Then clears the current node from memory

Now define the corpus:

In [5]:
# Define the corpus as an object
class MyCorpus:
    """Streamed bag-of-words corpus over every <text> node of the XML dump.

    Every iteration re-parses ``xml_file`` from the beginning, so the
    documents are never held in memory all at once. Relies on the
    module-level ``xml_file``, ``dictionary`` and ``preprocess_text``.
    """

    def __iter__(self):
        """Yield one ``dictionary.doc2bow`` vector per <text> node, in file order."""
        # Only "end" events are needed: an element's text is complete only
        # once its closing tag has been parsed.
        for _event, elem in ET.iterparse(xml_file, events=("end",)):
            # Each document is represented as an object between <text> tags in the xml file
            if "text" in elem.tag:
                # An empty <text/> node has text None; treat it as an empty
                # document so it still occupies its position in the corpus.
                yield dictionary.doc2bow(preprocess_text(elem.text or ""))
            # Drop the content of every finished element, not only the
            # <text> nodes, so parsed articles do not pile up in the tree.
            elem.clear()

The whole corpus is too big for this experiment and takes too long to parse. For our proof-of-concept approach we therefore define a second corpus class which only loops through the first `num_documents` documents (text nodes) in the XML tree:

In [5]:
# Define a smaller corpus, containing only the first i documents:
class MyCorpus_small:
    """Streamed bag-of-words corpus over the first ``num_documents`` <text> nodes.

    Every iteration re-parses ``xml_file`` from the beginning and stops once
    the limit is reached. Relies on the module-level ``xml_file``,
    ``num_documents``, ``dictionary`` and ``preprocess_text``.
    """

    def __iter__(self):
        """Yield one ``dictionary.doc2bow`` vector per <text> node, up to the limit."""
        index = 0
        # Only "end" events are needed: an element's text is complete only
        # once its closing tag has been parsed.
        for _event, elem in ET.iterparse(xml_file, events=("end",)):
            if index >= num_documents:
                break
            # Each document is represented as an object between <text> tags in the xml file
            if "text" in elem.tag:
                # An empty <text/> node has text None; treat it as an empty
                # document so it still occupies its position in the corpus.
                yield dictionary.doc2bow(preprocess_text(elem.text or ""))
                index += 1
            # Drop the content of every finished element, not only the
            # <text> nodes, so parsed articles do not pile up in the tree.
            elem.clear()

Initialize the full corpus without loading it into memory. This step is not needed when working with the smaller corpus.

In [None]:
# Full-dump corpus; constructing it parses nothing, the XML is read only on iteration
corpus = MyCorpus()

Initialize the smaller corpus, again without loading it into memory:

In [6]:
# Corpus limited to the first num_documents documents; the XML is read only on iteration
corpus_small = MyCorpus_small()

---

## Build the Dictionary

To further work with the corpus in vector form, we need to build a dictionary. 

This function needs to be called only once, since we are able to save the dictionary created by it and load it in future use.

In [41]:
def build_dictionary(xml_file):
    """Build a gensim ``Dictionary`` from the first ``num_documents`` <text> nodes.

    Each text node is preprocessed with ``preprocess_text`` and added to the
    dictionary; the node is cleared afterwards to free its content.

    Args:
        xml_file: Path of the XML dump, handed to ``ET.iterparse``.

    Returns:
        Dictionary: The token-id mapping of the documents read. It is empty
        when the file contains no text node or ``num_documents`` is 0.
    """
    # Start from an empty dictionary so a valid object is returned even when
    # no document is found.
    dictionary = Dictionary()
    index = 0
    # Only "end" events are needed: an element's text is complete only once
    # its closing tag has been parsed.
    for _event, elem in ET.iterparse(xml_file, events=("end",)):
        if index >= num_documents:
            break
        # check if current node contains a document
        if "text" in elem.tag:
            # An empty <text/> node has text None; count it as an empty document.
            dictionary.add_documents([preprocess_text(elem.text or "")])
            index += 1
            # clear the node
            elem.clear()
    return dictionary

__DO NOT RUN THE FOLLOWING CODE IF THE DICTIONARY CAN BE LOADED FROM A FILE__

In [42]:
%%time
# Build the dictionary from the dump; this binds the module-level `dictionary`
# that the corpus classes use for doc2bow
dictionary = build_dictionary(xml_file)

Wall time: 2min 12s


In [43]:
%%time
# collect the ids of tokens whose document frequency is 1 and remove them
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once_ids)
# remove gaps in the id sequence left by the removed tokens
dictionary.compactify()

Wall time: 43 ms


In [44]:
# save the filtered dictionary to disk so it does not have to be rebuilt
dictionary.save('data/wiki_200_new.dict')

__CONTINUE HERE TO LOAD THE DICTIONARY__

In [7]:
# load the previously saved dictionary from disk
dictionary = Dictionary.load('data/wiki_200_new.dict')

In [8]:
# check that the dictionary has been loaded by printing its summary
print(dictionary)

Dictionary(20308 unique tokens: ['abc', 'abkehr', 'ablehnen', 'abrufen', 'abschluss']...)


---

## Similarity with LDA (Latent Dirichlet Allocation)

### Train the LDA model

Parameters:
* corpus: the corpus
* num_topics: topics to be extracted from the training corpus
* id2word: id to word mapping, the dictionary
* workers: number of cpu cores used

The trained model can be saved and loaded, just like the dictionary before.

In [9]:
%%time
# Train the LDA model on the small corpus with 200 topics; the dictionary supplies the id -> word mapping
lda = LdaMulticore(corpus_small, num_topics=200, id2word=dictionary)

Wall time: 4min 30s


First experiments have shown that a topic number of 10 (default) is too low. 100 resulted in better distinction between the different articles.
__Further fine tuning needed here__

In [10]:
# save the trained model to disk
lda.save("data/lda_model_200_t200.txt")

In [11]:
# load the trained model from the file written by lda.save
lda = LdaModel.load("data/lda_model_200_t200.txt")

Index the corpus with the trained model:

In [12]:
%%time
# The indexed vectors are LDA topic distributions, so the feature space has
# one dimension per topic rather than one per dictionary token.
corpus_index = similarities.MatrixSimilarity(list(lda[corpus_small]), num_features=lda.num_topics)

Wall time: 2min 10s


In [31]:
# save the similarity index to disk
corpus_index.save("data/lda_index_200_t200.txt")

In [32]:
# load the index from disk; load() returns a new index object, so the result
# has to be bound to the name for the loaded index to be used
corpus_index = similarities.MatrixSimilarity.load("data/lda_index_200_t200.txt")

<gensim.similarities.docsim.MatrixSimilarity at 0x288a650fc40>

# Beispiele hier einlesen
## Similarity Check

Now that we have an LDA model and an index, we can check the similarity of an input document against all documents in our corpus.
First we have to define an input document, in this case we took a text from our corpus to see if the expected similarity of 1.0 can be computed:

In [14]:
# use the first text collected in `texts` as the query document for the similarity check
test_document = texts[0]

In [15]:
# show the raw text of the query document
print(test_document)

'''Alan Smithee''' steht als [[Pseudonym]] für einen fiktiven Regisseur, der Filme verantwortet, bei denen der eigentliche [[Regisseur]] seinen Namen nicht mit dem Werk in Verbindung gebracht haben möchte. Von 1968 bis 2000 wurde es von der [[Directors Guild of America]] (DGA) für solche Situationen empfohlen, seither ist es '''Thomas Lee'''.<ref>[[Los Angeles Times|latimes]].com: [http://articles.latimes.com/2000/jan/15/entertainment/ca-54271 ''Name of Director Smithee Isn't What It Used to Be''], zuletzt geprüft am 2. April 2011</ref> ''Alan Smithee'' ist jedoch weiterhin in Gebrauch.

Alternative Schreibweisen sind unter anderem die Ursprungsvariante ''Al'''len''' Smithee'' sowie ''Alan Sm'''y'''thee'' und ''A'''dam''' Smithee''. Auch zwei teilweise asiatisch anmutende Schreibweisen ''Alan Smi Thee'' und ''Sumishii Aran'' gehören – so die [[Internet Movie Database]] – dazu.<ref name="IMDb">[http://www.imdb.com/name/nm0000647/ Eigener Eintrag für ''Alan Smithee'' in der IMDb]</ref>



In [16]:
# preprocess the query document and convert it to a bag-of-words vector
test_vec = dictionary.doc2bow(preprocess_text(test_document))
# project the bag-of-words vector into LDA topic space
test_vec_lda = lda[test_vec]

In [17]:
# query the index: one similarity score per indexed document
sims = corpus_index[test_vec_lda]

In [18]:
# repeat the query from the previous cell and print (document position, score) pairs
sims = corpus_index[test_vec_lda]
print(list(enumerate(sims)))

[(0, 1.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.09635989), (12, 0.22857422), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.10481228), (33, 0.030247318), (34, 0.0), (35, 0.036757674), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.012771295), (40, 0.0), (41, 0.0015398836), (42, 0.0), (43, 0.0), (44, 0.0), (45, 0.028419478), (46, 0.0), (47, 0.0), (48, 0.0020706612), (49, 0.0), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0026773836), (54, 0.0), (55, 0.0), (56, 0.0), (57, 0.0), (58, 0.0), (59, 0.068634436), (60, 0.776972), (61, 0.0), (62, 0.027996365), (63, 0.030247318), (64, 0.0), (65, 0.0), (66, 0.0), (67, 0.0), (68, 0.0), (69, 0.0), (70, 0.0), (71, 0.0), (72, 0.0), (73, 0.00035092066), (74, 0.079071365), (75, 0.0), (76, 0.0), (77, 0.0), (78, 0.0), (79, 0.000617

## Results

In [25]:
# Report every indexed document whose similarity to the query reaches 0.75.
hits = 0
for doc_id, score in enumerate(sims):
    if score >= 0.75:
        hits += 1
        # look up the article title stored for this document position
        title = title_ids.get(doc_id)
        print("Similarity Score: ",score,"\n","Document ID:",doc_id,"\n","Title:", title,"\n", "------------------------------------")
print(hits, "cases of possible plagiarism detected.")

Similarity Score:  1.0 
 Document ID: 0 
 Title: Alan Smithee 
 ------------------------------------
Similarity Score:  0.776972 
 Document ID: 60 
 Title: Anthropologie 
 ------------------------------------
2 cases of possible plagiarism detected.
