In [2]:
#imports
from xml.etree.ElementTree import *
import xml.etree.ElementTree as ET
from collections import Counter
import os
import pprint
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LdaMulticore
import nltk
from nltk.corpus import stopwords
from smart_open import open 
import spacy
import de_core_news_md

In [3]:
# the XML-file
xml_file = "../Textmining/dewiki-20200920-pages-articles-multistream.xml"

# number of documents to parse 
num_documents = 200

In [4]:
# load the language model from spacy
spacy_data = de_core_news_md.load()

def preprocess_text(text):
    # load and tokenize text with the spacy language model
    prep_text = spacy_data(text)
    # list for tokens
    prep_tokens = []
    # for every token in text
    for token in prep_text:
        # remove stopwords and punctuatiuon
        if token.pos_ != 'PUNCT' and token.is_stop == False:
            # lemmatize and transform to lowercase
            lemma_token = token.lemma_.lower()
            # remove non-alphabetic tokens
            if lemma_token.isalpha() or lemma_token == '-PRON-':
                prep_tokens.append(lemma_token)
    # return preprocessed text 
    return prep_tokens


In [5]:
# Define the corpus as an object
class MyCorpus:
    def __iter__(self):
        # define the XML tree
        for event, elem in ET.iterparse(xml_file, events = ("start", "end")):            
            # Each document is represented as an object between <text> tags in the xml file
            if event == 'end' and "text" in elem.tag:
                # Transfom the corpus to vectors
                yield dictionary.doc2bow(preprocess_text(elem.text))
                # clear the node
                elem.clear()

In [6]:
# Define a smaller corpus, containing only the first i documents:
class MyCorpus_small:
    def __iter__(self):
        index = 0
        # define the XML tree
        for event, elem in ET.iterparse(xml_file, events = ("start", "end")):
            if index < num_documents:
                # Each document is represented as an object between <text> tags in the xml file
                if event == 'end' and "text" in elem.tag:
                    # Transfom the corpus to vectors
                    yield dictionary.doc2bow(preprocess_text(elem.text))
                    index+=1
                    # clear the node
                    elem.clear()
            else:
                break

In [7]:
corpus = MyCorpus()

In [8]:
corpus_small = MyCorpus_small()

In [9]:
def build_dictionary(xml_file):
    index = 0
    first_elem = True
    # loop through all nodes
    for event, elem in ET.iterparse(xml_file, events = ("start", "end")):        
        if index < num_documents:
            # check if current node contains a document
            if event == "end" and "text" in elem.tag:
                # preprocess the text
                text = preprocess_text(elem.text)
                # if this is the first document found, create a new dictionary with it
                if first_elem:
                    dictionary = Dictionary([text])
                    first_elem = False
                    index += 1
                # all documents after the first one get appended to the dictionary
                else:
                    dictionary.add_documents([text])
                    index += 1
                # clear the node
                elem.clear()
        else:
            break
    return dictionary

nicht

In [20]:
%%time
# build the dictionary:
dictionary = build_dictionary(xml_file)


CPU times: user 1min 48s, sys: 13.2 s, total: 2min 1s
Wall time: 2min 1s


In [21]:
%%time
# remove words that appear only once
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()

CPU times: user 57.4 ms, sys: 5.63 ms, total: 63 ms
Wall time: 62.1 ms


In [23]:
#save the dictionary
dictionary.save('../Textmining/wiki_200_new.dict')

weiter

In [10]:
#load the dictionary
dictionary = Dictionary.load('../Textmining/wiki_200_new.dict')

In [11]:
# check if the dictionary has been loaded 
print(dictionary)

Dictionary(20416 unique tokens: ['abc', 'abkehr', 'ablehnen', 'abrufen', 'abschluss']...)


In [12]:
%%time
lda = LdaMulticore(corpus_small, num_topics=200, id2word=dictionary)

CPU times: user 3min 43s, sys: 27.9 s, total: 4min 11s
Wall time: 4min 4s


In [13]:
#save the trained model
lda.save("../Textmining/lda_model_200.txt")

In [14]:
#load the trained model
lda = LdaModel.load("../Textmining/lda_model_200.txt")

In [15]:
%%time
corpus_index = similarities.MatrixSimilarity(list(lda[corpus_small]), num_features=len(dictionary))

CPU times: user 4min 11s, sys: 13.6 s, total: 4min 25s
Wall time: 1min 58s


ab hier csv

In [16]:
# define document to use in similarity check
#test_document = texts[23]

In [17]:
# transform the document to vector space
#test_vec = dictionary.doc2bow(preprocess_text(test_document))
# convert to lda space
#test_vec_lda = lda[test_vec]

In [18]:
# get the similarities
#sims = corpus_index[test_vec_lda]

### csv

In [19]:
#pd.read_csv("../Textmining/beispieltext.csv",sep=";") #test_document 

In [20]:
#csv_file = pd.read_csv("../Textmining/beispieltext.csv",sep=";") #test_document  xml_file 

In [21]:
#csv_file[['Text']]

In [22]:
#test_document = pd.DataFrame(csv_file)
#test_document['Text'] = test_document['Text'].astype(str)

In [23]:
#test_document = csv_file[['Text']]
#print(test_document.dtypes)

In [24]:
#test_document.Text.astype(str)

In [25]:
#test_document[['Text']] = test_document[['Text']].astype(str)

In [26]:
#%timeit test_document.astype(str) 

### txt

In [27]:
import pandas as pd

In [28]:
test_document = open('wikibeispiele.txt', encoding='utf-8')
#test_document.readlines()
test_document = test_document.read()

In [29]:
# transform the document to vector space
test_vec = dictionary.doc2bow(preprocess_text(test_document)) #test_document
# convert to lda space
test_vec_lda = lda[test_vec]

In [30]:
# get the similarities
sims = corpus_index[test_vec_lda]

In [31]:
hits = 0
for ids in list(enumerate(sims)):
    if ids[1] >= 0.75:
        hits += 1
        print("Übereinstimmung von ","%.2f" %(ids[1]*100),"%","\n","Document ID:",ids[0],test_document[ids[0]],"\n", "------------------------------------")
print(hits, "Plagiatsfälle gefunden")

Übereinstimmung von  83.88 % 
 Document ID: 34   
 ------------------------------------
Übereinstimmung von  78.72 % 
 Document ID: 45 r 
 ------------------------------------
Übereinstimmung von  76.87 % 
 Document ID: 85   
 ------------------------------------
Übereinstimmung von  83.75 % 
 Document ID: 97   
 ------------------------------------
Übereinstimmung von  90.90 % 
 Document ID: 123 . 
 ------------------------------------
Übereinstimmung von  83.91 % 
 Document ID: 195 p 
 ------------------------------------
6 Plagiatsfälle gefunden
