In [18]:
import os
import pickle
import re
import xml.etree.ElementTree as ET
from collections import defaultdict
from datetime import timedelta
from timeit import default_timer as timer

import ktrain
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Widen numpy's printed output so long rows are not wrapped or elided early.
np.set_printoptions(edgeitems=30, linewidth=1000)

# NLTK resources used by `lemmatization` (stop-word list and WordNet lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

# Directory holding the regulation XML files. The location can be overridden
# with the EC_LAWS_DATA_DIR environment variable so the notebook also runs on
# machines other than the author's; the original path remains the default.
data_dir = os.environ.get(
    "EC_LAWS_DATA_DIR",
    "C:/Users/admin/Documents/Projects/accademic/python-lda-topic-modeling-ec-laws/data/",
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ermal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ermal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
def to_string_utf8(document):
    """Decode a UTF-8 encoded byte string into text.

    Args:
        document: Object exposing ``decode`` (e.g. ``bytes``) holding UTF-8 data.

    Returns:
        The decoded ``str``.

    Raises:
        UnicodeDecodeError: If the data is not valid UTF-8.
    """
    decoded = document.decode(encoding='utf-8')
    return decoded

def get_file_content(filepath):
    """Return the text of an XML file as a single whitespace-normalised string.

    The file is parsed with ElementTree and serialised with ``method='text'``
    (only text content is kept), decoded from UTF-8, and every run of spaces,
    tabs and newlines is collapsed into one space.

    Args:
        filepath: Path of the XML file to read.

    Returns:
        The document text as ``str``.
    """
    root = ET.parse(filepath).getroot()
    raw_text = ET.tostring(root, encoding='utf-8', method='text')
    return re.sub('[ \t\n]+', ' ', to_string_utf8(raw_text))

def lemmatization(text):
    """Lower-case ``text``, drop English stop words and lemmatize the rest.

    Tokens are obtained by splitting the lower-cased text on single space
    characters. Every kept token is lemmatized with WordNet and followed by
    one space, so a non-empty result ends with a trailing space.

    Args:
        text: Input text as ``str``.

    Returns:
        The cleaned text as ``str``.
    """
    wordnet = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    pieces = []
    for token in text.lower().split(" "):
        if token not in stop_words:
            pieces.append(wordnet.lemmatize(token) + " ")
    return "".join(pieces)

In [20]:
 %%time
# Load every regulation file of the selected year as a [filename, cleaned text] pair.
year = "2016"
documents = []
skipped_files = []  # files that could not be loaded, kept for inspection

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and everything derived from it) identical on every run.
for doc in sorted(os.listdir(data_dir)):
    if not (doc.startswith("reg_" + year) and doc.endswith(".xml")):
        continue
    try:
        documents.append([doc, lemmatization(get_file_content(os.path.join(data_dir, doc)))])
    except Exception as error:
        # Loading stays best-effort (one bad file must not abort the run), but
        # the failure is reported instead of being silently swallowed, and
        # KeyboardInterrupt/SystemExit are no longer caught.
        skipped_files.append(doc)
        print("Skipping", doc, "-", type(error).__name__ + ":", error)

# 2-D string array: column 0 = file name, column 1 = cleaned text.
documents = np.array(documents)

Wall time: 3.29 s


In [21]:
%%time
# Fit the topic model on the cleaned-text column (index 1) of `documents`.
# No topic count is passed, so ktrain picks it automatically.
corpus_texts = documents[:, 1]
model = ktrain.text.get_topic_model(corpus_texts)

n_topics automatically set to 11
lang: en
preprocessing texts...
fitting model...
iteration: 1 of max_iter: 5
iteration: 2 of max_iter: 5
iteration: 3 of max_iter: 5
iteration: 4 of max_iter: 5
iteration: 5 of max_iter: 5
done.
Wall time: 5.96 s


In [22]:
%%time
# Build the model's document-topic data from the same cleaned texts it was fitted on.
# NOTE(review): `threshold` presumably filters out documents whose best topic
# score is below 0.25 — confirm against the ktrain documentation.
cleaned_texts = documents[:, 1]
model.build(cleaned_texts, threshold=0.25)

done.
Wall time: 1.82 s


In [23]:
# List the topics with their top words and the number of documents counted under each.
model.print_topics(show_counts=True)

topic:5 | count:63 | fishing product stock vessel price investigation import industry country quota
topic:0 | count:58 | food substance use directive list health claim product efsa animal
topic:9 | count:46 | agency national border data decision paragraph act procedure service directive
topic:3 | count:29 | relevant power requirement operator product plant module paragraph point demand
topic:4 | count:23 | benchmark law paragraph fishing competent vessel decision data court property
topic:8 | count:15 | data processing personal body supervisory subject conformity protection right decision
topic:2 | count:10 | animal disease point health product competent establishment paragraph listed engine
topic:1 | count:3 | data europol management personal board agency border director national right
topic:6 | count:2 | restriction agency benchmark safety directive paragraph relevant assessment rac competent


In [25]:
# Fetch the per-document topic data from the built model and plot the
# documents in two dimensions.
# NOTE(review): despite its name, `topics` presumably holds document-topic
# scores rather than topic descriptions — confirm before indexing it by topic id.
topics = model.get_doctopics()
model.visualize_documents(doc_topics=topics)

reducing to 2 dimensions...[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 249 samples in 0.001s...
[t-SNE] Computed neighbors for 249 samples in 0.005s...
[t-SNE] Computed conditional probabilities for sample 249 / 249
[t-SNE] Mean sigma: 0.006865
[t-SNE] KL divergence after 250 iterations with early exaggeration: 52.884964
[t-SNE] KL divergence after 1000 iterations: 0.187976
done.


In [26]:
# Minimum topic score for a document to be listed under a topic; a lower score
# is treated as the document talking about a different topic. Same value as
# the `threshold` given to `model.build`.
SIMILARITY_THRESHOLD = 0.25

# Maps topic id -> file names of the documents scoring at least the threshold.
topic_to_document = defaultdict(list)

# Score all documents in one call instead of invoking the model once per document.
doc_topic_scores = model.predict(documents[:, 1])

for doc, pred in zip(documents, doc_topic_scores):
    matching_topics = [i for i, score in enumerate(pred) if score >= SIMILARITY_THRESHOLD]
    for i in matching_topics:
        topic_to_document[i].append(doc[0])

    if not matching_topics:
        print("No Topic found for document ", doc[0], f"(similarity threshold {SIMILARITY_THRESHOLD})")

No Topic found for document  reg_2016_nr-003_seq-0002_akn.xml (similarity threshold 0.25)


In [29]:
print(len(documents), "documents are spread as follow:")

# Report every topic id from 0 up to the highest id that received a document.
# Sizing the loop with len(topic_to_document) is wrong when ids have gaps: the
# loop stops early, and each lookup of a missing id makes the defaultdict
# insert it, so re-running the cell would print a different list.
highest_topic = max(topic_to_document, default=-1)
for topic_doc in range(highest_topic + 1):
    # .get() reads without inserting an empty entry for topics with no documents.
    doc_count = len(topic_to_document.get(topic_doc, ()))
    print("Topic", topic_doc, "is found in", doc_count, "documents")

250 documents are spread as follow:
Topic 0 is found in 70 documents
Topic 1 is found in 4 documents
Topic 2 is found in 13 documents
Topic 3 is found in 39 documents
Topic 4 is found in 35 documents
Topic 5 is found in 68 documents
Topic 6 is found in 4 documents
Topic 7 is found in 0 documents
Topic 8 is found in 18 documents
Topic 9 is found in 55 documents


In [30]:
# Sample passage whose topic is predicted with the fitted model further down.
text = "In order to ensure a consistent level of protection for natural persons throughout the Union and to prevent divergences hampering the free movement of personal data within the internal market, a Regulation is necessary to provide legal certainty and transparency for economic operators, including micro, small and medium-sized enterprises, and to provide natural persons in all Member States with the same level of legally enforceable rights and obligations and responsibilities for controllers and processors, to ensure consistent monitoring of the processing of personal data, and equivalent sanctions in all Member States as well as effective cooperation between the supervisory authorities of different Member States. The proper functioning of the internal market requires that the free movement of personal data within the Union is not restricted or prohibited for reasons connected with the protection of natural persons with regard to the processing of personal data. To take account of the specific situation of micro, small and medium-sized enterprises, this Regulation includes a derogation for organisations with fewer than 250 employees with regard to record-keeping. In addition, the Union institutions and bodies, and Member States and their supervisory authorities, are encouraged to take account of the specific needs of micro, small and medium-sized enterprises in the application of this Regulation. The notion of micro, small and medium-sized enterprises should draw from Article 2 of the Annex to Commission Recommendation 2003/361/EC"

In [32]:
# Clean the sample text the same way as the corpus, score it against the
# topics and keep the index of the highest score.
cleaned_text = lemmatization(text)
text_topic_scores = model.predict([cleaned_text])
# Only one text is scored, so the flattened argmax is the topic index.
pred = np.argmax(text_topic_scores)
pred

8

In [33]:
# Look the description up in the model's topic list. `topics` is assigned from
# model.get_doctopics() earlier in the notebook, so indexing it by topic id
# does not give the topic's word summary on a fresh Restart & Run All.
topic_description = model.get_topics()[pred]
print("Inserted text is similar to Topic", pred, " - \"", topic_description, "\"")

Inserted text is similar to Topic 8  - " data processing personal body supervisory subject conformity protection right decision "


In [34]:
# Show how many (and which) corpus documents were assigned the predicted topic.
docs_for_topic = topic_to_document[pred]
print(len(docs_for_topic), "documents containing Topic", pred, ": ", docs_for_topic)

18 documents containing Topic 8 :  ['reg_2016_679_akn_nr119seq0001.xml', 'reg_2016_nr-010_seq-0001_akn.xml', 'reg_2016_nr-012_seq-0001_akn.xml', 'reg_2016_nr-040_seq-0001_akn.xml', 'reg_2016_nr-068_seq-0001_akn.xml', 'reg_2016_nr-081_seq-0001_akn.xml', 'reg_2016_nr-081_seq-0002_akn.xml', 'reg_2016_nr-081_seq-0003_akn.xml', 'reg_2016_nr-085_seq-0001_akn.xml', 'reg_2016_nr-096_seq-0001_akn.xml', 'reg_2016_nr-153_seq-0001_akn.xml', 'reg_2016_nr-162_seq-0001_akn.xml', 'reg_2016_nr-193_seq-0002_akn.xml', 'reg_2016_nr-255_seq-0001_akn.xml', 'reg_2016_nr-259_seq-0001_akn.xml', 'reg_2016_nr-268_seq-0006_akn.xml', 'reg_2016_nr-317_seq-0001_akn.xml', 'reg_2016_nr-336_seq-0001_akn.xml']
