# Batch Legal Mockup Model

In [1]:
#Imports

import pandas as pd
import string
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
#Loading html-file + converting it into txt

file = open("../../raw_data/test_data.html", "r")
data = BeautifulSoup(file)
as_txt = data.get_text()

In [3]:
#Tokenizing the sentences

as_sentences = sent_tokenize(as_txt)


In [4]:
#Transforming sentences into DF

txt_df = pd.DataFrame(as_sentences)
txt_df.head()

Unnamed: 0,0
0,\n\n\n\n\n\nConsolidated TEXT: 32004L0038 — EN...
1,The Union's institutions do not assume any lia...
2,"The authentic versions of the relevant acts, i..."
3,Those official texts are directly accessible t...
4,Article 2\nDefinitions\nFor the purposes of th...


In [5]:
type(txt_df)

pandas.core.frame.DataFrame

In [6]:
#Defining Davy's Preproc-Function

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in tokenized_sentence_cleaned]
    cleaned_sentence = ' '.join(word for word in lemmatized)
    return cleaned_sentence

In [7]:
# Applying Davy's Function

clean_txt = txt_df[0].apply(cleaning)

In [8]:
#Checking outcome of Preprocessing
clean_txt

0      consolidate text l — en — l — en — — text mean...
1           unions institutions assume liability content
2      authentic versions relevant act include preamb...
3      official texts directly accessible link embed ...
4                  article definitions purpose directive
                             ...                        
234        methods make reference shall lay member state
235                                                     
236    member state shall communicate commission text...
237    article entry force directive shall enter forc...
238    article addressees directive address member state
Name: 0, Length: 239, dtype: object

In [9]:
#Vectorizing data

vectorizer = CountVectorizer()
vectorized_text = vectorizer.fit_transform(clean_txt)

In [10]:
#Modelling

# Instantiating the LDA 
n_components = 3
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fitting the LDA on the vectorized documents
lda_model.fit(vectorized_text)

# Getting topics
topics = lda_model.transform(vectorized_text)

In [11]:
#Topic model function from ML-10-lecture
def print_topics(model, vectorizer, top_words):
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], round(topic[i],2))
                        for i in topic.argsort()[:-top_words - 1:-1]])

In [12]:
#Printing topics

print_topics(lda_model, vectorizer, top_words = 8)

--------------------
Topic 0:
[('public', 35.28), ('shall', 29.99), ('concern', 21.92), ('state', 21.77), ('member', 19.55), ('decision', 19.31), ('security', 16.35), ('expulsion', 16.12)]
--------------------
Topic 1:
[('member', 32.61), ('state', 31.95), ('eec', 29.33), ('shall', 26.12), ('article', 22.72), ('directive', 21.3), ('union', 20.59), ('right', 19.92)]
--------------------
Topic 2:
[('member', 109.84), ('residence', 105.42), ('state', 105.27), ('shall', 60.89), ('right', 57.71), ('union', 55.04), ('article', 51.85), ('family', 51.28)]
