# Batch Legal Mockup Model

In [5]:
#Imports

import pandas as pd
import string
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [7]:
!pwd

/home/christopher/code/fvanlitsenburg/BatchLegal/notebooks


In [8]:
#Loading html-file + converting it into txt

file = open("../raw_data/test_data.html", "r")
data = BeautifulSoup(file)
as_txt = data.get_text()

In [9]:
#Tokenizing the sentences

as_sentences = sent_tokenize(as_txt)


In [10]:
#Transforming sentences into DF

txt_df = pd.DataFrame(as_sentences)
txt_df

Unnamed: 0,0
0,\n\n\nL_2022020EN.01000101.xml\n\n\n\n\n\n\n\n...
1,(2)\n\n\nThe COVID-19 pandemic has highlighted...
2,As recognised by the World Health Organization...
3,Diseases may be transmitted from humans to ani...
4,"Approximately 70 % of emerging diseases, and a..."
...,...
560,(19) Regulation (EC) No 1049/2001 of the Euro...
561,(20) Directive (EU) 2019/1937 of the European...
562,(21) Directive (EU) 2016/943 of the European ...
563,"(22) Regulation (EU, Euratom) 2018/1046 of th..."


In [11]:
type(txt_df)

pandas.core.frame.DataFrame

In [27]:
# list used to remove unrelevant terms 
ignore_list = {'ec', 'no', 'european', 'commission', 'eu', 'union',
                   'article', 'directive', 'council', 'regulation', 'official',
                   'journal', 'article', 'information', 'agency', 'regulation',
                   'mssg', 'data', 'member', 'states', 'etf', 'mdssg'
                  }

In [28]:
#Defining Davy's Preproc-Function

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence_cleaned
                                  if not w in ignore_list]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in tokenized_sentence_cleaned]
    cleaned_sentence = ' '.join(word for word in lemmatized)
    return cleaned_sentence

In [29]:
# Applying Davy's Function

clean_txt = txt_df[0].apply(cleaning)

In [30]:
#Checking outcome of Preprocessing
clean_txt

0      lenxml en l parliament january reinforce role ...
1      covid pandemic highlight interconnectedness hu...
2      recognise world health organization many micro...
3      diseases may transmit humans animals vice vers...
4      approximately emerge diseases almost know pand...
                             ...                        
560    parliament may regard public access parliament...
561    parliament october protection persons report b...
562    parliament june protection undisclosed knowhow...
563    euratom parliament july financial rule applica...
564    february fee payable evaluation medicinal prod...
Name: 0, Length: 565, dtype: object

In [31]:
#Vectorizing data

vectorizer = CountVectorizer()
vectorized_text = vectorizer.fit_transform(clean_txt)

In [32]:
#Modelling

# Instantiating the LDA 
n_components = 3
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fitting the LDA on the vectorized documents
lda_model.fit(vectorized_text)

# Getting topics
topics = lda_model.transform(vectorized_text)

In [33]:
#Topic model function from ML-10-lecture
def print_topics(model, vectorizer, top_words):
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], round(topic[i],2))
                        for i in topic.argsort()[:-top_words - 1:-1]])

In [34]:
#Printing topics

print_topics(lda_model, vectorizer, top_words = 8)

--------------------
Topic 0:
[('shall', 203.46), ('refer', 141.31), ('medicinal', 102.21), ('point', 95.24), ('devices', 93.5), ('provide', 90.3), ('medical', 88.34), ('list', 84.31)]
--------------------
Topic 1:
[('medicinal', 155.43), ('products', 150.03), ('health', 134.1), ('public', 103.04), ('shortages', 77.75), ('major', 74.29), ('medical', 65.5), ('devices', 56.1)]
--------------------
Topic 2:
[('clinical', 53.92), ('scientific', 39.62), ('parliament', 38.94), ('advice', 33.0), ('provide', 30.92), ('expert', 30.27), ('panel', 28.3), ('trial', 27.3)]
