# Batch Legal – Comparing the Model's output with the given EU-Directories




In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


***Base***: 

- Adjusted preprocessing steps: removed sentence tokenization; spaCy is now used for lemmatization.
- Changed from CountVectorizer to TfidfVectorizer

In [None]:
!pip install -U spacy

In [None]:
!python -m spacy download en_core_web_sm

In [5]:
#Imports

import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.collocations import *

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from sklearn.model_selection import GridSearchCV

In [48]:
# Load the scraped and cleaned corpus (CSV with 3146 documents) from the mounted Drive

data = pd.read_csv(
    "/content/drive/MyDrive/3145_docs_scraped_clean.csv"
)

In [49]:
data.columns

Index(['Unnamed: 0', 'title', 'cellar', 'date', 'dir_code', 'dir_1', 'dir_2',
       'dir_3', 'dir_4', 'dir_5', 'dir_6', 'Content'],
      dtype='object')

In [50]:
data.shape

(3146, 12)

In [51]:
df_content = data.Content

In [52]:
df_content.head()

0    THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE...
1    THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE...
2    THE EUROPEAN COMMISSION, Having regard to the ...
3    THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE...
4    THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE...
Name: Content, dtype: object

In [None]:
# Terms regarded as irrelevant for the topics (boilerplate of EU legal texts).
# NOTE: the ignore_list is not in use so far.
ignore_list = {
    'ec', 'no', 'eu', 'etf', 'mssg', 'mdssg',
    'european', 'commission', 'union', 'council', 'agency',
    'article', 'directive', 'regulation',
    'official', 'journal', 'information', 'data',
    'member', 'states', 'shall',
}

In [53]:
#Preproc-Function

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the heavy, call-invariant resources used by cleaning().
# They are created on first use (not at definition time) so that defining the
# function does not require the NLTK corpus / spaCy model to be available yet.
_CLEANING_RESOURCES = {}


def _get_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    if 'stop_words' not in _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['stop_words'] = set(stopwords.words('english'))
    return _CLEANING_RESOURCES['stop_words']


def _get_nlp():
    """Return the spaCy pipeline used for lemmatization, loading it only on first use."""
    if 'nlp' not in _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['nlp'] = spacy.load('en_core_web_sm', disable=["parser"])
    return _CLEANING_RESOURCES['nlp']


def cleaning(sentence):
    """Normalise one document and return its lemmatized text.

    Steps: strip surrounding whitespace, lowercase, drop digit characters,
    drop ASCII punctuation, tokenize with NLTK, remove NLTK English stop
    words, then lemmatize the remaining tokens with spaCy.

    Parameters
    ----------
    sentence : str
        Raw text of a single document.

    Returns
    -------
    str
        The lemmas of the remaining tokens, joined by single spaces.
    """
    # Basic cleaning
    sentence = sentence.strip().lower()  ## remove whitespaces, lowercasing
    sentence = ''.join(char for char in sentence if not char.isdigit())  ## removing numbers

    # Advanced cleaning
    sentence = sentence.translate(_PUNCTUATION_TABLE)  ## removing punctuation
    stop_words = _get_stop_words()
    tokens = [w for w in word_tokenize(sentence) if w not in stop_words]  ## tokenizing, remove stopwords

    # Filtering against ignore_list is intentionally left disabled here.

    # spaCy lemmatization; the pipeline is loaded once and reused for every
    # document instead of being reloaded on each call.
    doc = _get_nlp()(' '.join(tokens))
    return " ".join(token.lemma_ for token in doc)

In [12]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [54]:
# Run the preprocessing function over the text of every document

clean_txt = df_content.map(cleaning)

In [55]:
#Checking outcome of Preprocessing

clean_txt[0]



In [56]:
# unigram TF-IDF vectorization

vectorizer_n_gram = TfidfVectorizer(ngram_range = (1,1)) # ngram_range=(1,1) -> unigrams only; (1,2) would add bi-grams
cleaned_vectorizer_n_gram = vectorizer_n_gram.fit_transform(clean_txt)

In [57]:
# Dense DataFrame view of the TF-IDF matrix, with one column per vocabulary term.
# NOTE(review): .toarray() materialises the whole matrix in memory — confirm this is acceptable for larger corpora.
df = pd.DataFrame(cleaned_vectorizer_n_gram.toarray(), columns=vectorizer_n_gram.get_feature_names_out())

In [58]:
#Topic model function from ML-10-lecture
def print_topics(model, vectorizer, top_words):
    """Print the highest-weighted terms of every topic of a fitted topic model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``; each row is treated as one
        topic and is indexed by term position.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()``. If the object
        passed in lacks that method (e.g. a transformed matrix was passed by
        mistake), the notebook-level ``vectorizer_n_gram`` is used instead,
        which is what this function previously always did.
    top_words : int
        Number of terms to print per topic.
    """
    if not hasattr(vectorizer, "get_feature_names_out"):
        # Backward compatibility for calls that hand over something other
        # than the vectorizer itself.
        vectorizer = vectorizer_n_gram

    # Look the vocabulary up once instead of once per printed term.
    feature_names = vectorizer.get_feature_names_out()

    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        # Indices of the `top_words` largest weights, in descending order.
        top_indices = topic.argsort()[:-top_words - 1:-1]
        print([(feature_names[i], round(topic[i],2)) for i in top_indices])

# Topic-Modelling with BERTopic




In [None]:
#PIP-installing BERTtopic

!pip install bertopic

In [26]:
from bertopic import BERTopic #BERTtopic-model: https://github.com/MaartenGr/BERTopic

In [83]:
#Training

# nr_topics=20 asks BERTopic to reduce the discovered clusters to 20 topics
# (the outlier topic -1 is reported in addition to these).
# NOTE(review): no random seed is set here, so topic ids and counts may differ between runs — confirm.
topic_model = BERTopic(nr_topics=20, language="english", calculate_probabilities=True, verbose=True)
# `topics` holds one topic id per document in clean_txt; `probs` presumably the per-document topic probabilities — verify against the BERTopic docs.
topics, probs = topic_model.fit_transform(clean_txt)

Batches:   0%|          | 0/99 [00:00<?, ?it/s]

2022-06-02 11:23:55,819 - BERTopic - Transformed documents to Embeddings
2022-06-02 11:24:09,445 - BERTopic - Reduced dimensionality
2022-06-02 11:24:10,439 - BERTopic - Clustered reduced embeddings
2022-06-02 11:24:30,698 - BERTopic - Reduced number of topics from 89 to 21


In [84]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1207,-1_shall_article_regulation_use
1,0,192,0_control_regulation_ec_product
2,1,154,1_substance_regulation_mrl_ec
3,2,153,2_person_council_regulation_annex
4,3,142,3_additive_feed_premixture_substance
5,4,122,4_mm_quota_tariff_rn
6,5,120,5_shall_member_union_state
7,6,108,6_producer_commission_price_import
8,7,98,7_fishing_vessel_catch_fishery
9,8,91,8_amendment_regulation_european_specification


In [85]:
topic_model.get_topic(-1)

[('shall', 0.02365256817997433),
 ('article', 0.019141296752447367),
 ('regulation', 0.017761383517371974),
 ('use', 0.015016649540168561),
 ('follow', 0.014423584542103305),
 ('eu', 0.01409303177313302),
 ('member', 0.013965600286295182),
 ('state', 0.01382310444650231),
 ('include', 0.012575865323694554),
 ('accordance', 0.01164099393045191)]

In [86]:
topic_model.visualize_topics()

In [129]:
topic_model.visualize_barchart()

In [88]:
topic_model.visualize_hierarchy()

# Comparing Topics to the Directories given by the EU

In [99]:
# Take an explicit copy: a `topics` column is added to df_compare afterwards, and
# writing into a column slice of `data` would raise pandas' SettingWithCopyWarning.
df_compare = data[["date", "dir_1"]].copy()

In [116]:
df_compare.dir_1.unique()

array(['Right of establishment and freedom to provide services',
       'Transport policy', 'External relations',
       'Freedom of movement for workers and social policy',
       'Environment, consumers and health protection',
       'General, financial and institutional matters',
       'Economic and monetary policy and free movement of capital',
       'Regional policy and coordination of structural instruments',
       'Industrial policy and internal market',
       'Science, information, education and culture', 'Taxation',
       "People's Europe", 'Customs Union and free movement of goods',
       'Fisheries', 'Agriculture', 'Common Foreign and Security Policy',
       'Area of freedom, security and justice', 'Energy',
       'Law relating to undertakings', nan, 'Competition policy'],
      dtype=object)

In [100]:
df_compare['topics'] = topics

In [117]:
df_compare.sample(10)

Unnamed: 0,date,dir_1,topics
1191,2021-11-15,Agriculture,-1
1896,2020-12-11,Common Foreign and Security Policy,2
282,2020-03-31,Regional policy and coordination of structural...,5
896,2020-03-17,Agriculture,-1
1621,2021-03-26,Right of establishment and freedom to provide ...,10
2024,2020-08-07,,-1
1991,2021-12-22,Agriculture,-1
1031,2021-07-12,Agriculture,18
2551,2013-07-23,External relations,6
1414,2020-11-13,Industrial policy and internal market,-1


In [112]:
# Documents whose first directory heading is "Fisheries"
fisheries = df_compare[df_compare["dir_1"].eq("Fisheries")]

In [119]:
fisheries.sample(5)

Unnamed: 0,date,dir_1,topics
1655,2021-11-26,Fisheries,7
2223,2019-10-31,Fisheries,7
1529,2020-12-28,Fisheries,7
1232,2020-09-02,Fisheries,14
2863,2022-01-27,Fisheries,14


In [115]:
fisheries.topics.value_counts()

 7     94
 14    67
-1      2
 11     2
 10     1
 0      1
Name: topics, dtype: int64

In [121]:
# Documents whose first directory heading is "External relations"
external_rel = df_compare[df_compare["dir_1"].eq("External relations")]

In [122]:
external_rel.sample(10)

Unnamed: 0,date,dir_1,topics
2802,2014-12-11,External relations,-1
2578,2019-07-23,External relations,-1
2883,2015-03-27,External relations,6
3080,2015-08-14,External relations,-1
1299,2020-08-10,External relations,-1
3096,2014-12-18,External relations,6
430,2020-10-13,External relations,6
79,2021-01-07,External relations,6
718,2021-12-22,External relations,6
839,2021-04-19,External relations,6


In [123]:
external_rel.topics.value_counts()

 6     106
-1      76
 5      12
 7       4
 4       3
 0       2
 8       1
 13      1
 2       1
 15      1
Name: topics, dtype: int64

In [127]:
topic_model.get_topic(6)

[('producer', 0.041384000216344495),
 ('commission', 0.03931125550671033),
 ('price', 0.0379875983961771),
 ('import', 0.03562183997825092),
 ('union', 0.03527475095115937),
 ('industry', 0.03383772887100563),
 ('export', 0.03366342417037225),
 ('investigation', 0.0324365702539636),
 ('company', 0.027646699083183273),
 ('claim', 0.02555104907852173)]

## Running LDA with the same number of topics that BERTopic produced

In [124]:
# Instantiating LDA with predefined n_components
# (20 matches the nr_topics value requested from BERTopic)
n_components = 20
# random_state pins the stochastic initialisation so the topics are reproducible on re-run
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter=100, random_state=42)

# Fitting the LDA on the vectorized documents.
# The sparse TF-IDF matrix is used directly; it holds the same values as the
# dense DataFrame `df` but avoids iterating over all the zero entries.
lda_model.fit(cleaned_vectorizer_n_gram)

LatentDirichletAllocation(max_iter=100, n_components=20)

In [125]:
# Pass the fitted vectorizer (which owns the vocabulary), not the transformed TF-IDF matrix
print_topics(lda_model, vectorizer_n_gram, top_words=10)

--------------------
Topic 0:
[('de', 23.5), ('el', 14.86), ('la', 13.18), ('en', 10.15), ('del', 9.25), ('los', 8.51), ('que', 6.08), ('reglamento', 5.64), ('las', 5.48), ('se', 5.11)]
--------------------
Topic 1:
[('washerdryer', 1.85), ('seabream', 1.53), ('dishwasher', 1.53), ('kic', 1.28), ('mallorca', 1.23), ('prawn', 1.02), ('arrow', 0.99), ('diplodus', 0.93), ('verdana', 0.92), ('crowdfunde', 0.91)]
--------------------
Topic 2:
[('pfoarelate', 1.1), ('pfoa', 1.08), ('clopyralid', 0.82), ('sidi', 0.77), ('souleman', 0.73), ('cypermethrin', 0.72), ('alginate', 0.69), ('esr', 0.68), ('jiaxe', 0.65), ('intergenerational', 0.63)]
--------------------
Topic 3:
[('на', 4.62), ('seine', 3.72), ('transshipping', 3.08), ('retaining', 3.02), ('pst', 2.83), ('за', 2.76), ('от', 2.33), ('aka', 1.73), ('hicp', 1.57), ('albacore', 1.38)]
--------------------
Topic 4:
[('ifrs', 6.48), ('aspergillus', 3.82), ('cncm', 3.74), ('airworthiness', 3.35), ('endobetaxylanase', 3.08), ('niger', 2.84),