# Batch Legal – Pipeline Week 1: Data Retrieval – Preprocessing – Modelling




### Focus on »Consumer information, education and representation«

## Imports for the entire Notebook

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -U spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
!pip install spacy-lookups-data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
#Imports

import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.collocations import *

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


## Data Retrieval and Exploration

In [1]:
#Jakob, you should enter your code here!

## Preprocessing

In [6]:
#Loading a lot of data from csv
"""CSV with 3146 documents"""

data = pd.read_csv("/content/drive/MyDrive/over_2800_docs_scraped.csv")

In [7]:
data.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'title', 'cellar', 'date', 'dir_code',
       'dir_1', 'dir_2', 'dir_3', 'dir_4', 'dir_5', 'dir_6', 'Content'],
      dtype='object')

In [8]:
#Sellecting only the documents characterized as »Consumer information, education and representation«
df_selec = data.loc[data.dir_3 == "Consumer information, education and representation"]

In [9]:
df_selec.shape

(245, 13)

In [10]:
#Starting the actual Preprocessing

In [10]:
df_content = df_selec.Content

In [11]:
df_content.head()

41     (1) The Communication from the Commission of 2...
280    (1) The Union and its Member States are Partie...
363    (1) Pursuant to the first subparagraph of Arti...
364    (1) Pursuant to the first subparagraph of Arti...
370    (1) Pursuant to the first subparagraph of Arti...
Name: Content, dtype: object

In [12]:
#Preproc-Function

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    
    #tokenized_sentence_cleaned = [w for w in tokenized_sentence_cleaned if not w in ignore_list] COMMENTED IGNORE OUT!
    
    sentences = ' '.join(word for word in tokenized_sentence_cleaned)
    
    #spacy
    nlp = spacy.load('en_core_web_sm', disable=["tok2vec", "tagger", "parser", "attribute_ruler"])
    nlp.remove_pipe("lemmatizer")
    nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()
    doc = nlp(sentences)
    lemmatized = " ".join([token.lemma_ for token in doc])
    
    return lemmatized

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Applying Cleaning Function

clean_txt = df_content.apply(cleaning)

In [17]:
len(clean_txt)

245

In [18]:
#Transforming Series in List

clean_txt = clean_txt.tolist()

In [19]:
#Checking outcome of Preprocessing

type(clean_txt)

list

In [20]:
# bigram vectorization

vectorizer_n_gram = TfidfVectorizer(ngram_range = (1,1)) # BI-GRAMS
cleaned_vectorizer_n_gram = vectorizer_n_gram.fit_transform(clean_txt)

In [21]:
df = pd.DataFrame(cleaned_vectorizer_n_gram.toarray(), columns=vectorizer_n_gram.get_feature_names_out())

In [22]:
#Topic model function from ML-10-lecture
def print_topics(model, vectorizer, top_words):
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print([(vectorizer_n_gram.get_feature_names_out()[i], round(topic[i],2))
                        for i in topic.argsort()[:-top_words - 1:-1]])

# Topic-Modelling with BERTopic




In [23]:
#PIP-installing BERTtopic

!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
from bertopic import BERTopic #BERTtopic-model: https://github.com/MaartenGr/BERTopic

In [25]:
#Training

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(clean_txt)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

2022-06-02 15:19:07,239 - BERTopic - Transformed documents to Embeddings
2022-06-02 15:19:17,742 - BERTopic - Reduced dimensionality
2022-06-02 15:19:17,771 - BERTopic - Clustered reduced embeddings


In [26]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,38,-1_regulation_register_name_eu
1,0,35,0_shall_regulation_article_authority
2,1,31,1_regulation_article_eu_indication
3,2,23,2_de_regulation_amendment_specification
4,3,22,3_amendment_regulation_commission_specification
5,4,19,4_amendment_regulation_de_specification
6,5,18,5_mozzarella_name_di_del
7,6,16,6_regulation_specification_amendment_eu
8,7,16,7_name_register_regulation_enter
9,8,15,8_name_enter_register_regulation


In [27]:
topic_model.get_topics()

{-1: [('regulation', 0.07566686466598094),
  ('register', 0.06027783541542448),
  ('name', 0.060252367521033204),
  ('eu', 0.05800572248945586),
  ('enter', 0.054106503941887096),
  ('journal', 0.05117982107658842),
  ('official', 0.04968049395389201),
  ('article', 0.04920611348126005),
  ('commission', 0.0423281803471129),
  ('de', 0.04039612460776867)],
 0: [('shall', 0.030196501488494357),
  ('regulation', 0.02961942415474895),
  ('article', 0.02725378352382896),
  ('authority', 0.02610730030530101),
  ('product', 0.025453457583736364),
  ('follow', 0.024649172437327796),
  ('member', 0.022830052596636227),
  ('state', 0.022208012588136785),
  ('point', 0.021694036893391517),
  ('concern', 0.021222606282655643)],
 1: [('regulation', 0.11416652610301858),
  ('article', 0.07670011308635497),
  ('eu', 0.06105025213344312),
  ('indication', 0.05817359155220287),
  ('european', 0.05614277342783024),
  ('journal', 0.05449586406640241),
  ('commission', 0.054192561315355985),
  ('geograph

In [28]:
topic_model.visualize_topics()

In [29]:
topic_model.visualize_barchart()

In [30]:
topic_model.visualize_hierarchy()

In [31]:
topic_model.visualize_distribution(probs[0])

# Iteration 1: Getting rid of repetitive frequent words and train model again

In [32]:
#Building new DF
txt_without_frequ = pd.DataFrame(clean_txt)

In [33]:
txt_without_frequ

Unnamed: 0,0
0,communication commission november entitle ‘ fu...
1,union member state party unite nation economic...
2,pursuant 1 subparagraph article regulation eu ...
3,pursuant 1 subparagraph article regulation eu ...
4,pursuant 1 subparagraph article regulation eu ...
...,...
240,union contribute ensure high level consumer pr...
241,spirit drink make maceration sloe prunus spino...
242,annex ii regulation ec provide spirit drink ca...
243,pursuant article regulation ec commission exam...


In [34]:
#Getting repetitive and frequent words in topics

In [35]:
topic_words = topic_model.get_topics()

In [36]:
#Getting topic words

def get_words(topic_words):
  length = len(topic_words)
  words = []
  for i in range (-1, length-2):
    for h in range (0,9):
      word = topic_words[i][h][0]
      words.append(word)
  return pd.DataFrame(words)


In [37]:
words_df = get_words(topic_words)

In [38]:
words_df

Unnamed: 0,0
0,regulation
1,register
2,name
3,eu
4,enter
...,...
85,eu
86,journal
87,official
88,hungary


In [39]:
words_df.value_counts()

regulation       10
journal           8
eu                7
official          7
commission        6
article           5
specification     4
name              4
register          4
amendment         4
enter             4
european          4
de                2
accordance        1
opposition        1
product           1
point             1
member            1
shall             1
state             1
mozzarella        1
hungary           1
marc              1
indication        1
gioia             1
geographical      1
follow            1
di                1
del               1
colle             1
cacaval           1
authority         1
approve           1
union             1
dtype: int64

In [40]:
# list used to remove irrelevant terms 

ignore_list = ['regulation', 'eu', 'official', 'article', 'name', 'register', 'commission', 'european', 'amendment', 'specification', 'journal', 'union', 'european', 'de']

In [41]:
#Function to get rid of these terms

def ignore(sentence):
  tokenized_sentence = word_tokenize(sentence) ## tokenizing 
  cleaned  = [w for w in tokenized_sentence if not w in ignore_list]
  sentence_cleaned = ' '.join(word for word in cleaned)
  return sentence_cleaned

In [42]:
txt_clean = txt_without_frequ[0].apply(ignore)

In [43]:
len(txt_clean)

245

In [44]:
#New Training

topic_model_2 = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model_2.fit_transform(txt_clean)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

2022-06-02 15:22:55,988 - BERTopic - Transformed documents to Embeddings
2022-06-02 15:23:02,321 - BERTopic - Reduced dimensionality
2022-06-02 15:23:02,361 - BERTopic - Clustered reduced embeddings


In [48]:
topic_model_2.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,81,0_enter_opposition_product_cyprus
1,1,40,1_shall_product_authority_follow
2,2,26,2_approve_publish_application_italy
3,3,20,3_accordance_protect_enter_publish
4,4,18,4_approve_publish_france_application
5,5,17,5_approve_publish_application_spain
6,-1,16,-1_operator_siret_protect_designation
7,6,16,6_ec_indication_geographical_application
8,7,11,7_approve_publish_application_staropolski


In [49]:
topic_model_2.visualize_barchart()

# Iteration 2: Getting rid of repetitive frequent words and train model again

In [51]:
topic_words_2 = topic_model_2.get_topics()

In [52]:
words_df_2 = get_words(topic_words_2)

In [53]:
words_df_2.value_counts()

publish         5
application     4
minor           3
approval        3
approve         3
enter           3
shall           3
product         2
protect         2
opposition      2
ec              2
since           2
question        2
accordance      1
per             1
opinion         1
operator        1
objection       1
origin          1
queijo          1
point           1
siret           1
spain           1
specify         1
state           1
technical       1
object          1
italy           1
member          1
designation     1
authority       1
cheese          1
class           1
committee       1
common          1
concern         1
cyprus          1
denote          1
examine         1
june            1
file            1
follow          1
force           1
france          1
french          1
geographical    1
indication      1
agricultural    1
xi              1
dtype: int64

In [58]:
ignore_list_2 = ['public', 'application', 'minor', 'approval', 'approve', 'shall']

In [55]:
def ignore_2(sentence):
  tokenized_sentence = word_tokenize(sentence) ## tokenizing 
  cleaned  = [w for w in tokenized_sentence if not w in ignore_list_2]
  sentence_cleaned = ' '.join(word for word in cleaned)
  return sentence_cleaned

In [56]:
txt_without = pd.DataFrame(txt_clean)

In [59]:
txt = txt_without[0].apply(ignore_2)

In [62]:
#New Training

topic_model_3 = BERTopic(nr_topics=2, language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model_3.fit_transform(txt)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

2022-06-02 15:39:03,426 - BERTopic - Transformed documents to Embeddings
2022-06-02 15:39:06,668 - BERTopic - Reduced dimensionality
2022-06-02 15:39:06,694 - BERTopic - Clustered reduced embeddings
2022-06-02 15:39:06,965 - BERTopic - Reduced number of topics from 7 to 3


In [66]:
topic_model_3.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,125,0_product_follow_member_authority
1,1,73,1_publish_accordance_enter_force
2,2,47,2_publish_question_since_hereby


In [63]:
topic_model_3.visualize_barchart()

In [64]:
topic_model_3.visualize_distribution(probs[0])

In [65]:
topic_model_3.get_topics()

{0: [('product', 0.04473099294491721),
  ('follow', 0.039810074115563324),
  ('member', 0.035158680133180326),
  ('authority', 0.0348268274395142),
  ('state', 0.03450901141095335),
  ('ec', 0.028984609061306416),
  ('annex', 0.02875103357494486),
  ('concern', 0.028090420001279044),
  ('point', 0.02780721457378768),
  ('consumer', 0.02577561176312341)],
 1: [('publish', 0.12519248063465943),
  ('accordance', 0.11678417753230885),
  ('enter', 0.1077396083274612),
  ('force', 0.09738646108752382),
  ('protect', 0.09329987727889978),
  ('hereby', 0.08635995814313809),
  ('20', 0.08518047534294938),
  ('examine', 0.08509617679965649),
  ('ec', 0.08360644759229063),
  ('brussels', 0.08332661174248436)],
 2: [('publish', 0.1673530467759543),
  ('question', 0.10565372298211426),
  ('since', 0.10358345356233066),
  ('hereby', 0.0971959946772244),
  ('examine', 0.0970140017314615),
  ('20', 0.09586851598879097),
  ('brussels', 0.09378203840684907),
  ('force', 0.09372719184085342),
  ('entiret