# Batch Legal – Pipeline Week 1: Data Retrieval – Preprocessing – Modelling




### Using all data

## Imports for the entire Notebook

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -U spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 5.0 MB/s 
[?25hCollecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 82.4 MB/s 
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting thinc<8.1.0,>=8.0.14
  Downloading thinc-8.0.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (660 kB)
[K     |████████████████████████████████| 660 kB 70.0 MB/s 
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux

In [3]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.1 MB/s 
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
!pip install spacy-lookups-data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.3-py2.py3-none-any.whl (98.5 MB)
[K     |████████████████████████████████| 98.5 MB 1.3 MB/s 
Installing collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-1.0.3


In [5]:
#Imports

import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.collocations import *

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


## Data Retrieval and Exploration

In [None]:
#Jakob, you should enter your code here!

## Preprocessing

In [6]:
#Loading a lot of data from csv
"""CSV with 3146 documents"""

data = pd.read_csv("/content/drive/MyDrive/over_2800_docs_scraped.csv")

In [7]:
data.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'title', 'cellar', 'date', 'dir_code',
       'dir_1', 'dir_2', 'dir_3', 'dir_4', 'dir_5', 'dir_6', 'Content'],
      dtype='object')

In [8]:
#Starting the actual Preprocessing
df_content = data.Content

In [9]:
df_content.head()

0    (1) Crowdfunding is increasingly an establishe...
1    (1) The Agreement on the withdrawal of the Uni...
2    (1) On 14 February 2020, the European Commissi...
3    (1) The horizontal principles set out in Artic...
4    (1) Every citizen of the Union has the fundame...
Name: Content, dtype: object

In [10]:
#Preproc-Function

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    
    #tokenized_sentence_cleaned = [w for w in tokenized_sentence_cleaned if not w in ignore_list] COMMENTED IGNORE OUT!
    
    sentences = ' '.join(word for word in tokenized_sentence_cleaned)
    
    #spacy
    nlp = spacy.load('en_core_web_sm', disable=["tok2vec", "tagger", "parser", "attribute_ruler"])
    nlp.remove_pipe("lemmatizer")
    nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()
    doc = nlp(sentences)
    lemmatized = " ".join([token.lemma_ for token in doc])
    
    return lemmatized

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [14]:
# Applying Cleaning Function

clean_txt = df_content.apply(cleaning)

In [None]:
len(clean_txt)

2871

In [None]:
type(clean_txt)

pandas.core.series.Series

In [38]:
clean_txt_df = pd.DataFrame(clean_txt)

In [40]:
#Save cleaned txt as csv

clean_txt_df.to_csv('/content/drive/MyDrive/2800_docs_clean_txt.csv')

In [15]:
all_content = ''.join(clean_txt)

In [16]:
from collections import Counter

# split() returns list of all the words in the string
split_it = all_content.split()

# Pass the split_it list to instance of Counter class.
Counters_found = Counter(split_it)
#print(Counters)

# most_common() produces k frequently encountered
# input values and their respective counts.
most_occur = Counters_found.most_common(100)


In [17]:
most_occur

[('shall', 130848),
 ('–', 117687),
 ('—', 104459),
 ('regulation', 101195),
 ('article', 100317),
 ('’', 56628),
 ('union', 55350),
 ('state', 52679),
 ('eu', 51565),
 ('member', 50829),
 ('commission', 50463),
 ('follow', 48055),
 ('b', 43097),
 ('use', 43044),
 ('‘', 42822),
 ('accordance', 41703),
 ('include', 39163),
 ('provide', 37390),
 ('point', 36896),
 ('refer', 36753),
 ('authority', 34999),
 ('information', 33711),
 ('product', 33245),
 ('c', 29944),
 ('annex', 29126),
 ('may', 28433),
 ('report', 28420),
 ('european', 27230),
 ('datum', 26167),
 ('measure', 25770),
 ('paragraph', 24868),
 ('set', 24258),
 ('little', 22180),
 ('period', 22089),
 ('apply', 22061),
 ('system', 21703),
 ('requirement', 21186),
 ('take', 21055),
 ('ec', 20545),
 ('risk', 20434),
 ('animal', 20311),
 ('country', 20012),
 ('market', 19599),
 ('concern', 19434),
 ('part', 19244),
 ('mean', 18967),
 ('institution', 18435),
 ('implement', 18024),
 ('financial', 17785),
 ('ensure', 17763),
 ('relevan

In [18]:
# list used to remove 10 most frequent words + 'european'

ignore_list = ['shall', 'regulation', 'article', 'union', 'state', 'eu', 'official',  'member', 'commission', 'commission', 'accordance', 'european']

In [19]:
#Function to get rid of these terms

def ignore(sentence):
  tokenized_sentence = word_tokenize(sentence) ## tokenizing 
  cleaned  = [w for w in tokenized_sentence if not w in ignore_list]
  sentence_cleaned = ' '.join(word for word in cleaned)
  return sentence_cleaned

In [20]:
#Applying function

txt_clean = clean_txt.apply(ignore)

In [21]:
txt_clean.head()

0    crowdfunding increasingly establish form alter...
1    agreement withdrawal unite kingdom great brita...
2    february ‘ ’ initiate antidumping investigatio...
3    horizontal principle set treaty teu treaty fun...
4    every citizen fundamental right move reside fr...
Name: Content, dtype: object

In [22]:
#Transforming Series in List

clean_txt = clean_txt.tolist()

# Topic-Modelling with BERTopic




In [23]:
#PIP-installing BERTtopic

!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.10.0-py2.py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 3.1 MB/s 
Collecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 10.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 311 kB/s 
[?25hCollecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 8.1 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 56.2 MB/s 
Collect

In [24]:
from bertopic import BERTopic #BERTtopic-model: https://github.com/MaartenGr/BERTopic

In [25]:
#Training

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(txt_clean)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Batches:   0%|          | 0/90 [00:00<?, ?it/s]

2022-06-03 07:30:44,341 - BERTopic - Transformed documents to Embeddings
2022-06-03 07:31:01,350 - BERTopic - Reduced dimensionality
2022-06-03 07:31:02,063 - BERTopic - Clustered reduced embeddings


In [26]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,457,-1_datum_service_payment_information
1,0,119,0_animal_establishment_disease_consignment
2,1,115,1_institution_exposure_crr_report
3,2,96,2_quota_tariff_licence_quantity
4,3,93,3_device_clinical_notify_covid
...,...,...,...
70,69,11,69_import_duty_wheat_cif
71,70,11,70_design_specially_exceed_great
72,71,11,71_rice_husk_basmati_implement
73,72,11,72_biodiesel_us_taric_usa


In [27]:
topic_model.visualize_topics()

In [28]:
topic_model.visualize_barchart()

In [29]:
topic_model.visualize_hierarchy()

# Reducing amount of Topics

In [31]:
#Reducing the amount of topics and  Training

topic_model_reduced = BERTopic(nr_topics=12, language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model_reduced.fit_transform(txt_clean)

Batches:   0%|          | 0/90 [00:00<?, ?it/s]

2022-06-03 07:41:27,318 - BERTopic - Transformed documents to Embeddings
2022-06-03 07:41:40,440 - BERTopic - Reduced dimensionality
2022-06-03 07:41:41,164 - BERTopic - Clustered reduced embeddings
2022-06-03 07:41:58,139 - BERTopic - Reduced number of topics from 78 to 13


In [32]:
topic_model_reduced.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,946,-1_follow_use_datum_little
1,0,344,0_animal_gminy_od_gemeinde
2,1,207,1_annex_person_list_entity
3,2,190,2_amendment_journal_name_specification
4,3,161,3_fish_vessel_catch_fishery
5,4,148,4_substance_ec_submit_approval
6,5,138,5_additive_fee_premixtures_content
7,6,134,6_programme_support_activity_climate
8,7,132,7_device_medicinal_product_veterinary
9,8,131,8_institution_exposure_crr_report


In [33]:
topic_model_reduced.visualize_barchart()

In [36]:
topic_model_reduced.get_topics()

{-1: [('follow', 0.019531569336935915),
  ('use', 0.019102400953056684),
  ('datum', 0.016555070488379356),
  ('little', 0.01649857816292652),
  ('exceed', 0.015497319838045654),
  ('system', 0.01529580747386529),
  ('include', 0.015038379037968919),
  ('information', 0.014845534550974957),
  ('provide', 0.013824975131096592),
  ('imo', 0.013657215708807584)],
 0: [('animal', 0.05818556213231144),
  ('gminy', 0.039481101630709114),
  ('od', 0.033031318552745666),
  ('gemeinde', 0.028517642204709603),
  ('na', 0.02505676442364942),
  ('product', 0.024218112390753926),
  ('disease', 0.024201078716116386),
  ('establishment', 0.02353706507913952),
  ('plant', 0.023099767636565478),
  ('country', 0.02289228479778682)],
 1: [('annex', 0.057889033856247184),
  ('person', 0.0493181373268216),
  ('list', 0.03982596553737806),
  ('entity', 0.03610132770726065),
  ('council', 0.03342539369870246),
  ('force', 0.03319735414532101),
  ('amend', 0.03136561363679184),
  ('birth', 0.02926558789811775