# Batch Legal – Pipeline Week 1: Data Retrieval – Preprocessing – Modelling




### Using all data

## Imports for the entire Notebook

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install -U spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 35.4 MB/s 
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
!pip install spacy-lookups-data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
#Imports

import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.collocations import *

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


## Data Retrieval and Exploration

In [None]:
#Jakob, you should enter your code here!

## Preprocessing

In [5]:
#Loading a lot of data from csv
"""CSV with 3146 documents"""

data = pd.read_csv("/content/drive/MyDrive/over_2800_docs_scraped.csv")

In [6]:
data.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'title', 'cellar', 'date', 'dir_code',
       'dir_1', 'dir_2', 'dir_3', 'dir_4', 'dir_5', 'dir_6', 'Content'],
      dtype='object')

In [7]:
#Starting the actual Preprocessing

In [7]:
df_content = data.Content

In [9]:
df_content.head()

0    (1) Crowdfunding is increasingly an establishe...
1    (1) The Agreement on the withdrawal of the Uni...
2    (1) On 14 February 2020, the European Commissi...
3    (1) The horizontal principles set out in Artic...
4    (1) Every citizen of the Union has the fundame...
Name: Content, dtype: object

In [8]:
#Preproc-Function

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    
    #tokenized_sentence_cleaned = [w for w in tokenized_sentence_cleaned if not w in ignore_list] COMMENTED IGNORE OUT!
    
    sentences = ' '.join(word for word in tokenized_sentence_cleaned)
    
    #spacy
    nlp = spacy.load('en_core_web_sm', disable=["tok2vec", "tagger", "parser", "attribute_ruler"])
    nlp.remove_pipe("lemmatizer")
    nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()
    doc = nlp(sentences)
    lemmatized = " ".join([token.lemma_ for token in doc])
    
    return lemmatized

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Applying Cleaning Function

clean_txt = df_content.apply(cleaning)

In [13]:
len(clean_txt)

2871

In [14]:
type(clean_txt)

pandas.core.series.Series

In [15]:
all_content = ''.join(clean_txt)

In [16]:
from collections import Counter

# split() returns list of all the words in the string
split_it = all_content.split()

# Pass the split_it list to instance of Counter class.
Counters_found = Counter(split_it)
#print(Counters)

# most_common() produces k frequently encountered
# input values and their respective counts.
most_occur = Counters_found.most_common(100)


In [17]:
most_occur

[('shall', 130848),
 ('–', 117687),
 ('—', 104459),
 ('regulation', 101195),
 ('article', 100317),
 ('’', 56628),
 ('union', 55350),
 ('state', 52679),
 ('eu', 51565),
 ('member', 50829),
 ('commission', 50463),
 ('follow', 48055),
 ('b', 43097),
 ('use', 43044),
 ('‘', 42822),
 ('accordance', 41703),
 ('include', 39163),
 ('provide', 37390),
 ('point', 36896),
 ('refer', 36753),
 ('authority', 34999),
 ('information', 33711),
 ('product', 33245),
 ('c', 29944),
 ('annex', 29126),
 ('may', 28433),
 ('report', 28420),
 ('european', 27230),
 ('datum', 26167),
 ('measure', 25770),
 ('paragraph', 24868),
 ('set', 24258),
 ('little', 22180),
 ('period', 22089),
 ('apply', 22061),
 ('system', 21703),
 ('requirement', 21186),
 ('take', 21055),
 ('ec', 20545),
 ('risk', 20434),
 ('animal', 20311),
 ('country', 20012),
 ('market', 19599),
 ('concern', 19434),
 ('part', 19244),
 ('mean', 18967),
 ('institution', 18435),
 ('implement', 18024),
 ('financial', 17785),
 ('ensure', 17763),
 ('relevan

In [18]:
# list used to remove 10 most frequent words + 'european'

ignore_list = ['shall', 'regulation', 'article', 'union', 'state', 'eu', 'official',  'member', 'commission', 'commission', 'accordance', 'european']

In [19]:
#Function to get rid of these terms

def ignore(sentence):
  tokenized_sentence = word_tokenize(sentence) ## tokenizing 
  cleaned  = [w for w in tokenized_sentence if not w in ignore_list]
  sentence_cleaned = ' '.join(word for word in cleaned)
  return sentence_cleaned

In [24]:
#Applying function

txt_clean = clean_txt.apply(ignore)

In [25]:
txt_clean.head()

0    crowdfunding increasingly establish form alter...
1    agreement withdrawal unite kingdom great brita...
2    february ‘ ’ initiate antidumping investigatio...
3    horizontal principle set treaty teu treaty fun...
4    every citizen fundamental right move reside fr...
Name: Content, dtype: object

In [None]:
#Transforming Series in List

clean_txt = clean_txt.tolist()

# Topic-Modelling with BERTopic




In [None]:
#PIP-installing BERTtopic

!pip install bertopic

In [27]:
from bertopic import BERTopic #BERTtopic-model: https://github.com/MaartenGr/BERTopic

In [28]:
#Training

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(txt_clean)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Batches:   0%|          | 0/90 [00:00<?, ?it/s]

2022-06-02 17:29:25,873 - BERTopic - Transformed documents to Embeddings
2022-06-02 17:29:44,058 - BERTopic - Reduced dimensionality
2022-06-02 17:29:44,930 - BERTopic - Clustered reduced embeddings


In [29]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,525,-1_design_follow_exceed_device
1,0,100,0_institution_exposure_crr_report
2,1,95,1_quota_tariff_licence_quantity
3,2,94,2_animal_establishment_disease_consignment
4,3,89,3_covid_health_pandemic_support
...,...,...,...
74,73,11,73_additive_fee_spectrometry_ethoxyquin
75,74,11,74_import_duty_wheat_cif
76,75,10,75_eppo_prosecutor_confiscation_execute
77,76,10,76_threshold_directive_eur_procurement


In [30]:
topic_model.visualize_topics()

In [31]:
topic_model.visualize_barchart()

In [32]:
topic_model.visualize_hierarchy()