# Batch Legal LDA Grid Search

***Base***: Preproc Mockup Version 3 (from Chris):
- Adjusted preprocessing steps: sentence tokenization and multiple lemmatizing steps.
- Changed from CountVectorizer to TfidfVectorizer

In [1]:
#Imports

import pandas as pd
import string
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.collocations import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from sklearn.model_selection import GridSearchCV

In [2]:
#Loading data from csv
data = pd.read_csv("test_data_scraped_new.csv")

In [3]:
data.columns

Index(['Unnamed: 0', 'Date of document', 'Title', 'Subtitle', 'CELEX number',
       'EUROVOC descriptor', 'Subject matter', 'Directory code', 'Author',
       'In force indicator', 'Content'],
      dtype='object')

In [4]:
df_content = data.Content

In [5]:
df_content.head()

0     (1) Pursuant to Articles 9 and 168 of the Tre...
1     (1) The objective of the Union’s policy on as...
2     (1) The development of health technologies is...
3     (1) The Commission communication of 29 Novemb...
4     (1) The Commission communication of 29 Novemb...
Name: Content, dtype: object

In [6]:
# list used to remove unrelevant terms 
ignore_list = {'ec', 'no', 'european', 'commission', 'eu', 'union',
                   'article', 'directive', 'council', 'regulation', 'official',
                   'journal', 'article', 'information', 'agency', 'regulation',
                   'mssg', 'data', 'member', 'states', 'etf', 'mdssg', 'shall'
                  }

In [7]:
#Defining Davy's Preproc-Function

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    #tokenized_sentence_cleaned = [w for w in tokenized_sentence_cleaned if not w in ignore_list] COMMENTED IGNORE OUT!
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in tokenized_sentence_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
              for word in tokenized_sentence_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "a") # Lemmatizing the adjectives
              for word in tokenized_sentence_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "r") # Lemmatizing the adverbs
              for word in tokenized_sentence_cleaned]
    cleaned_sentence = ' '.join(word for word in lemmatized)
    return cleaned_sentence

In [8]:
# Applying Davy's Function

clean_txt = df_content.apply(cleaning)

In [9]:
#Checking outcome of Preprocessing
clean_txt


0     pursuant articles treaty functioning european ...
1     objective union ’ policy asylum develop establ...
2     development health technologies key driver eco...
3     commission communication november entitled ‘ f...
4     commission communication november entitled ‘ f...
5     technical difficulties breeding due complex ge...
6     agreement withdrawal united kingdom great brit...
7     regulation eu european parliament council expi...
8     december commission adopted communication enti...
9     directive ec european parliament council lays ...
10    regulation ec european parliament council subs...
11    context evolving migratory challenges characte...
12    union ’ objective ensuring high level security...
13    national security remains solely competence me...
14    order achieve smart sustainable inclusive grow...
15    european maritime fisheries aquaculture fund ‘...
16    existential threat posed climate change requir...
17    customs offices situated external borders 

In [13]:
clean_txt[0]



In [10]:
# bigram vectorization

vectorizer_n_gram = TfidfVectorizer(ngram_range = (1,1)) # BI-GRAMS
cleaned_vectorizer_n_gram = vectorizer_n_gram.fit_transform(clean_txt)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.DataFrame(cleaned_vectorizer_n_gram.toarray(), columns=vectorizer_n_gram.get_feature_names_out())


In [12]:
df.head()

Unnamed: 0,aa,ab,abandoned,abeyance,ability,able,abnormal,abnormally,abovementioned,abroad,...,york,young,youth,zagreb,zero,zone,zones,zoonoses,zoonotic,μgm
0,0.0,0.0,0.0,0.0,0.002509,0.0077,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007241,0.002414,0.0
1,0.002244,0.002244,0.0,0.0,0.0,0.016109,0.0,0.0,0.0,0.0,...,0.003945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.008168,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.007169,0.0,0.0,0.0,0.0,...,0.0,0.041537,0.000816,0.0,0.000659,0.0,0.000891,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.012245,0.002193,0.00658,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003479,0.0,0.0,0.0


In [16]:
#Topic model function from ML-10-lecture
def print_topics(model, vectorizer, top_words):
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print([(vectorizer_n_gram.get_feature_names_out()[i], round(topic[i],2))
                        for i in topic.argsort()[:-top_words - 1:-1]])

In [14]:
# Instantiating the LDA 
n_components = 2
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fitting the LDA on the vectorized documents
lda_model.fit(df)

In [20]:
print_topics(lda_model, cleaned_vectorizer_n_gram, top_words=10)

--------------------
Topic 0:
[('species', 0.5), ('variety', 0.5), ('plant', 0.5), ('woody', 0.5), ('varieties', 0.5), ('currency', 0.5), ('service', 0.5), ('provider', 0.5), ('payer', 0.5), ('payment', 0.5)]
--------------------
Topic 1:
[('shall', 4.53), ('article', 3.96), ('regulation', 3.86), ('union', 3.36), ('member', 3.25), ('commission', 2.54), ('eu', 2.31), ('states', 2.29), ('support', 2.19), ('referred', 2.15)]


In [28]:
#Grid-Search

# Instantiating the LDA
lda = LatentDirichletAllocation()

# Hyperparameter Grid
search_params = {'n_components': [2, 4, 6, 8, 10, 12, 14], 
                 'learning_decay': [.5, .7, .9]}

# Init the Model

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)


# Instanciate Grid Search
g_search = GridSearchCV(lda_model, grid) 


# Getting topics
topics = g_search.fit(df)

In [29]:
# Best score
g_search.best_score_

-1148.7429999909305

In [30]:
# Best Params
g_search.best_params_

{'learning_decay': 0.5, 'n_components': 10}

In [31]:
# Best estimator
g_search.best_estimator_

In [32]:
#Printing topics

print_topics(g_search.best_estimator_, cleaned_vectorizer_n_gram, top_words = 8)

--------------------
Topic 0:
[('shall', 4.14), ('article', 3.56), ('regulation', 3.46), ('union', 2.97), ('member', 2.86), ('commission', 2.14), ('eu', 1.91), ('states', 1.89)]
--------------------
Topic 1:
[('card', 0.1), ('markup', 0.1), ('broadly', 0.1), ('larger', 0.1), ('complex', 0.1), ('comprehensible', 0.1), ('messages', 0.1), ('message', 0.1)]
--------------------
Topic 2:
[('medicinal', 0.61), ('species', 0.56), ('variety', 0.49), ('plant', 0.48), ('woody', 0.42), ('varieties', 0.38), ('mssg', 0.33), ('shortages', 0.29)]
--------------------
Topic 3:
[('card', 0.1), ('markup', 0.1), ('broadly', 0.1), ('larger', 0.1), ('complex', 0.1), ('comprehensible', 0.1), ('messages', 0.1), ('message', 0.1)]
--------------------
Topic 4:
[('abuse', 0.61), ('sexual', 0.5), ('currency', 0.47), ('service', 0.45), ('child', 0.43), ('provider', 0.43), ('payer', 0.39), ('online', 0.35)]
--------------------
Topic 5:
[('card', 0.1), ('markup', 0.1), ('broadly', 0.1), ('larger', 0.1), ('complex'