# Batch Legal LDA Grid Search

***Base***: Preproc Mockup Version 3 (from Chris):
- Adjusted preprocessing steps: sentence tokenization and multiple lemmatizing steps.
- Changed from CountVectorizer to TfidfVectorizer
- Changed input from one document to list of documents

In [1]:
#Imports

import pandas as pd
import string
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.collocations import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from sklearn.model_selection import GridSearchCV

In [2]:
#Loading data from csv
data = pd.read_csv("larger_data_scraped.csv")

In [3]:
data.columns

Index(['Unnamed: 0', 'title', 'cellar', 'date', 'dir_code', 'dir_1', 'dir_2',
       'dir_3', 'dir_4', 'dir_5', 'dir_6', 'Content'],
      dtype='object')

In [4]:
df_content = data.Content

In [5]:
df_content.head()

0    THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE...
1    THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE...
2    THE EUROPEAN COMMISSION, Having regard to the ...
3    THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE...
4    THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE...
Name: Content, dtype: object

In [6]:
# list used to remove unrelevant terms 
ignore_list = {'ec', 'no', 'european', 'commission', 'eu', 'union',
                   'article', 'directive', 'council', 'regulation', 'official',
                   'journal', 'article', 'information', 'agency', 'regulation',
                   'mssg', 'data', 'member', 'states', 'etf', 'mdssg', 'shall'
                  }

In [7]:
#Defining Davy's Preproc-Function

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    #tokenized_sentence_cleaned = [w for w in tokenized_sentence_cleaned if not w in ignore_list] COMMENTED IGNORE OUT!
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in tokenized_sentence_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
              for word in tokenized_sentence_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "a") # Lemmatizing the adjectives
              for word in tokenized_sentence_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "r") # Lemmatizing the adverbs
              for word in tokenized_sentence_cleaned]
    cleaned_sentence = ' '.join(word for word in lemmatized)
    return cleaned_sentence

In [8]:
# Applying Davy's Function

clean_txt = df_content.apply(cleaning)

In [9]:
#Checking outcome of Preprocessing
clean_txt


0     european parliament council european union reg...
1     european parliament council european union reg...
2     european commission regard treaty functioning ...
3     european parliament council european union reg...
4     european parliament council european union reg...
                            ...                        
68    european commission regard treaty functioning ...
69    european commission regard treaty functioning ...
70    european commission regard treaty functioning ...
71    european commission regard treaty functioning ...
72    european commission regard treaty functioning ...
Name: Content, Length: 73, dtype: object

In [10]:
clean_txt[0]



In [11]:
# bigram vectorization

vectorizer_n_gram = TfidfVectorizer(ngram_range = (1,1)) # BI-GRAMS
cleaned_vectorizer_n_gram = vectorizer_n_gram.fit_transform(clean_txt)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.DataFrame(cleaned_vectorizer_n_gram.toarray(), columns=vectorizer_n_gram.get_feature_names_out())


In [13]:
df.head()

Unnamed: 0,aa,aarhus,aas,ab,ababa,abandoned,abatement,abbreviation,abbreviations,abeyance,...,βapocarotenoic,βglucan,βsesquiphellandrene,βturmerone,δgdpi,κi,νetwork,νi,υi,ﬁnalised
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.027864,0.0,0.0,0.018994,0.0,0.0,0.002484,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
#Topic model function from ML-10-lecture
def print_topics(model, vectorizer, top_words):
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print([(vectorizer_n_gram.get_feature_names_out()[i], round(topic[i],2))
                        for i in topic.argsort()[:-top_words - 1:-1]])

In [15]:
# Instantiating the LDA 
n_components = 3
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fitting the LDA on the vectorized documents
lda_model.fit(df)

In [16]:
print_topics(lda_model, cleaned_vectorizer_n_gram, top_words=10)

--------------------
Topic 0:
[('imo', 0.89), ('iec', 0.76), ('reg', 0.65), ('solas', 0.65), ('resmsc', 0.62), ('en', 0.62), ('hsc', 0.54), ('ed', 0.49), ('code', 0.48), ('incl', 0.44)]
--------------------
Topic 1:
[('code', 0.33), ('en', 0.33), ('resmsc', 0.33), ('solas', 0.33), ('reg', 0.33), ('iec', 0.33), ('hsc', 0.33), ('imo', 0.33), ('ed', 0.33), ('incl', 0.33)]
--------------------
Topic 2:
[('regulation', 9.72), ('shall', 7.32), ('article', 6.53), ('union', 6.5), ('commission', 5.1), ('european', 4.85), ('member', 4.74), ('additive', 4.63), ('eu', 4.51), ('states', 3.59)]


In [17]:
#Grid-Search

# Instantiating the LDA
lda = LatentDirichletAllocation()

# Hyperparameter Grid
search_params = {'n_components': [2, 4, 6, 8], 
                 'learning_decay': [.5, .7, .9]}

# Instanciate Grid Search
g_search = GridSearchCV(lda, search_params) 


# Getting topics
topics = g_search.fit(df)

In [18]:
# Best score
g_search.best_score_

-2059.352343662646

In [19]:
# Best Params
g_search.best_params_

{'learning_decay': 0.5, 'n_components': 2}

In [20]:
# Best estimator
g_search.best_estimator_

In [21]:
#Printing topics

print_topics(g_search.best_estimator_, cleaned_vectorizer_n_gram, top_words = 8)

--------------------
Topic 0:
[('regulation', 9.89), ('shall', 7.48), ('article', 6.69), ('union', 6.66), ('commission', 5.26), ('european', 5.02), ('member', 4.9), ('additive', 4.79)]
--------------------
Topic 1:
[('imo', 1.05), ('iec', 0.92), ('reg', 0.82), ('solas', 0.81), ('resmsc', 0.79), ('en', 0.78), ('hsc', 0.7), ('ed', 0.66)]


# Comparing Outcome with BERTopic results

In [22]:
#PIP-installing BERTtopic

#!pip install bertopic

In [23]:
from bertopic import BERTopic #BERTtopic-model: https://github.com/MaartenGr/BERTopic

In [24]:
#Training

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(clean_txt)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2022-06-01 16:25:49,207 - BERTopic - Transformed documents to Embeddings
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2022-06-01 16:25:53,471 - BERTopic - Reduced dimensionality
2022-06-01 16:25:53,480 - BERTopic - Clustered reduced embeddings


In [25]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,34,0_shall_article_regulation_union
1,1,26,1_imo_iec_union_en
2,2,13,2_additive_feed_premixtures_content
