# Batch Legal Preproc Mockup Model V2

Adjusted preprocessing steps: No sentence tokenization in this version.

In [26]:
#Imports

import pandas as pd
import string
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.collocations import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [95]:
#Loading data from csv
data = pd.read_csv("../raw_data/test_data_scraped.csv")

In [98]:
#Tokenizing the sentences

test_data = data['content'][0]
type(test_data)

str

Ignore list should be updated after once running the preproc on the entire dataset 

In [99]:
# list used to remove unrelevant terms 
ignore_list = {'ec', 'no', 'european', 'commission', 'eu', 'union',
                   'article', 'directive', 'council', 'regulation', 'official',
                   'journal', 'article', 'information', 'agency', 'regulation',
                   'mssg', 'data', 'member', 'states', 'etf', 'mdssg', 'shall'
                  }

In [100]:
#Defining Davy's Preproc-Function

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence_cleaned
                                  if not w in ignore_list]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in tokenized_sentence_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
              for word in tokenized_sentence_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "a") # Lemmatizing the adjectives
              for word in tokenized_sentence_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "r") # Lemmatizing the adverbs
              for word in tokenized_sentence_cleaned]
    cleaned_sentence = ' '.join(word for word in lemmatized)
    return cleaned_sentence

In [102]:
# Applying Davy's Function

clean_txt = cleaning(test_data)

In [103]:
#Checking outcome of Preprocessing
clean_txt



In [84]:
#Vectorizing data

# vectorizer = CountVectorizer()
# vectorized_text = vectorizer.fit_transform(clean_txt)

In [109]:
# bigram vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_n_gram = TfidfVectorizer(ngram_range = (1,1)) # BI-GRAMS
cleaned_vectorizer_n_gram = vectorizer_n_gram.fit_transform([clean_txt])

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer


df = pd.DataFrame(cleaned_vectorizer_n_gram.toarray(), columns=vectorizer_n_gram.get_feature_names_out())
df

Unnamed: 0,ability,able,absence,academia,accelerated,accelerating,access,accessed,accessible,accessories,...,without,work,working,world,worsening,would,year,years,zoonoses,zoonotic
0,0.003103,0.012412,0.003103,0.003103,0.003103,0.001551,0.012412,0.001551,0.001551,0.003103,...,0.026375,0.034132,0.046544,0.004654,0.001551,0.012412,0.001551,0.001551,0.004654,0.001551


In [116]:
df.T[0]

ability        0.003103
able           0.012412
absence        0.003103
academia       0.003103
accelerated    0.003103
                 ...   
would          0.012412
year           0.001551
years          0.001551
zoonoses       0.004654
zoonotic       0.001551
Name: 0, Length: 1517, dtype: float64

In [117]:
#Modelling

# Instantiating the LDA 
n_components = 3
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fitting the LDA on the vectorized documents
lda_model.fit(cleaned_vectorizer_n_gram)

# Getting topics
topics = lda_model.transform(cleaned_vectorizer_n_gram)

In [118]:
#Topic model function from ML-10-lecture
def print_topics(model, vectorizer_n_gram, top_words):
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print([(vectorizer_n_gram.get_feature_names_out()[i], round(topic[i],2))
                        for i in topic.argsort()[:-top_words - 1:-1]])

In [108]:
#Printing topics

print_topics(lda_model, vectorizer_n_gram, top_words = 8)

--------------------
Topic 0:


NotFittedError: Vocabulary not fitted or provided