# BatchLegal Preprocessing

***Base***: preproc final (from Chris):
- Adjusted preprocessing steps: sentence tokenization and multiple lemmatizing steps.
- Changed from CountVectorizer to TfidfVectorizer
- Now spacy is used for lemmatization

In [1]:
!pip install -U spacy



In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
#Imports

import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.collocations import *

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [4]:
# Read the document sample from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="data_19_rows.csv")

In [5]:
# Show which columns the loaded data provides
data.columns

Index(['Unnamed: 0', 'Date of document', 'Title', 'Subtitle', 'CELEX number',
       'EUROVOC descriptor', 'Subject matter', 'Directory code', 'Author',
       'In force indicator', 'Content'],
      dtype='object')

In [6]:
# Keep only the document text column for preprocessing
df_content = data["Content"]

In [18]:
# Prepared function to later integrate notebook into .py files
# def get_content(data):
#     return data.Content

In [7]:
# Preview the first rows of the document texts
df_content.head()

0     (1) Pursuant to Articles 9 and 168 of the Tre...
1     (1) The objective of the Union’s policy on as...
2     (1) The development of health technologies is...
3     (1) The Commission communication of 29 Novemb...
4     (1) The Commission communication of 29 Novemb...
Name: Content, dtype: object

In [6]:
# Domain-specific terms considered irrelevant for the analysis.
# Despite the name this is a set, so membership checks are cheap.
ignore_list = {
    'agency', 'article', 'commission', 'council', 'data', 'directive',
    'ec', 'etf', 'eu', 'european', 'information', 'journal',
    'mdssg', 'member', 'mssg', 'no', 'official', 'regulation',
    'shall', 'states', 'union',
}

In [11]:
# Lazily-filled cache: the stop-word set and the spaCy pipeline are expensive
# to build, so they are created once per kernel instead of once per document.
_CLEANING_RESOURCES = {}

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _get_stop_words():
    """Return the NLTK English stop words as a set, loading them on first use."""
    if 'stop_words' not in _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _CLEANING_RESOURCES['stop_words']


def _get_nlp():
    """Return the spaCy pipeline (parser disabled), loading it on first use."""
    if 'nlp' not in _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['nlp'] = spacy.load('en_core_web_sm', disable=["parser"])
    return _CLEANING_RESOURCES['nlp']


def cleaning(sentence):
    """Normalize one document and return its lemmatized form.

    Steps, in order: strip surrounding whitespace, lowercase, drop digit
    characters, drop ASCII punctuation, tokenize with NLTK, remove English
    stop words, and lemmatize the remaining tokens with spaCy.

    Parameters
    ----------
    sentence : str
        Raw text of a single document.

    Returns
    -------
    str
        The lemmas of the remaining tokens, joined by single spaces.

    Notes
    -----
    * The spaCy model and the stop-word set are loaded once and reused, so
      applying this function to a whole Series does not reload them per row.
    * ``string.punctuation`` is ASCII-only; typographic characters such as
      the curly apostrophe are not removed by the punctuation step.
    * ``word_tokenize`` and ``stopwords`` rely on NLTK data packages; this
      notebook does not show them being downloaded -- TODO confirm they are
      available in the target environment.
    """
    # Basic cleaning
    text = sentence.strip().lower()  ## remove outer whitespace, lowercase
    text = ''.join(char for char in text if not char.isdigit())  ## removing numbers

    # Advanced cleaning
    text = text.translate(_PUNCTUATION_TABLE)  ## removing punctuation
    stop_words = _get_stop_words()
    tokens = [w for w in word_tokenize(text) if w not in stop_words]  ## tokenize, drop stopwords

    # Filtering against ignore_list is intentionally disabled for now:
    # tokens = [w for w in tokens if w not in ignore_list]

    # Lemmatize with spaCy (shared pipeline, see _get_nlp)
    doc = _get_nlp()(' '.join(tokens))
    return " ".join(token.lemma_ for token in doc)

In [12]:
# Run the cleaning function on every document in the Content column

clean_txt = df_content.apply(cleaning)

In [13]:
# Inspect the cleaned, lemmatized documents produced by the preprocessing
clean_txt


0     pursuant article treaty function european unio...
1     objective union ' policy asylum develop establ...
2     development health technology key driver econo...
3     commission communication november entitle ' fu...
4     commission communication november entitle ' fu...
5     technical difficulty breed due complex genetic...
6     agreement withdrawal united kingdom great brit...
7     regulation eu european parliament council expi...
8     december commission adopt communication entitl...
9     directive ec european parliament council lay r...
10    regulation ec european parliament council subs...
11    context evolve migratory challenge characteris...
12    union ' objective ensure high level security w...
13    national security remain solely competence mem...
14    order achieve smart sustainable inclusive grow...
15    european maritime fishery aquaculture fund ' e...
16    existential threat pose climate change require...
17    customs office situate external border uni

In [14]:
# Show the full cleaned text of the document with index label 0
clean_txt[0]



In [11]:
# TF-IDF vectorization
# NOTE: ngram_range=(1, 1) extracts unigrams only; use (1, 2) to also include bigrams.

vectorizer_n_gram = TfidfVectorizer(ngram_range = (1,1)) # unigrams only
cleaned_vectorizer_n_gram = vectorizer_n_gram.fit_transform(clean_txt)

In [None]:
# function for integration to .py file later
# def vectorizer(clean_txt):
#     vectorizer_n_gram = TfidfVectorizer(ngram_range = (1,1)) # unigrams only
#     cleaned_vectorizer_n_gram = vectorizer_n_gram.fit_transform(clean_txt)
#     return cleaned_vectorizer_n_gram

In [12]:
# Dense document-term table: one row per document, one column per vocabulary term
df = pd.DataFrame(
    data=cleaned_vectorizer_n_gram.toarray(),
    columns=vectorizer_n_gram.get_feature_names_out(),
)