In [20]:
import pandas as pd
import numpy as np 
from nltk.corpus import stopwords # get stopwords to remove
import re # regular expression
import string # used to remove punctuation
from gensim.models import Word2Vec # for word embeddings
# English stop-word list from NLTK. In this notebook it is only referenced by
# the commented-out stop-word filter in the preprocessing cell.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand — confirm in a fresh environment.
stop = stopwords.words('english')

# Multi-label classification

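Goal: predict the reasons (violations) cited in each document, where a document is identified by its `docket_num`. A single document can have several reasons, which is why this is a multi-label problem.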

# Read in data

In [21]:
# Input files: the document text and the target classifications.
TEXT_CSV = './data/clean_mea_text.csv'
REASONS_CSV = './data/mea_reasons_filtered.csv'

raw_text = pd.read_csv(TEXT_CSV)    # one row per document, holds the raw text
reasons = pd.read_csv(REASONS_CSV)  # the reasons we want to predict

In [22]:
# Preview the first rows of the text table, followed by its shape.
print(raw_text.head(), raw_text.shape, sep='\n')

         date docket_num                                               text  \
0  2009-11-18     09_160  STATE OF NORTH CAROLINA\nWAKE COUNTY\nIN A MAT...   
1  2009-11-18     09_164  STATE OF NORTH CAROLINA\nWAKE COUNTY\nIN A MAT...   
2  2009-10-16    09_142B  OAH File No. 10 COB 2895\nSTATE OF NORTH CAROL...   
3  2009-09-09     09_081  STATE OF NORTH CAROLINA\nWAKE COUNTY\nIN A MAT...   
4  2009-08-24     09_070  STATE OF NORTH CAROLINA\nWAKE COUNTY\nIN A MAT...   

   year  month  day  
0  2009     11   18  
1  2009     11   18  
2  2009     10   16  
3  2009      9    9  
4  2009      8   24  
(177, 6)


In [23]:
# Preview the first rows of the label table, followed by its shape.
print(reasons.head(), reasons.shape, sep='\n')

  docket_num        date                                        reason
0     09_160  11/18/2009                    Conspiracy to commit fraud
1     09_164  11/18/2009                    Conspiracy to commit fraud
2    09_142B  10/16/2009                     Allowed unlawful activity
3     09_081    9/9/2009  Falsification and misrepresentation of loans
4     09_070   8/24/2009                       Retained borrower funds
(375, 3)


In [28]:
# The two tables do not cover exactly the same dockets, so keep only the
# docket numbers that appear in both of them.
bothdocs = set(raw_text.docket_num.values) & set(reasons.docket_num.values)

in_both_text = raw_text.docket_num.isin(bothdocs)
in_both_reasons = reasons.docket_num.isin(bothdocs)
raw_text = raw_text[in_both_text]
reasons = reasons[in_both_reasons]

# Shapes after filtering: text table first, then label table.
print(raw_text.shape, reasons.shape, sep='\n')

(169, 6)
(359, 3)


# Preprocessing

In [29]:
# Compiled once instead of on every row.
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _clean_text(text):
    """Normalise one document into lowercase, space-separated words.

    Steps, in order:
      1. lowercase;
      2. replace each run of digits with a space (numbers are not needed);
      3. delete punctuation (characters are removed, not replaced, so
         "and/or" becomes "andor");
      4. collapse every run of whitespace into a single space;
      5. strip leading/trailing whitespace.

    Whitespace is collapsed only once, at the end: digit and punctuation
    removal never touch whitespace characters, so an earlier collapse would
    not change the result. The final strip is what keeps a later
    ``split(' ')`` from producing empty-string tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = _DIGITS_RE.sub(' ', text)
    text = text.translate(_PUNCT_TABLE)
    text = _WHITESPACE_RE.sub(' ', text)
    return text.strip()


# astype(str) first so that non-string cells (e.g. missing values) do not
# break the string operations in _clean_text.
raw_text['text'] = raw_text['text'].astype(str).apply(_clean_text)
# Remove stop words - don't do if using CBOW since context of word is important
# raw_text['text'] = raw_text['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [30]:
# Show the first five rows to eyeball the cleaned text column.
raw_text.head(n=5)

Unnamed: 0,date,docket_num,text,year,month,day
0,2009-11-18,09_160,state of north carolina wake county in a matte...,2009,11,18
1,2009-11-18,09_164,state of north carolina wake county in a matte...,2009,11,18
2,2009-10-16,09_142B,oah file no cob state of north carolina eee wa...,2009,10,16
3,2009-09-09,09_081,state of north carolina wake county in a matte...,2009,9,9
4,2009-08-24,09_070,state of north carolina wake county in a matte...,2009,8,24


# Vectorize

We train a Word2Vec model with the Continuous Bag of Words (CBOW) architecture, which is gensim's default, to create the word embeddings that will be used as input to our ML models.

In [35]:
# gensim needs a list of token lists as input: one list of words per document.
# str.split() with no argument splits on any run of whitespace and never
# returns empty strings, whereas split(' ') yields '' for leading/trailing
# spaces -- and with min_count=1 that '' token would get its own embedding.
textls = [t.split() for t in raw_text.text]
wordmodel = Word2Vec(textls, min_count=1) # min_count=1 because we're not sure if relevant words occur multiple times
# `wv.syn0` is deprecated (and removed in gensim 4.0); `wv.vectors` is the
# replacement named by the deprecation warning and holds the embedding matrix.
embedls = wordmodel.wv.vectors # our word embeddings

  after removing the cwd from sys.path.
