In [38]:
import pandas as pd
import nltk
#nltk.download('punkt')

In [39]:
# Load the dataset from a pickle file given by a path relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
dataset = pd.read_pickle('other_xmltodict_all_major_fields.pkl')

### Creating the corpus

In [12]:
def make_corpus(dataset, column):
    '''Build a stopword-filtered corpus and a stemmed corpus from a text column.

    Each document is lower-cased, stripped of ASCII punctuation
    (``string.punctuation``), has its newlines replaced by spaces, is
    tokenized with NLTK's ``word_tokenize`` and has English stopwords removed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the documents, one per row.
    column : str
        Name of the column containing the raw text (e.g. 'text_column').
        Every value in that column is expected to be a string.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(filtered_text_list, stemmed_text_list)``; both can be obtained
        directly or via tuple unpacking. Each entry is one document's tokens
        joined by single spaces: the first list without stopwords, the second
        with those same tokens stemmed by ``PorterStemmer``.

    Notes
    -----
    Progress is written to stdout on a single, continuously rewritten line.
    NLTK's tokenizer and stopword data must already be downloaded.
    '''
    # Imports are kept local so the function stays self-contained when its
    # cell is run on its own.
    import re
    import string
    import sys
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.tokenize import word_tokenize

    stop_words = set(stopwords.words('english'))
    translator = str.maketrans('', '', string.punctuation)
    ps = PorterStemmer()

    # Pull the column out once instead of building a full row Series per record.
    documents = dataset[column]
    total = len(documents)

    filtered_text_list = []
    stemmed_text_list = []
    for position, raw_text in enumerate(documents, start=1):
        text = raw_text.lower().translate(translator)
        text = re.sub('\n', ' ', text)
        filtered_text = [w for w in word_tokenize(text) if w not in stop_words]
        stemmed_text = [ps.stem(w) for w in filtered_text]

        # Join tokens directly. Going through str(list) would repr-escape
        # non-printable characters (e.g. '\u200b' -> 'u200b' once the
        # backslash is stripped) and corrupt the token.
        filtered_text_list.append(' '.join(filtered_text))
        stemmed_text_list.append(' '.join(stemmed_text))

        sys.stdout.write("\r" + "Creating Corpus.. Processing Record: " + str(position) + " of " + str(total))
        sys.stdout.flush()
    print("\nDONE!")
    return filtered_text_list, stemmed_text_list

### Creating the Sparse Matrix

In [41]:
# Build both corpora from the 'section_bodies' column: one with stopwords
# removed, one additionally Porter-stemmed.
filtered_text, stemmed_text = make_corpus(dataset,'section_bodies')

Creating Corpus.. Processing Record: 4109 of 4109
DONE!


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Bag-of-words vectorizer using scikit-learn's default settings.
cv = CountVectorizer()

In [23]:
# Learn the vocabulary from the stopword-filtered corpus and build the
# document-term count matrix.
# NOTE(review): .toarray() converts the sparse result into a dense array; at
# (4109, 18639) int64 that is roughly 610 MB -- consider keeping the matrix
# sparse if downstream code allows.
X = cv.fit_transform(filtered_text).toarray()

In [27]:
# Peek at the dense count matrix (numpy abbreviates the display of large arrays).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
# (number of documents, vocabulary size)
X.shape

(4109, 18639)

In [31]:
# Mapping from each vocabulary term to its column index in X
# (these are feature indices, not word counts).
cv.vocabulary_

{'contravention': 6316,
 'provision': 13561,
 'environmental': 7937,
 'act': 3701,
 'regulations': 14131,
 'set': 15490,
 'column': 5874,
 'schedule': 14882,
 'designated': 7023,
 'violation': 18179,
 'may': 11109,
 'proceeded': 13388,
 'accordance': 3629,
 'order': 12128,
 'direction': 7192,
 'made': 10926,
 'failure': 8313,
 'comply': 6021,
 'condition': 6093,
 'permit': 12875,
 'licence': 10680,
 'authorization': 4512,
 'issued': 10230,
 'referred': 14037,
 'classified': 5707,
 'type': 17708,
 'respective': 14424,
 'amount': 4052,
 'penalty': 12810,
 'determined': 7083,
 'formula': 8719,
 'baseline': 4665,
 'corresponds': 6425,
 'category': 5406,
 'violator': 18181,
 'committed': 5934,
 'columns': 5876,
 'respectively': 14425,
 'history': 9408,
 'noncompliance': 11689,
 'five': 8544,
 'years': 18594,
 'preceding': 13216,
 'purpose': 13640,
 'subsection': 16426,
 'means': 11121,
 'imposition': 9673,
 'ticket': 17351,
 'conviction': 6361,
 'injunction': 9918,
 'use': 17991,
 'protecti