In [2]:
from convokit import Corpus, download, TextCleaner, TextParser, BoWTransformer
import pandas as pd
import json

In [3]:
corpus = Corpus(filename="/Users/vaughnfranz/.convokit/downloads/supreme-corpus")

## Data Preprocessing 
These first steps use the built in functionality of convokit. 

The TextCleaner will, by default:
- fix unicode errors, transliterate text to the closest ASCII representation
- lowercase text
- remove line breaks
- replace URLs, emails, phone numbers, numbers, and currency symbols with special tokens

The cleaner will operate by default on the utterances (specifically, utterance.text).

In [4]:
corpus = TextCleaner(verbosity=250000).transform(corpus)

250000/1700789 utterances processed
500000/1700789 utterances processed
750000/1700789 utterances processed
1000000/1700789 utterances processed
1250000/1700789 utterances processed
1500000/1700789 utterances processed
1700789/1700789 utterances processed


The cleaner can also perform custom cleaning. It takes as an input parameter 'text_cleaner' which is a function that produces the cleaned text. Let's use this to perform a couple of additional cleaning steps:
- remove stop words
- remove punctuation
- stem our text

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import string 
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

def custom_cleaner(text):
    toks = word_tokenize(text)
    toks = [word for word in toks if not word in string.punctuation]
    toks = [word for word in toks if not word in stop_words]
    toks = [porter.stem(word) for word in toks]
    cleaned = " ".join(toks)
    return cleaned

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vaughnfranz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
corpus = TextCleaner(verbosity=250000,text_cleaner=custom_cleaner).transform(corpus)

250000/1700789 utterances processed
500000/1700789 utterances processed
750000/1700789 utterances processed
1000000/1700789 utterances processed
1250000/1700789 utterances processed
1500000/1700789 utterances processed
1700789/1700789 utterances processed


In [14]:
utterances_df = corpus.get_utterances_dataframe()

In [15]:
utterances_df.head()

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.case_id,meta.start_times,meta.stop_times,meta.speaker_type,meta.side,meta.timestamp,meta.original,vectors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
13127__0_000,,number number lonni affronti versu unit state ...,j__earl_warren,,13127,1955_71,"[0.0, 7.624]","[7.624, 9.218]",J,,0.0,"number <number>, lonnie affronti versus united...",[]
13127__0_001,,may pleas court writ certiorari eighth circuit...,harry_f_murphy,13127__0_000,13127,1955_71,"[9.218, 11.538, 15.653, 22.722, 28.849, 33.575]","[11.538, 15.653, 22.722, 28.849, 33.575, 48.138]",A,1.0,9.218,may it please the court. we are here by writ o...,[]
13127__0_002,,consecut sentenc,j__william_o_douglas,13127__0_001,13127,1955_71,[48.138],[49.315],J,,48.138,consecutive sentences.,[]
13127__0_003,,consecut sentenc case defend affronti indict n...,harry_f_murphy,13127__0_002,13127,1955_71,"[49.315, 51.844, 60.81, 67.083, 72.584, 89.839...","[51.844, 60.81, 67.083, 72.584, 89.839, 95.873...",A,1.0,49.315,"consecutive sentences. in this case, the defen...",[]
13127__0_004,,aggreg prison sentenc number number year,<INAUDIBLE>,13127__0_003,13127,1955_71,[174.058],[176.766],,,174.058,was the aggregate prison sentence was <number>...,[]


In [16]:
utterances_df["text"][0]

'number number lonni affronti versu unit state america mr. murphi'