conda install -c conda-forge gensim

In [1]:
import pprint

In [2]:
# Create a set of frequent words
stoplist = set('for a of the and to in with also that'.split(' '))

In [3]:
stoplist

{'a', 'also', 'and', 'for', 'in', 'of', 'that', 'the', 'to', 'with'}

In [4]:
#from collections import defaultdict
#from gensim import corpora
#from collections import defaultdict
#from gensim import corpora
#from gensim import models
#from gensim import similarities

In [5]:
legal_document = ""

In [6]:
example = "Thomas Jefferson defeated John Adams in the presidential election of 1800, which was decided on February 17, 1801. Before Jefferson took office on March 4, Adams and Congress passed the Judiciary Act of 1801, which created new district courts, expanded the number of circuit courts, added more judges to each circuit, gave the President more control over appointing federal judges, and reduced the number of Supreme Court Justices from six to five. This law essentially was an attempt by Adams and his political party to frustrate the incoming opposition, since he used his new power to appoint 16 new circuit judges and 42 new justices of the peace, a group known as the ""Midnight Judges."" The incoming appointees were approved by the Adams Senate, but their appointments were not valid until each of their commissions was delivered by John Marshall in his capacity as acting Secretary of State."

In [7]:
example

'Thomas Jefferson defeated John Adams in the presidential election of 1800, which was decided on February 17, 1801. Before Jefferson took office on March 4, Adams and Congress passed the Judiciary Act of 1801, which created new district courts, expanded the number of circuit courts, added more judges to each circuit, gave the President more control over appointing federal judges, and reduced the number of Supreme Court Justices from six to five. This law essentially was an attempt by Adams and his political party to frustrate the incoming opposition, since he used his new power to appoint 16 new circuit judges and 42 new justices of the peace, a group known as the Midnight Judges. The incoming appointees were approved by the Adams Senate, but their appointments were not valid until each of their commissions was delivered by John Marshall in his capacity as acting Secretary of State.'

In [8]:
legal_document = example

In [9]:
text_list = list(legal_document.split('.'))

In [10]:
text_list

['Thomas Jefferson defeated John Adams in the presidential election of 1800, which was decided on February 17, 1801',
 ' Before Jefferson took office on March 4, Adams and Congress passed the Judiciary Act of 1801, which created new district courts, expanded the number of circuit courts, added more judges to each circuit, gave the President more control over appointing federal judges, and reduced the number of Supreme Court Justices from six to five',
 ' This law essentially was an attempt by Adams and his political party to frustrate the incoming opposition, since he used his new power to appoint 16 new circuit judges and 42 new justices of the peace, a group known as the Midnight Judges',
 ' The incoming appointees were approved by the Adams Senate, but their appointments were not valid until each of their commissions was delivered by John Marshall in his capacity as acting Secretary of State',
 '']

In [11]:
text_corpus = text_list

In [12]:
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

In [13]:
texts

[['thomas',
  'jefferson',
  'defeated',
  'john',
  'adams',
  'presidential',
  'election',
  '1800,',
  'which',
  'was',
  'decided',
  'on',
  'february',
  '17,',
  '1801'],
 ['before',
  'jefferson',
  'took',
  'office',
  'on',
  'march',
  '4,',
  'adams',
  'congress',
  'passed',
  'judiciary',
  'act',
  '1801,',
  'which',
  'created',
  'new',
  'district',
  'courts,',
  'expanded',
  'number',
  'circuit',
  'courts,',
  'added',
  'more',
  'judges',
  'each',
  'circuit,',
  'gave',
  'president',
  'more',
  'control',
  'over',
  'appointing',
  'federal',
  'judges,',
  'reduced',
  'number',
  'supreme',
  'court',
  'justices',
  'from',
  'six',
  'five'],
 ['this',
  'law',
  'essentially',
  'was',
  'an',
  'attempt',
  'by',
  'adams',
  'his',
  'political',
  'party',
  'frustrate',
  'incoming',
  'opposition,',
  'since',
  'he',
  'used',
  'his',
  'new',
  'power',
  'appoint',
  '16',
  'new',
  'circuit',
  'judges',
  '42',
  'new',
  'justices',


Before proceeding, we want to associate each word in the corpus with a unique integer ID. 
We can do this using the gensim.corpora.Dictionary class. 
This dictionary defines the vocabulary of all words that our processing knows about.

In [14]:
# Count word frequencies
from collections import defaultdict

In [15]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

In [16]:
# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

[['jefferson', 'john', 'adams', 'which', 'was', 'on'],
 ['jefferson',
  'on',
  'adams',
  'which',
  'new',
  'courts,',
  'number',
  'circuit',
  'courts,',
  'more',
  'judges',
  'each',
  'more',
  'number',
  'justices'],
 ['was',
  'by',
  'adams',
  'his',
  'incoming',
  'his',
  'new',
  'new',
  'circuit',
  'judges',
  'new',
  'justices',
  'as',
  'judges'],
 ['incoming',
  'were',
  'by',
  'adams',
  'their',
  'were',
  'each',
  'their',
  'was',
  'by',
  'john',
  'his',
  'as'],
 []]


In [17]:
from gensim import corpora

Because our corpus is small, there are only 20 different tokens in this gensim.corpora.Dictionary. 
For larger corpuses, dictionaries that contains hundreds of thousands of tokens are quite common.

Our processed corpus has 20 unique words in it, which means that each document will be represented by a 20-dimensional vector under the bag-of-words model. 
We can use the dictionary to turn tokenized documents into these 20-dimensional vectors. We can see what these IDs correspond to:

In [32]:
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(20 unique tokens: ['adams', 'jefferson', 'john', 'on', 'was']...)


In [33]:
pprint.pprint(dictionary.token2id)

{'adams': 0,
 'as': 14,
 'by': 15,
 'circuit': 6,
 'courts,': 7,
 'each': 8,
 'his': 16,
 'incoming': 17,
 'jefferson': 1,
 'john': 2,
 'judges': 9,
 'justices': 10,
 'more': 11,
 'new': 12,
 'number': 13,
 'on': 3,
 'their': 18,
 'was': 4,
 'were': 19,
 'which': 5}


For example, suppose we wanted to vectorize the phrase “jefferson adams” 
(note that this phrase was not in our original corpus). 
We can create the bag-of-word representation for a document using the doc2bow method of the dictionary, 
which returns a sparse representation of the word counts:

In [37]:
new_doc = "jefferson interaction adams"

In [38]:
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(0, 1), (1, 1)]


Explanation

The first entry in each tuple corresponds to the ID of the token in the dictionary, 
the second corresponds to the count of this token.

Note that “interaction” did not occur in the original corpus and 
so it was not included in the vectorization. 

Also note that this vector only contains entries for words that actually appeared in the document. 
Because any given document will only contain a few words out of the many words in the dictionary, 
words that do not appear in the vectorization are represented as implicitly zero as a space saving measure.

In [40]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(0, 1),
  (1, 1),
  (3, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 2),
  (12, 1),
  (13, 2)],
 [(0, 1),
  (4, 1),
  (6, 1),
  (9, 2),
  (10, 1),
  (12, 3),
  (14, 1),
  (15, 1),
  (16, 2),
  (17, 1)],
 [(0, 1),
  (2, 1),
  (4, 1),
  (8, 1),
  (14, 1),
  (15, 2),
  (16, 1),
  (17, 1),
  (18, 2),
  (19, 2)],
 []]


In [41]:
from gensim import models

# train the model
tfidf = models.TfidfModel(bow_corpus)

In [42]:
words = "judges justices".lower().split()

In [43]:
print(tfidf[dictionary.doc2bow(words)])

[(9, 0.7071067811865475), (10, 0.7071067811865475)]


In [44]:
from gensim import similarities

index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)

In [45]:
index

<gensim.similarities.docsim.SparseMatrixSimilarity at 0x7fc3f565b490>

### The kernel appears to have died. It will restart automatically.

In [46]:
query_document = 'supreme judges'.split()
query_bow = dictionary.doc2bow(query_document)
#sims = index[tfidf[query_bow]]
#print(list(enumerate(sims)))

sims = tfidf[query_bow]

In [47]:
sims

[(9, 1.0)]

In [48]:
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

0 (9, 1.0)
