In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
df = pd.read_csv('../data/scotus_file', usecols = ['author', 'html'])

In [3]:
print(len(df))
df.dropna(how="any", inplace=True)
print(len(df))

63948
21337


In [6]:
def parse_html(raw_html):
    return BeautifulSoup(raw_html, "html.parser").text
df['text'] = df.apply(lambda row: parse_html(row['html']), axis=1)

In [25]:
MAX_FEATURES = 1000
tf_vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                                max_features=MAX_FEATURES,
                                lowercase=True, 
                                token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}',
                                stop_words='english')

In [27]:
dtm = tf_vectorizer.fit_transform(df['text'])
dtm.shape

In [32]:
NUM_TOPICS = 10
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS,
                                      max_iter=10,
                                      learning_method='online',
                                      verbose = 1)

In [33]:
lda_Z = lda_model.fit_transform(dtm)



iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [35]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tf_vectorizer)

LDA Model:
Topic 0:
[('said', 83629.916209462099), ('property', 51456.481996489078), ('plaintiff', 38194.600897437915), ('defendant', 35111.800672575737), ('time', 34898.954922122284), ('contract', 34790.798860925803), ('sale', 28888.685257007975), ('law', 27139.334400037966), ('estate', 26646.986353519147), ('money', 24697.830085842852)]
Topic 1:
[('company', 119990.66082534924), ('railroad', 43307.528321264675), ('said', 40289.243594930595), ('city', 40232.220996844124), ('corporation', 39005.806929502884), ('stock', 30179.779146228415), ('bonds', 28056.690369330754), ('act', 24428.999657301203), ('bank', 23594.082500628054), ('state', 23476.955884713661)]
Topic 2:
[('act', 103244.06604674198), ('united', 71173.176149062347), ('states', 64489.975021464321), ('congress', 42166.906772862356), ('government', 37424.221606424173), ('section', 35333.361283586069), ('board', 35322.344689089732), ('shall', 32868.656756484226), ('labor', 28330.297832379845), ('union', 28165.831379039217)]
Top