In [None]:
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [9]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os

import sys
sys.path.append("..")
from src.utils import load_env, load_table

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [50]:
# Load settings from the local env file, then fetch the court-order table into `df`.
# NOTE(review): `load_env` / `load_table` are project helpers from src.utils. Judging by the
# log output of this cell, `limit` caps the number of rows fetched and `print_every` sets the
# logging interval; the third positional argument ("pdf_text") is presumably the source text
# column — confirm against src/utils.py.
env = load_env("../local.env")
df = load_table(env, "dl_hc_orders.orders", "pdf_text", limit=10**5, print_every=10**4)

2020-05-14 08:28:30,226 - src.utils - INFO - Loaded 0 rows of dl_hc_orders.orders
2020-05-14 08:28:30,367 - src.utils - INFO - Loaded 10000 rows of dl_hc_orders.orders
2020-05-14 08:28:30,504 - src.utils - INFO - Loaded 20000 rows of dl_hc_orders.orders
2020-05-14 08:28:30,640 - src.utils - INFO - Loaded 30000 rows of dl_hc_orders.orders
2020-05-14 08:28:30,783 - src.utils - INFO - Loaded 40000 rows of dl_hc_orders.orders
2020-05-14 08:28:30,934 - src.utils - INFO - Loaded 50000 rows of dl_hc_orders.orders
2020-05-14 08:28:31,072 - src.utils - INFO - Loaded 60000 rows of dl_hc_orders.orders
2020-05-14 08:28:31,210 - src.utils - INFO - Loaded 70000 rows of dl_hc_orders.orders
2020-05-14 08:28:31,357 - src.utils - INFO - Loaded 80000 rows of dl_hc_orders.orders
2020-05-14 08:28:31,504 - src.utils - INFO - Loaded 90000 rows of dl_hc_orders.orders
2020-05-14 08:28:31,561 - src.utils - INFO - Number of nan values: 0


In [24]:
# Quick look at the first rows to check which columns were loaded.
df.head()

Unnamed: 0,string,id
0,In The High Court Of Delhi At New Delhi Cs(Os)...,0
1,In The High Court Of Delhi At New Delhi Cs(Os)...,1
2,$~22 * In The High Court Of Delhi At New Delhi...,2
3,$~39 * + Larsen & Toubro Limited & Anr In The ...,3
4,In The High Court Of Delhi At New Delhi 10.08....,4


In [25]:
# Shared text-cleaning resources used by the tokenizer cell further down.
# `punctuations` is a single str of ASCII punctuation characters, not a collection of tokens.
punctuations = string.punctuation
# spaCy's English stop words, copied into a list.
stopwords = list(STOP_WORDS)
# Full pipeline used for the entity visualisations below.
# NOTE(review): the 'en_core_web_lg' model package presumably has to be installed separately — confirm.
nlp = spacy.load('en_core_web_lg')

In [26]:
# Run the loaded spaCy pipeline on one sample order (label 3 of the "string"
# column) and highlight the entities it detects.
sample_text = df["string"][3]
doc = nlp(sample_text)
spacy.displacy.render(doc, style='ent', jupyter=True)

In [27]:
# Rebuild the sample as a space-separated string of lemmas and run the pipeline
# again, to see how lemmatisation changes the detected entities.
# Note: `doc` is rebound here, so re-running this cell lemmatises the already
# lemmatised text rather than the original sample.
lemmas = [token.lemma_ for token in doc]
review = " ".join(lemmas)
doc = nlp(review)
spacy.displacy.render(doc, style='ent', jupyter=True)

In [51]:
# `English()` is instantiated directly instead of going through `spacy.load`.
# NOTE(review): presumably a lightweight tokenizer-only pipeline; whether
# `lemma_` is populated depends on the spaCy version in use — confirm.
parser = English()
def spacy_tokenizer(sentence):
    """Normalise one document into a space-separated string of kept tokens.

    Each token produced by ``parser`` is replaced by its lower-cased, stripped
    lemma. Tokens whose lemma is the placeholder ``"-PRON-"`` fall back to
    their lower-cased surface form. Tokens found in the module-level
    ``stopwords`` or ``punctuations`` are dropped.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # Set membership instead of scanning the stop-word list for every token.
    # Built per call so later edits to the global `stopwords` are still honoured.
    stopword_set = set(stopwords)

    kept = []
    for word in parser(sentence):
        if word.lemma_ != "-PRON-":
            token = word.lemma_.lower().strip()
        else:
            token = word.lower_
        # `punctuations` is a str, so this is a *substring* test: it drops single
        # punctuation characters, the empty string, and any run of characters
        # that appears contiguously in `string.punctuation`. Kept as-is on purpose.
        if token not in stopword_set and token not in punctuations:
            kept.append(token)
    return " ".join(kept)

In [52]:
# Register tqdm's `progress_apply` on pandas objects so the row-wise clean-up shows a progress bar.
tqdm.pandas()
# The cleaned text goes into a new column; the raw "string" column is left untouched.
df["processed_string"] = df["string"].progress_apply(spacy_tokenizer)

100%|██████████| 100000/100000 [01:55<00:00, 866.93it/s]


In [53]:
# Creating a vectorizer: bag-of-words term counts over the cleaned text.
vectorizer = CountVectorizer(
    min_df=5,                # per scikit-learn docs: ignore terms seen in fewer than 5 documents
    max_df=0.9,              # per scikit-learn docs: ignore terms seen in more than 90% of documents
    stop_words='english',
    lowercase=True,
    # Raw string: in a normal literal "\-" is an invalid escape sequence, which
    # raises a DeprecationWarning (SyntaxWarning on Python 3.12+). The regex
    # itself is unchanged: tokens of 3+ characters made of ASCII letters or hyphens.
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
)
data_vectorized = vectorizer.fit_transform(df["processed_string"])

In [63]:
# Number of topics/components passed as `n_components` to the LDA, NMF and TruncatedSVD models below.
NUM_TOPICS = 5

In [64]:
# Latent Dirichlet Allocation Model
# Online variational Bayes is stochastic; a fixed `random_state` makes the
# fitted topics reproducible under Restart & Run All.
lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=42,
    verbose=True,
)
# Per-document topic representation produced by the fitted model.
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [65]:
# Non-Negative Matrix Factorization Model
# `random_state` is fixed so the initialisation, and therefore the topics,
# are reproducible between runs.
nmf = NMF(n_components=NUM_TOPICS, random_state=42)
data_nmf = nmf.fit_transform(data_vectorized)

In [66]:
# Latent Semantic Indexing Model using Truncated SVD
# TruncatedSVD uses a randomized solver by default; fixing `random_state`
# keeps the components stable between runs.
lsi = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
data_lsi = lsi.fit_transform(data_vectorized)

In [67]:
# Function for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    model
        Fitted decomposition model exposing ``components_``; each row is
        treated as one topic's weights over the vocabulary.
    vectorizer
        Fitted vectorizer whose feature names line up with the columns of
        ``model.components_``.
    top_n : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None
        For each topic, a ``Topic <idx>:`` header and a list of
        ``(term, weight)`` tuples in descending weight order are printed.
    """
    # Fetch the vocabulary once. The original fetched it again for every
    # printed term, rebuilding the whole feature list each time.
    # `get_feature_names` was removed in scikit-learn 1.2 in favour of
    # `get_feature_names_out`; prefer the new method and fall back to the old.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed tail slice yields the top_n largest first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print("Topic %d:" % (idx))
        # Cast to built-in str/float so the printed tuples look the same
        # regardless of the NumPy scalar repr in use.
        print([(str(feature_names[i]), float(topic[i])) for i in top_indices])

In [68]:
# Top keywords (with their component weights) for each topic of the Latent Dirichlet Allocation model
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('justice', 57408.77018812542), ('honble', 50068.97896172454), ('versus', 46405.24064261523), ('coram', 40448.939381207565), ('adv', 32408.391559389147), ('advocate', 28938.07842627948), ('ors', 22573.57291369387), ('september', 20519.14958527128), ('singh', 18930.263843359124), ('respondents', 16730.722504049543)]
Topic 1:
[('petitioner', 33676.647730383964), ('order', 23672.982459525916), ('petition', 21316.26479801796), ('dated', 19021.652199899334), ('learned', 16307.283426707067), ('application', 15556.102353506742), ('counsel', 13285.644819812313), ('shall', 12163.819332320458), ('respondent', 11697.607793431527), ('parties', 10503.599972711736)]
Topic 2:
[('respondent', 46266.51550333638), ('counsel', 44580.15930544465), ('petitioner', 44179.0800276173), ('advocate', 31109.79962305498), ('present', 27798.92953099665), ('filed', 21715.663939776616), ('learned', 19082.931525337284), ('weeks', 16437.677748447717), ('list', 15919.285811343036), ('file', 13699.04

In [69]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("NMF Model:")
selected_topics(nmf, vectorizer)

NMF Model:
Topic 0:
[('petitioner', 26.56995569636603), ('order', 6.947625375705595), ('petition', 6.2949266039593486), ('counsel', 5.107978192454125), ('dated', 4.869514297684798), ('learned', 4.709434258083243), ('respondent', 4.221769769866517), ('present', 3.5092597703041593), ('application', 3.092288721820296), ('said', 2.6455175949837524)]
Topic 1:
[('plaintiff', 12.892216778925937), ('defendant', 12.792275521024877), ('counsel', 7.118057728298815), ('filed', 5.901429273282107), ('joint', 4.866170845594627), ('registrar', 4.852483462151228), ('application', 4.262368660206288), ('defendants', 3.866109137803685), ('dhjs', 3.7480899538995063), ('documents', 3.6251634824453185)]
Topic 2:
[('respondent', 19.90246530815137), ('counsel', 7.166461862220928), ('advocate', 6.641493543962795), ('filed', 4.403242971456712), ('appellant', 4.063178454658431), ('notice', 3.3911217474339868), ('present', 2.8971688239158397), ('learned', 2.790555574360786), ('weeks', 2.6704344333213177), ('septem

In [70]:
# Keywords for topics clustered by Latent Semantic Indexing (Truncated SVD)
print("LSI Model:")
selected_topics(lsi, vectorizer)

LSI Model:
Topic 0:
[('petitioner', 0.40599530564914643), ('respondent', 0.3451393555179713), ('counsel', 0.2831542608956604), ('versus', 0.18923636342291714), ('advocate', 0.18660439487718722), ('learned', 0.1721978281039284), ('filed', 0.16954651178992836), ('justice', 0.15793177232000669), ('order', 0.1523242662810161), ('respondents', 0.14879330756551276)]
Topic 1:
[('petitioner', 0.40704041438741606), ('respondent', 0.24330324827991714), ('respondents', 0.11894664422438074), ('pharmacy', 0.08178512112492474), ('crl', 0.07764651589838677), ('petition', 0.07111822085041049), ('justice', 0.05956753959548943), ('india', 0.05606287723637576), ('anr', 0.052106979106896045), ('council', 0.05019590095568996)]
Topic 2:
[('petitioner', 0.4847728858965246), ('order', 0.1932144382222651), ('petition', 0.14376208978559246), ('dated', 0.1290654339701133), ('present', 0.10591376690896535), ('application', 0.10281199508218712), ('said', 0.0868420230128064), ('learned', 0.0820379161699399), ('defe

In [71]:
# Interactive pyLDAvis dashboard for the fitted LDA model, rendered inline in the notebook.
# NOTE(review): newer pyLDAvis releases appear to have renamed `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model` — confirm against the installed version.
pyLDAvis.enable_notebook()
# `prepare` receives the fitted model, the document-term matrix and the vectorizer;
# mds='tsne' presumably selects t-SNE for the 2-D topic layout — confirm in the pyLDAvis docs.
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash