In [1]:
# Imports required libaries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import spacy
import re
from nltk.corpus import stopwords
import nltk
from collections import Counter
# nltk.download('names')
# nltk.download('stopwords')

In [2]:
# Load spacy and extract stop words
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
nlp = spacy.load("en_core_web_sm")



In [3]:
# Load the data frame
sc_india = pd.read_csv('Supreme_Court_cases_India_all.csv')
if 'Unnamed: 0' in sc_india.columns:
    sc_india.drop(['Unnamed: 0'], axis = 1, inplace = True)
sc_india.head()

Unnamed: 0,Case Title,Year,Author,Judges,Document Id
0,Hardeep Singh vs State Of Punjab & Ors on 10 J...,1947,. B Chauhan,"P Sathasivam, B.S. Chauhan, Ranjana Prakash D...",52754564
1,Sanjay Kumar vs State Of Bihar & Anr on 28 Jan...,1947,…………......................J.,"B.S. Chauhan, J. Chelameswar, M.Y. Eqbal",130498285
2,State Of T.Nadu Tr.Insp.Of Police vs N Suresh ...,1947,C K Prasad,"Chandramauli Kr. Prasad, M.Y. Eqbal",23015624
3,Union Of India & Ors vs Vasavi Co-Op. Housing ...,1947,…..………………………J.,"K.S. Radhakrishnan, A.K. Sikri",175295508
4,"Syed Sadiq Etc vs Divisional Manager,United In...",1947,………………………………………………………………………J.,"Sudhansu Jyoti Mukhopadhaya, V. Gopala Gowda",140442067


In [4]:
sc_india.shape

(53423, 5)

In [5]:
# Logic to fetch the paragraph from the site
website = 'https://indiankanoon.org/doc/130498285/'
response = requests.get(website)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)
for i in range(1, 1000):
        try:
            para = soup.find('p',  {'id' : 'p_'+str(i)}).get_text().strip()
            doc_para_list.append(para)
            # print(para)
        except:
            break

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">

<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<link href="/static/jquery-ui.1.12.min.css" rel="stylesheet" type="text/css"/>
<link href="/static/jquery-ui.1.12.theme.min.css" rel="stylesheet" type="text/css"/>
<link href="/static/search_desktop_v16.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript">
      (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

      ga('create', 'UA-87326-4', 'auto');
      ga('require', 'linkid', 'linkid.js');
      ga('require', 'displayfeatures');
      ga('send', 'pageview');

    </script>
<title>Sanjay Kumar vs State Of Bihar &amp; Anr on 28

In [6]:
# doc_list contains a strings
doc_list = []
for i in range(20):
    website = 'https://indiankanoon.org/doc/'+str(sc_india['Document Id'][i])+'/'
    response = requests.get(website)
    soup = BeautifulSoup(response.content, 'html.parser')
    doc_m = ""
    for i in range(1, 1000):
        try:
            para = soup.find('p',  {'id' : 'p_'+str(i)}).get_text().strip()
            doc_m = doc_m + para
        except:
            break
    doc_list.append(doc_m.casefold())

In [7]:
len(doc_list)

20

In [8]:
# Get all the possible stop words using available libraries
# Default stop word by spacy

# All stopwords are present in lower case
spacy_stop = nlp.Defaults.stop_words

# Names by nltk
names = nltk.corpus.names
male_names = names.words('male.txt')
male_names = [name.casefold() for name in male_names]
female_names = names.words('female.txt')
female_names = [name.casefold() for name in female_names]

# Months are also stop words in our case
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august' 'september', 'october', 'november',
          'december']
# Other most common less meaningful words
more_words = ['court', 'courts', 'justice', 'appeals','appeal', 'argued', 'argue', 'decide', 'rptr', 'nervine', 'pp','fd', 'states',
              'supreme', 'opinion' ,'cheif', 'el', 'op', 'quotation', "n't" , 'a', 'aan', 'aba', 'aand', 'of', 'm', 'u',
              'f', 'j', 'juan', 'ca', 'aa', 'aaa','aab', 'aabd', 'aac',  'aag', 'aai', 'aaii', 'aaiii', 'aalthough', 'judge', 
              'author', 'state', 'country', 'india']

evenmore_words = ['join', 'seek','hummel', 'note', 'curiam', 'mosk', 'pd', 'rhino', 'misc', 'assistant', 'whereon', 'dismiss',
                  'sod', 'vote', 'present', 'entire', 'frankfurter', 'leave', 'concur', 'entire', 'mootness', 'track',
                  'constitution', 'jj', 'stat', 'messes', 'like', 'rev', 'trans', 'bra', 'teller', 'lead', 'cf', 'cca', 
                  'stucky', 'aver',"united", "dissent", "footnote","brief", "decision", "member", "curiam", "dismiss", 
                  "note", "affirm", "question", "usc", "file", "district", "circuit", "mr", "law", "quoting", "omit", 'period',
                  "amendment","internal", "slip", 'omitted', 'suit' ,'lawsuit', 'marks', "jr", "findlaw ", "href", "defendant",
                  "judge", "rule", "claim", "comussupremecourt", "petitioner", "act", "federal", "statute", "government", "right",
                  'bench', 'crpc', 'section']


In [9]:
# State Names as well as its capitals

website = 'https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India'
response = requests.get(website)
soup = BeautifulSoup(response.content, 'html.parser')
states_table = soup.find_all('table', {'class' : 'wikitable'})[0]
states_table = states_table.find_all('tr')
states = []
executive_capital = []
legislative_capital = []
judicial_capital = []
for i in range(1, len(states_table)):
    state = states_table[i].find('th', {'scope':'row'}).get_text().strip()
    states.append(state.casefold())
    e_capital = re.sub(r'\[\w\]|(\(Summer\))|(\(Winter\))', '', states_table[i].find_all('td')[1].get_text()).strip()
    executive_capital.append(e_capital.casefold())
    l_capital = re.split(r'\s+',(re.sub(r'(\[\w\])|(\(Summer\))|(\(Winter\))', '', states_table[i].find_all('td')[2].get_text())).strip())
    for l in l_capital : legislative_capital.append(l.casefold()) 
    j_capital = re.sub(r'\[\w\]', '', states_table[i].find_all('td')[3].get_text()).strip()
    judicial_capital.append(j_capital.casefold())
    
union_territories_table = soup.find_all('table', {'class' : 'wikitable'})[1]
union_territories_table = union_territories_table.find_all('tr')
for i in range(1, len(union_territories_table)):
    ut = union_territories_table[i].find_all('td')[1].get_text().strip()
    states.append(ut.casefold())
    e_capital = re.split(r'\s+',(re.sub(r'(\[\w\])|(\(Summer\))|(\(Winter\))', '', union_territories_table[i].find_all('td')[2].get_text())).strip())
    for e in e_capital : 
        if e not in executive_capital:
            executive_capital.append(e.casefold())
    l_capital = re.split(r'\s+',(re.sub(r'(\[\w\])|(\(Summer\))|(\(Winter\))', '', union_territories_table[i].find_all('td')[3].get_text()).strip()))
    for l in l_capital : 
        if l != '-' and l not in legislative_capital : 
            legislative_capital.append(l.casefold()) 
    j_capital = re.split(r'\s+',(re.sub(r'(\[\w\])|(\(Summer\))|(\(Winter\))', '', union_territories_table[i].find_all('td')[4].get_text())).strip())
    for j in j_capital : 
        if j not in judicial_capital:
            judicial_capital.append(j.casefold())


In [10]:
print(executive_capital)

['visakhapatnam', 'itanagar', 'dispur', 'patna', 'naya raipur', 'panaji', 'gandhinagar', 'chandigarh', 'shimla', 'ranchi', 'bengaluru', 'thiruvananthapuram', 'bhopal', 'mumbai', 'imphal', 'shillong', 'aizawl', 'kohima', 'bhubaneswar', 'chandigarh', 'jaipur', 'gangtok', 'chennai', 'hyderabad', 'agartala', 'lucknow', 'gairsain   (summer)\ndehradun (winter)', 'kolkata', 'port', 'blair', 'chandigarh', 'daman', 'new', 'delhi', 'srinagar', '(summer)', 'jammu', '(winter)', 'leh', 'kargil', 'kavaratti', 'pondicherry']


In [11]:
print(legislative_capital)

['amaravati', 'itanagar', 'dispur', 'patna', 'raipur', 'porvorim', 'gandhinagar', 'chandigarh', 'shimla', 'dharamsala', 'ranchi', 'bengaluru', 'thiruvananthapuram', 'bhopal', 'mumbai', 'nagpur', 'imphal', 'shillong', 'aizawl', 'kohima', 'bhubaneswar', 'chandigarh', 'jaipur', 'gangtok', 'chennai', 'hyderabad', 'agartala', 'lucknow', 'gairsain', '(summer)dehradun', '(winter)', 'kolkata', 'new', 'delhi', 'srinagar', '(summer)', 'jammu', 'pondicherry']


In [12]:
print(judicial_capital)

['kurnool', 'guwahati', 'guwahati', 'patna', 'bilaspur', 'mumbai', 'ahmedabad', 'chandigarh', 'shimla', 'ranchi', 'bengaluru', 'kochi', 'jabalpur', 'mumbai', 'imphal', 'shillong', 'guwahati', 'guwahati', 'cuttack', 'chandigarh', 'jodhpur', 'gangtok', 'chennai', 'hyderabad', 'agartala', 'prayagraj', 'nainital', 'kolkata', 'kolkata', 'chandigarh', 'mumbai', 'new', 'delhi', 'srinagar', '(summer)', 'jammu', '(winter)', 'srinagar', 'jammu', 'kochi', 'chennai']


In [13]:
# sc_india['Judges'].isnull().sum()
# sc_india['Author'].isnull().sum()
# sc_india['Judges'].nunique()
# sc_india['Author'].nunique()

In [14]:
# Get the justice as well as author names
judges_doc = list(sc_india['Judges'].unique())
authors_doc = list(sc_india['Author'].unique())
judges = []
authors = []
for judges_all in judges_doc:
    try:
        judges_split = judges_all.split(',')
        for judge in judges_split:
            if judge not in judges :
                judges.append(judge.casefold())
    except:
        continue
        

for authors_all in authors_doc:
    try:
        authors_split = authors_all.split(',')
        for author in authors_split:
            if author not in authors :
                authors.append(author.casefold())
    except:
        continue

In [15]:
# Get Indian names as well
indian_male = pd.read_csv('Indian-Male-Names.csv')
names = list(indian_male.name)
indian_male_name = []
for name in names:
    try:
        for nm in name.split(' '):
            indian_male_name.append(nm.replace('@', ''))
    except:
        continue
        
indian_male_name = set(indian_male_name)
# indian_male_name

indian_female = pd.read_csv('Indian-Female-Names.csv')
names = list(indian_female.name)
indian_female_name = []
for name in names:
    try:
        for nm in name.split(' '):
            indian_female_name.append(nm.replace('@', ''))
    except:
        continue
        
indian_female_name = set(indian_female_name)
# indian_female_name

In [16]:
STOPLIST = set( stopwords.words('english')  + list(ENGLISH_STOP_WORDS) + judges + authors + list(female_names) + list(male_names) 
                + list(spacy_stop) + months + more_words + evenmore_words + states + executive_capital + list(indian_male_name) + 
                list(indian_female_name) + legislative_capital + judicial_capital)
STOPLIST = set(stopword.lower() for stopword in STOPLIST)

In [17]:
len(STOPLIST)

20762

In [18]:
doc_list[0]

'dr. b.s. chauhan, j.1.   this reference before us arises out of a variety of views  having\n      been expressed by this court and several high courts of the country on\n      the scope and extent of the powers of the courts  under  the  criminal\n      justice system to arraign any person as an accused during  the  course\n      of inquiry or trial as contemplated under section 319 of the  code  of\n      criminal procedure, 1973 (hereinafter referred to as the `cr.p.c.’).2.    the initial reference was made by a two-judge bench  vide  order\n      dated 7.11.2008 in the leading case of hardeep singh (crl. appeal  no.\n      1750 of 2008)  where noticing the conflict between  the  judgments  in\n      the case of rakesh v. state of haryana, air 2001 sc 2521; and  a  two-\n      judge bench decision in the case of mohd. shafi v. mohd. rafiq & anr.,\n      air 2007 sc 1899, a doubt was expressed about the correctness  of  the\n      view in the case of mohd. shafi (supra). the doubts a

In [19]:
# clean the text with some overused words
def clean(text):
#   print(type(text))
    text = text.replace("argued:", " ")
    text = text.replace("supreme court of india", " ")
    text = text.replace("constitution of india", " ")
    text = text.replace("\\n", " ")       
    text = re.sub(r'\(\w+\)', '', text)  # remove all present the content in the brackets
    text = re.sub(r'[^a-z^ ]', '', text)  # remove all chars other than a-z and space
    return text

In [20]:
clean_doc = clean(doc_list[0])

In [21]:
clean_doc

'dr bs chauhan j   this reference before us arises out of a variety of views  having      been expressed by this court and several high courts of the country on      the scope and extent of the powers of the courts  under  the  criminal      justice system to arraign any person as an accused during  the  course      of inquiry or trial as contemplated under section  of the  code  of      criminal procedure  hereinafter referred to as the crpc    the initial reference was made by a twojudge bench  vide  order      dated  in the leading case of hardeep singh crl appeal  no       of   where noticing the conflict between  the  judgments  in      the case of rakesh v state of haryana air  sc  and  a  two      judge bench decision in the case of mohd shafi v mohd rafiq  anr      air  sc  a doubt was expressed about the correctness  of  the      view in the case of mohd shafi  the doubts as categorised  in      paragraphs  and  of the reference order led to the framing of  two      questions 

In [22]:
# Tokenization of the doc
def spacy_tokenizer(sentence):
    tokens = nlp(sentence)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (len(tok)>1)]
    tokens = [tok for tok in tokens if tok not in STOPLIST] 
    return tokens


In [23]:
token_doc = spacy_tokenizer(clean_doc)

In [24]:
token_doc

['bs',
 'reference',
 'arise',
 'variety',
 'view',
 'express',
 'high',
 'scope',
 'extent',
 'power',
 'criminal',
 'arraign',
 'accuse',
 'course',
 'inquiry',
 'trial',
 'contemplate',
 'code',
 'criminal',
 'procedure',
 'hereinafter',
 'refer',
 'initial',
 'reference',
 'twojudge',
 'vide',
 'order',
 'date',
 'crl',
 'notice',
 'conflict',
 'judgment',
 'air',
 'sc',
 'anr',
 'air',
 'sc',
 'doubt',
 'express',
 'correctness',
 'view',
 'doubt',
 'categorise',
 'paragraph',
 'reference',
 'order',
 'framing',
 'reproduce',
 'hereunder',
 'reference',
 'desire',
 'resolve',
 'threejudge',
 'come',
 'consideration',
 'vide',
 'order',
 'date',
 'opine',
 'view',
 'reference',
 'anr',
 'scc',
 'issue',
 'involve',
 'identical',
 'nature',
 'resolve',
 'consist',
 'feel',
 'threejudge',
 'refer',
 'matter',
 'event',
 'appropriate',
 'overlap',
 'issue',
 'resolve',
 'similar',
 'strength',
 'reference',
 'come',
 'answer',
 'relation',
 'power',
 'session',
 'invoke',
 'stage',
 '

In [25]:
if 'chauhan' in token_doc:
    print('yes')
else:
    print('no')

no


In [26]:
word_count = Counter(token_doc)

In [27]:
# Create a dic of the word_count for each doc
def word_count_dic(doc_id, token_doc):
    word_count = Counter(token_doc)
    sort_word_count = sorted(word_count.items(), key= lambda x:x[1], reverse=True)
    return {doc_id : sort_word_count}

word_count_dic(sc_india['Document Id'][i], token_doc)

{63441281: [('evidence', 133),
  ('trial', 102),
  ('accuse', 90),
  ('power', 86),
  ('stage', 81),
  ('inquiry', 67),
  ('exercise', 61),
  ('material', 58),
  ('sc', 55),
  ('air', 54),
  ('hold', 53),
  ('offence', 52),
  ('word', 52),
  ('provision', 43),
  ('magistrate', 37),
  ('session', 36),
  ('summon', 34),
  ('proceed', 32),
  ('anr', 31),
  ('use', 31),
  ('record', 31),
  ('appear', 26),
  ('cognizance', 25),
  ('try', 24),
  ('view', 23),
  ('charge', 23),
  ('invoke', 22),
  ('basis', 22),
  ('purpose', 22),
  ('mean', 22),
  ('fact', 20),
  ('discharge', 19),
  ('commit', 19),
  ('commence', 19),
  ('require', 19),
  ('come', 18),
  ('chargesheet', 18),
  ('witness', 18),
  ('refer', 17),
  ('investigation', 17),
  ('order', 16),
  ('add', 16),
  ('legislature', 16),
  ('face', 16),
  ('issue', 15),
  ('committal', 15),
  ('time', 15),
  ('satisfaction', 15),
  ('collect', 15),
  ('prima', 15),
  ('facie', 15),
  ('till', 14),
  ('prosecution', 14),
  ('reference', 13)

In [28]:
# Create a DataFrame to store the token along with it's id
token_sc_india = pd.DataFrame(columns=['Document Id', 'Tokens'])
word_count_list = []

for x, y in enumerate(doc_list):
    doc_id = sc_india['Document Id'][x]
    clean_doc = clean(y)    
    token_doc = spacy_tokenizer(clean_doc)
    token_sc_india = token_sc_india.append({ 'Document Id' : doc_id, 'Tokens' : token_doc}, ignore_index = True)
    word_count_list.append(word_count_dic(doc_id, token_doc))

In [29]:
word_count_list

[{52754564: [('evidence', 133),
   ('trial', 102),
   ('accuse', 90),
   ('power', 86),
   ('stage', 81),
   ('inquiry', 67),
   ('exercise', 61),
   ('material', 58),
   ('sc', 55),
   ('air', 54),
   ('hold', 53),
   ('offence', 52),
   ('word', 52),
   ('provision', 43),
   ('magistrate', 37),
   ('session', 36),
   ('summon', 34),
   ('proceed', 32),
   ('anr', 31),
   ('use', 31),
   ('record', 31),
   ('appear', 26),
   ('cognizance', 25),
   ('try', 24),
   ('view', 23),
   ('charge', 23),
   ('invoke', 22),
   ('basis', 22),
   ('purpose', 22),
   ('mean', 22),
   ('fact', 20),
   ('discharge', 19),
   ('commit', 19),
   ('commence', 19),
   ('require', 19),
   ('come', 18),
   ('chargesheet', 18),
   ('witness', 18),
   ('refer', 17),
   ('investigation', 17),
   ('order', 16),
   ('add', 16),
   ('legislature', 16),
   ('face', 16),
   ('issue', 15),
   ('committal', 15),
   ('time', 15),
   ('satisfaction', 15),
   ('collect', 15),
   ('prima', 15),
   ('facie', 15),
   ('ti

In [30]:
token_sc_india.head()

Unnamed: 0,Document Id,Tokens
0,52754564,"[bs, reference, arise, variety, view, express,..."
1,130498285,"[special, petition, impugn, judgment, order, d..."
2,23015624,"[chandramauli, jcriminal, special, petitioncrl..."
3,175295508,"[ks, radhakrishnan, vasavi, coop, housing, soc..."
4,140442067,"[gopala, gowda, jleave, appellant, correctness..."


In [31]:
sc_india.head()

Unnamed: 0,Case Title,Year,Author,Judges,Document Id
0,Hardeep Singh vs State Of Punjab & Ors on 10 J...,1947,. B Chauhan,"P Sathasivam, B.S. Chauhan, Ranjana Prakash D...",52754564
1,Sanjay Kumar vs State Of Bihar & Anr on 28 Jan...,1947,…………......................J.,"B.S. Chauhan, J. Chelameswar, M.Y. Eqbal",130498285
2,State Of T.Nadu Tr.Insp.Of Police vs N Suresh ...,1947,C K Prasad,"Chandramauli Kr. Prasad, M.Y. Eqbal",23015624
3,Union Of India & Ors vs Vasavi Co-Op. Housing ...,1947,…..………………………J.,"K.S. Radhakrishnan, A.K. Sikri",175295508
4,"Syed Sadiq Etc vs Divisional Manager,United In...",1947,………………………………………………………………………J.,"Sudhansu Jyoti Mukhopadhaya, V. Gopala Gowda",140442067


In [32]:
sc_india.Year.unique()

array([1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957,
       1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968,
       1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979,
       1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021], dtype=int64)

In [33]:
sc_india[sc_india.Year >= 2001].shape

(25007, 5)

In [34]:
sc_india_recent = sc_india[sc_india.Year >= 2006].reset_index()   
sc_india_recent.drop(['index'], axis = 1, inplace = True) 
sc_india_recent.head()

Unnamed: 0,Case Title,Year,Author,Judges,Document Id
0,"Mayar (H.K.) Ltd. & Ors vs Owners & Parties, V...",2006,P Naolekar,"Ruma Pal, P.P. Naolekar",1903089
1,Sandvik Asia Ltd vs Commissioner Of Income Tax...,2006,. A Lakshmanan,"H.K. Sema, Dr. Ar. Lakshmanan",287995
2,Mohd. Yousuf vs Smt. Afaq Jahan & Anr on 2 Jan...,2006,A Pasayat,"Arijit Pasayat, S.H. Kapadia",1938541
3,State Of Karnataka & Ors vs Kgsd Canteen Emplo...,2006,S.B. Sinha,"S.B. Sinha, P.P. Naolekar",509713
4,Radha Mohan Singh @ Lal Saheb & ... vs State O...,2006,G Mathur,"K.G. Balakrishnan, Arun Kumar, G.P. Mathur",1424480


In [35]:
len(sc_india_recent)

18977

In [36]:
sc_india_recent.to_csv('Supreme_Court_cases_India_above_1999.csv')

In [37]:
sc_india_recent = pd.read_csv('Supreme_Court_cases_India_above_1999.csv')
if 'Unnamed: 0' in sc_india_recent.columns:
    sc_india_recent.drop(['Unnamed: 0'], axis = 1, inplace = True)
sc_india_recent.tail()

Unnamed: 0,Case Title,Year,Author,Judges,Document Id
18972,Anand Kumar Tiwari vs High Court Of Madhya Pra...,2021,L. Nageswara Rao,"L. Nageswara Rao, Aniruddha Bose",105056923
18973,Lachhmi Narain Singh (D) Thr. ... vs Sarjug Si...,2021,Hrishikesh Roy,"Sanjay Kishan Kaul, Hrishikesh Roy",185528426
18974,Hemraj Ratnakar Salian vs Hdfc Bank Ltd. on 17...,2021,S. Abdul Nazeer,"S. Abdul Nazeer, Krishna Murari",172174360
18975,The State Of Uttar Pradesh vs Uttam Singh on 3...,2021,Sanjay Kishan Kaul,"Sanjay Kishan Kaul, Hrishikesh Roy",65485129
18976,Rajinder Goel vs High Court Of Punjab And Hary...,2021,Uday Umesh Lalit,"Uday Umesh Lalit, Ajay Rastogi",115649227


In [38]:
# doc_list contains a strings
doc_list = []
for i in range(18977):
    website = 'https://indiankanoon.org/doc/'+str(sc_india_recent['Document Id'][i])+'/'
    response = requests.get(website)
    soup = BeautifulSoup(response.content, 'html.parser')
    doc_m = ""
    for i in range(1, 1000):
        try:
            para = soup.find('p',  {'id' : 'p_'+str(i)}).get_text().strip()
            doc_m = doc_m + para
        except:
            break
    doc_list.append(doc_m.casefold())

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [39]:
len(doc_list)

14239

In [40]:

# Create a DataFrame to store the token along with it's id
token_sc_india_recent = pd.DataFrame(columns=['Document Id', 'Tokens'])
word_count_list = []

for x, y in enumerate(doc_list):
    doc_id = sc_india_recent['Document Id'][x]
    clean_doc = clean(y)    
    token_doc = spacy_tokenizer(clean_doc)
    token_sc_india_recent = token_sc_india_recent.append({ 'Document Id' : doc_id, 'Tokens' : token_doc}, ignore_index = True)
    word_count_list.append(word_count_dic(doc_id, token_doc))

In [41]:


token_sc_india

Unnamed: 0,Document Id,Tokens
0,52754564,"[bs, reference, arise, variety, view, express,..."
1,130498285,"[special, petition, impugn, judgment, order, d..."
2,23015624,"[chandramauli, jcriminal, special, petitioncrl..."
3,175295508,"[ks, radhakrishnan, vasavi, coop, housing, soc..."
4,140442067,"[gopala, gowda, jleave, appellant, correctness..."
5,105015079,"[lokur, principal, consideration, mere, issuan..."
6,22437097,"[radhakrishnan, appellant, invoke, extraordina..."
7,105912122,"[ks, radhakrishnan, writ, petition, article, i..."
8,63441281,"[ts, petition, special, common, arise, conside..."
9,92195429,"[khehar, allahabad, bank, hereinafter, refer, ..."


In [None]:
word_count_list[0]

In [42]:
import pickle

pickle_on = open("word_count_list.pickle","wb")
pickle.dump(word_count_list, pickle_on)
pickle_on.close()

# with open('', 'wb') as fh:
#     pickle.dump(word_count_list, fh)

In [43]:
# with open('word_count_list.txt', 'rb') as f:
#     mynewlist = pickle.load(f)
# mynewlist

In [44]:
token_sc_india_recent.to_csv('Supreme_Court_cases_India_tokens_above_1999.csv')