### Imports

In [131]:
import os
from pathlib import Path, PurePosixPath
import PyPDF2
import pandas as pd
from tqdm.notebook import tqdm
import re

### Download Indian Constitution PDF File

In [3]:
# Make sure ./data exists and fetch the Constitution PDF into it unless a copy
# is already there. Leaves `file_path` / `my_file` defined for the later cells.
try:
    to_be_downloaded = True
    indian_constition_download_link = 'https://cdnbbsr.s3waas.gov.in/s380537a945c7aaa788ccfcdf1b99b5d8f/uploads/2023/05/2023050195.pdf'
    dir_path = "./data"

    # check directory
    # Create the directory the file is actually written to. The previous
    # `mkdir doc` created an unrelated folder, so on a fresh checkout curl had
    # no ./data directory to save into.
    if not Path(dir_path).is_dir():
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        print("Directory created")
    else:
        print("Directory already exists")

    # check file
    file_path = "./data/India.pdf"
    my_file = Path(file_path)

    if my_file.is_file():
        ext = my_file.suffix
        if ext == '.pdf':
            to_be_downloaded = False
            print("File Exists")
        else:
            print(f"File exists present with extention {ext}")
    else:
        print("File does not exist")

    # download if required
    if to_be_downloaded:
        # --fail: exit non-zero on an HTTP error instead of saving the error page
        # as India.pdf; --location: follow redirects.
        exit_status = os.system(
            f"curl --fail --location -o {file_path} {indian_constition_download_link}"
        )
        # Only report success when curl actually succeeded.
        if exit_status == 0:
            print(f"Downloaded the file at {my_file}")
        else:
            print(f"Download failed (curl exit status {exit_status})")

except Exception as e:
    print(f"Exception occurred: {e}")

Directory already exists
File Exists


### Explore PDF

#### Reading File

In [130]:
# Open the PDF, report its page count, and dump the extracted text of the
# first two pages (indices 0 and 1) as a quick sanity check.
with open(file_path, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    total_pages = len(pdf_reader.pages)
    print(f"PDF page count: {total_pages}")

    # sample output: the two leading pages
    for i in (0, 1):
        page = pdf_reader.pages[i]
        text = page.extract_text()
        print(text)

PDF page count: 404
 
 
 
 
 
 THE CONSTITUTION OF INDIA 
[As on       May , 2022] 
2022 
 
 
 
PREFACE 
 
This is the  fifth  pocket size edition of the Constitution of 
India in the diglot form. In this edition, the text of the 
Constitution of India has been brought up-to-date by 
incorporating therein all the amendments up to the Constitution 
(One Hundred and Fifth Amendment) Act, 2021. The foot notes 
below the text indicate the Constitution Amendment Acts by 
which such amendments have been made.  
The Constitution (One Hundredth Amendment) Act, 2015 
containing details of acquired and transferred territories 
between the Governments of India and Bangladesh has been 
provided in APPENDIX I. 
The Constitution (Application to Jammu and Kashmir) 
Order, 2019 and the declaration under article 370(3) of the 
Constitution have been provided respectively in Appendix II and 
Appendix III for reference. 
 
 
New Delhi;                                              Dr. Reeta Vasishta, 
   

![image](https://cdn.vectorstock.com/i/preview-1x/28/66/india-map-and-indian-flag-oriental-country-vector-28302866.webp)

In [210]:
# Print the extracted text of a single page: index 31.
with open(file_path, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    i = 31
    page = pdf_reader.pages[i]
    text = page.extract_text()
    print(text)

THE CONSTITUTION  OF INDIA 
 
PREAMBLE  
 
WE, THE PEOPLE OF INDIA, having solemnly resolved to constitute 
India into a 1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC 
REPUBLIC] and to secure to all its citizens: 
 
JUSTICE, social, economic and political; 
 
LIBERTY of thought, expression, belief, faith and worship; 
 
EQUALITY of status and of opportunity; 
 
and to promote among them all 
 
FRATERNITY assuring the dignity of the individual and the 2[unity 
and integrity of the Nation]; 
 
IN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of 
November, 1949, do HEREBY ADOPT, ENACT AND GIVE TO 
OURSELVES THIS CONSTITUTION. 
 
______________________________________________ 
1. Subs. by the Constitution (Forty-second Amendment) Act, 1976, s.2, for "SOVEREIGN 
DEMOCRATIC REPUBLIC" (w.e.f. 3-1-1977). 
2. Subs. by s. 2, ibid., for "Unity of the Nation" (w.e.f. 3-1-1977).  


#### Extracting Abbreviations from Page 3<br>

These abbreviations will be used later to expand short forms into their full terms in the main text.

In [50]:
# Pull the raw text of page index 2 into `abbr_text` for the parsing cell below.
with open(file_path, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    page = pdf_reader.pages[2]
    abbr_text = page.extract_text()

# Text is already extracted, so it can be shown after the file is closed.
print(abbr_text)

 
LIST OF ABBREVIATIONS USED 
 
 
Art., arts.  ........................................................  for Article, articles. 
Cl., cls.     ........................................................   ″   Clause, clauses. 
C.O.          ........................................................   ″   Constitution Order. 
Ins.            ........................................................   ″    Inserted. 
P., pp.       ........................................................   ″    Page, pages. 
Pt.             ........................................................   ″    Part. 
Rep.          ........................................................   ″    Repealed. 
Ss., ss.     ..........................................................   ″    Section, sections. 
Sch.         .........................................................   ″    Schedule. 
Subs.         ........................................................   ″    Substituted. 
w.e.f.       ..........................

In [121]:
# Build `abbr_dict` (acronym -> full form) from the raw text of the
# abbreviations page held in `abbr_text`.
lines = abbr_text.split('\n')
abbr_dict = {}

# iterate over all lines w/ abbr
# The slice skips the first four lines and the last line of the page text.
for line in lines[4:-1]:
    # Normalise the row in one pass:
    #   '..'      -> '-'  (the dotted leader becomes a run of dashes used as the separator)
    #   'for'/'″' -> '@'  (both the word "for" and the ditto mark end up as '@')
    #   '  '      -> ''   (double spaces removed)
    #   ','       -> '|'  (separates multiple acronyms / full forms on one row)
    # NOTE(review): "for" is replaced anywhere in the line, and any hyphen in the
    # text would also act as a separator — confirm no entry is affected.
    line = line.replace('..', '-').replace("for", "″").replace("″", '@').replace("  ", "").replace(",", "|")
    parts = line.split('-')
    
    # split key and value pairs
    # First piece holds the acronym(s); last piece holds the full form(s).
    acronym_parts = parts[0].strip().split('|')
    long_parts = parts[-1].replace("@", "").strip().split('|')
    # Each acronym must line up with exactly one full form.
    assert len(acronym_parts) == len(long_parts)

    # store keys and values pairs
    for acronym_part, long_part in zip(acronym_parts, long_parts):
        long_part = long_part.strip()
        # Drop a trailing '.' and a leading '.'; the leading one is presumably a
        # leftover from a dotted leader with an odd number of dots.
        # NOTE(review): both index checks raise IndexError if long_part is empty.
        if long_part[-1] == '.': long_part = long_part[:-1]
        if long_part[0] == '.': long_part = long_part[1:]
        abbr_dict[acronym_part.strip()]  = long_part
            
# manually editing last entry due to extra line
# Sets the 'w.r.e.f.' entry explicitly and removes the stray 'from.' key the loop
# produced; `del` raises KeyError if that key is absent.
abbr_dict['w.r.e.f.'] = 'with retrospective effect from'
del abbr_dict['from.']

In [122]:
# Two-column table of the parsed abbreviations, one row per acronym.
abbr_df = pd.DataFrame(
    {
        'acronym': list(abbr_dict.keys()),
        'fullform': list(abbr_dict.values()),
    }
)
abbr_df

Unnamed: 0,acronym,fullform
0,Art.,Article
1,arts.,articles
2,Cl.,Clause
3,cls.,clauses
4,C.O.,Constitution Order
5,Ins.,Inserted
6,P.,Page
7,pp.,pages
8,Pt.,Part
9,Rep.,Repealed


In [123]:
# Persist the abbreviation table as CSV, without the index column.
abbreviations_csv_path = 'data/abbreviations.csv'
abbr_df.to_csv(path_or_buf=abbreviations_csv_path, index=False)

#### Extracting Title Contents from Pages 4-31

In [204]:
# Parse the table-of-contents pages into three lookups:
#   articles_dict      : article number (e.g. '51A') -> article heading text
#   title_content_dict : 'PART <roman>'              -> text captured after the PART label
#   title_dict         : 'PART <roman>'              -> first article number matched after it
articles_dict = {}
title_content_dict = {}
title_dict = {}

# 0-based page indices scanned for the contents listing (stop is exclusive).
TOC_START_IDX, TOC_STOP_IDX = 3, 31

# "<number><optional letters>. <heading>"; the heading may continue onto
# following lines as long as those lines contain no digits.
article_reg = r'(\d+[A-Za-z]*)(\.\s[^\n]+(?:\n[^\n\d]+)*)'
# "PART <roman numeral>", then the title text, then the article number after it.
parts_reg = r'(PART\s+[IVXLCDM]+[A-Za-z]?)([\s\S]+?)(?=\nPART\s+|\d+[A-Za-z]?-?[A-Za-z]?\.|\Z)(\d+[A-Za-z]?-?[A-Za-z]?\.?)'


def _clean_toc_text(raw):
    """Flatten a regex capture to one line, dropping the U+F0BE glyph and a leading '. '."""
    cleaned = raw.strip().replace('\n', ' ').replace('\uf0be', '')
    # startswith() instead of cleaned[0], so a whitespace-only capture does not
    # raise IndexError.
    if cleaned.startswith('.'):
        cleaned = cleaned[2:].strip()
    return cleaned


with open(file_path, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    total_pages = len(pdf_reader.pages)

    for page_i in range(TOC_START_IDX, TOC_STOP_IDX):
        page = pdf_reader.pages[page_i]
        text = page.extract_text().strip()

        # storing article matches in dict
        article_matches = re.findall(article_reg, text)
        for num, content in article_matches:
            # ignore PART inclusion in content
            articles_dict[num.strip()] = _clean_toc_text(content).split("PART")[0].strip()

        # storing part/title matches in dict
        title_matches = re.findall(parts_reg, text)
        # `first_article` replaces the old loop variable `next`, which shadowed
        # the builtin next() for every later cell in the kernel.
        for part_label, content, first_article in title_matches:
            part_label = part_label.strip()
            title_content_dict[part_label] = _clean_toc_text(content)
            title_dict[part_label] = first_article.replace('.', '')

# manually deleting PART C on page-31
del title_content_dict['PART C'], title_dict['PART C']
# manually correcting the issue in the last article
articles_dict['395'] = articles_dict['395'].split()[0]

In [205]:
len(title_dict), \
    title_dict

(26,
 {'PART I': '1',
  'PART II': '5',
  'PART III': '12',
  'PART IV': '36',
  'PART IVA': '51A',
  'PART V': '52',
  'PART VI': '152',
  'PART VII': '238',
  'PART VIII': '239',
  'PART IX': '243',
  'PART IXA': '243P',
  'PART IXB': '243ZH',
  'PART X': '244',
  'PART XI': '245',
  'PART XII': '264',
  'PART XIII': '301',
  'PART XIV': '308',
  'PART XIVA': '323A',
  'PART XV': '324',
  'PART XVI': '330',
  'PART XVII': '343',
  'PART XVIII': '352',
  'PART XIX': '361',
  'PART XX': '368',
  'PART XXI': '369',
  'PART XXII': '393'})

In [206]:
len(title_content_dict),\
title_content_dict

(26,
 {'PART I': 'THE UNION AND ITS TERRITORY  ARTICLES',
  'PART II': 'CITIZENSHIP',
  'PART III': 'FUNDAMENTAL RIGHTS  General',
  'PART IV': 'DIRECTIVE PRINCIPLES OF STATE POLICY',
  'PART IVA': 'FUNDAMENTAL DUTIES',
  'PART V': 'THE UNION  CHAPTER  I.THE EXECUTIVE  The President and Vice-President',
  'PART VI': 'THE STATES  CHAPTER I.  GENERAL',
  'PART VII': 'Omitted]   THE STATES IN PART B OF THE FIRST SCHEDULE    [',
  'PART VIII': 'THE UNION TERRITORIES',
  'PART IX': 'THE PANCHAYATS',
  'PART IXA': 'THE MUNICIPALITIES',
  'PART IXB': 'THE CO-OPERATIVE SOCIETIES',
  'PART X': 'THE SCHEDULED AND TRIBAL AREAS',
  'PART XI': 'RELATIONS BETWEEN THE UNION AND THE  STATES  CHAPTER I. LEGISLATIVE  RELATIONS  Distribution of Legislative Powers',
  'PART XII': 'FINANCE, PROPERTY, CONTRACTS AND SUITS  CHAPTER I. FINANCE  General',
  'PART XIII': 'TRADE, COMMERCE AND INTERCOURSE  WITHIN THE TERRITORY OF INDIA',
  'PART XIV': 'SERVICES UNDER THE UNION AND THE STATES  CHAPTER I.  SERVICES'

In [207]:
len(articles_dict),\
    articles_dict

(498,
 {'1': 'Name and territory of the Union.',
  '2': 'Admission or establishment of new States.  [',
  '2A': 'Sikkim to be associated with the Union. —Omitted.]',
  '3': 'Formation of new States and alteration of areas, boundaries or  names of existing  States.',
  '4': 'Laws made under articles 2 and 3 to provide for the amendment of  the First and the Fourth Schedules and supplemental, incidental  and consequential matters.',
  '5': 'Citizenship at the commencement of the Constitution.',
  '6': 'Rights of citizenship of certain persons who have migrated to  India from Pakistan.',
  '7': 'Rights of citizenship of certain migrants to Pakistan.',
  '8': 'Rights of citizenship of certain persons of Indian origin residing     outside India.',
  '9': 'Persons voluntarily acquiring citizenship of a foreign State not to  be citizens.',
  '10': 'Continuance of the rights of citizenship.',
  '11': 'Parliament to regulate the right of citizenship by law.',
  '12': 'Definition.',
  '13': 'Law

#### Extracting Articles Content from Page 33-?

In [None]:
# Work-in-progress cell: prints the text of the first page visited and stops.
with open(file_path, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    total_pages = len(pdf_reader.pages)

    # Iterate real page indices starting at 2. The previous form,
    # pages[page_i + 2] over range(total_pages), would index past the end on
    # the last two iterations once the `break` below is removed.
    # NOTE(review): the section heading says the articles start at page 33,
    # but this loop starts at index 2 — confirm the intended start index.
    for page_i in tqdm(range(2, total_pages)):
        page = pdf_reader.pages[page_i]
        text = page.extract_text()
        print(text)
        break