**Extracting Text from PDFs**

In [1]:
import PyPDF2
# Open the PDF in binary mode. The handle is kept open on purpose: the next
# cells read pages through pdfReader, and a later cell closes pdfFileObj.
pdfFileObj = open('meetingminutes.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# Last expression of the cell: displays the document's page count.
pdfReader.numPages

19

In [3]:
# Fetch the first page (index 0) and display the text extracted from it.
# As the recorded output shows, the extracted text can contain doubled
# characters and line breaks in odd places.
pageObj = pdfReader.getPage(0)
pageObj.extractText()

'OOFFFFIICCIIAALL  BBOOAARRDD  MMIINNUUTTEESS   Meeting of \nMarch 7\n, 2014\n        \n     The Board of Elementary and Secondary Education shall provide leadership and \ncreate policies for education that expand opportunities for children, empower \nfamilies and communities, and advance Louisiana in an increasingly \ncompetitive glob\nal market.\n BOARD \n of ELEMENTARY\n and \n SECONDARY\n EDUCATION\n  '

In [4]:
# Done reading: release the handle that was opened for meetingminutes.pdf.
pdfFileObj.close()

**Decrypting PDFs**

In [5]:
import PyPDF2
# NOTE(review): the file object created inline here is never bound to a name,
# so nothing in the notebook can close it explicitly.
pdfReader = PyPDF2.PdfFileReader(open('encrypted.pdf', 'rb'))
# Last expression of the cell: displays whether the PDF is encrypted.
pdfReader.isEncrypted

True

In [6]:
# Intentional failure: requesting a page before decrypt() has been called
# raises PdfReadError ("file has not been decrypted").
pdfReader.getPage(0)



PdfReadError: file has not been decrypted

In [9]:
# Build a fresh reader for the encrypted file, then unlock it.
# NOTE(review): the password is hard-coded for the demo and the inline file
# object is never closed; avoid both outside of a tutorial.
pdfReader = PyPDF2.PdfFileReader(open('encrypted.pdf', 'rb'))
# Last expression of the cell: displays decrypt()'s return value
# (1 in the recorded output).
pdfReader.decrypt('rosebud')

1

In [10]:
# Same call that failed before decrypt(); no error is recorded for it now.
pageObj = pdfReader.getPage(0)

**Creating PDFs**

In [11]:
import PyPDF2
# Concatenate meetingminutes.pdf and meetingminutes2.pdf into
# combinedminutes.pdf.
#
# Every file is opened in a `with` block so the handles are released even if
# reading or writing raises. As in the original ordering, the input files
# stay open until write() has finished.
with open('meetingminutes.pdf', 'rb') as pdf1File, \
        open('meetingminutes2.pdf', 'rb') as pdf2File:
    pdf1Reader = PyPDF2.PdfFileReader(pdf1File)
    pdf2Reader = PyPDF2.PdfFileReader(pdf2File)
    pdfWriter = PyPDF2.PdfFileWriter()

    # Copy every page of the first document, then every page of the second.
    for sourceReader in (pdf1Reader, pdf2Reader):
        for pageNum in range(sourceReader.numPages):
            pdfWriter.addPage(sourceReader.getPage(pageNum))

    # The output file is created only after both inputs were read successfully.
    with open('combinedminutes.pdf', 'wb') as pdfOutputFile:
        pdfWriter.write(pdfOutputFile)

**Rotating PDFs**

In [12]:
import PyPDF2
# Rotate the first page of meetingminutes.pdf by 90 degrees clockwise and
# save it as the one-page file rotatedPage.pdf.
#
# `with` guarantees both handles are closed even if a step raises. As in the
# original ordering, the source file stays open until write() has finished.
with open('meetingminutes.pdf', 'rb') as minutesFile:
    pdfReader = PyPDF2.PdfFileReader(minutesFile)
    page = pdfReader.getPage(0)
    page.rotateClockwise(90)

    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(page)
    with open('rotatedPage.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)

**Overlaying PDFs**

In [14]:
import PyPDF2
# Merge the first page of watermark.pdf onto the first page of
# meetingminutes.pdf, copy the remaining pages unchanged, and write the
# result to watermarkedCover.pdf.
#
# The watermark file is now opened under a name inside `with`, so its handle
# is closed too, and every handle is released even if a step raises.
with open('meetingminutes.pdf', 'rb') as minutesFile, \
        open('watermark.pdf', 'rb') as watermarkFile:
    pdfReader = PyPDF2.PdfFileReader(minutesFile)
    pdfWatermarkReader = PyPDF2.PdfFileReader(watermarkFile)

    minutesFirstPage = pdfReader.getPage(0)
    minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0))

    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(minutesFirstPage)

    # Start at index 1: page 0 was already added in its watermarked form.
    for pageNum in range(1, pdfReader.numPages):
        pdfWriter.addPage(pdfReader.getPage(pageNum))

    with open('watermarkedCover.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)

**Encrypting PDFs**

In [15]:
import PyPDF2

# Copy every page of meetingminutes.pdf into a new writer, encrypt it, and
# save the result as encryptedminutes.pdf.
#
# The input handle was previously never closed; `with` now releases both
# files, including when a step raises.
# NOTE(review): the password is hard-coded for the demo; read it from the
# environment or a prompt in real use.
with open("meetingminutes.pdf", "rb") as pdfFile:
    pdfReader = PyPDF2.PdfFileReader(pdfFile)
    pdfWriter = PyPDF2.PdfFileWriter()

    for pageNum in range(pdfReader.numPages):
        pdfWriter.addPage(pdfReader.getPage(pageNum))

    # encrypt() is called before write() so the saved file is protected.
    pdfWriter.encrypt('swordfish')
    with open("encryptedminutes.pdf", "wb") as resultPdf:
        pdfWriter.write(resultPdf)

**Word Documents**

In [20]:
%pip install python-docx
import docx
doc = docx.Document('demo.docx')
len(doc.paragraphs)

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2
Note: you may need to restart the kernel to use updated packages.


7

In [21]:
# Text of the first paragraph.
doc.paragraphs[0].text

'Document Title'

In [22]:
# Text of the second paragraph; the following cells inspect it run by run.
doc.paragraphs[1].text

'A plain paragraph with some bold and some italic'

In [23]:
# Number of runs that make up the second paragraph.
len(doc.paragraphs[1].runs)

5

In [24]:
# Text of run 0 of the second paragraph.
doc.paragraphs[1].runs[0].text

'A plain paragraph with'

In [25]:
# Text of run 1; the surrounding spaces belong to this run.
doc.paragraphs[1].runs[1].text

' some '

In [26]:
# Text of run 2 of the second paragraph.
doc.paragraphs[1].runs[2].text

'bold'

In [27]:
# Text of run 3; again the surrounding spaces are part of the run.
doc.paragraphs[1].runs[3].text

' and some '

In [28]:
# Text of run 4, the last run of the second paragraph.
doc.paragraphs[1].runs[4].text

'italic'

**Getting the Full Text from a .docx File**

In [29]:
import docx

def getText(filename):
    """Return the text of every paragraph in a .docx file, one per line.

    Opens ``filename`` with ``docx.Document`` and joins the ``text`` of each
    paragraph, in document order, with a newline character.
    """
    paragraphs = docx.Document(filename).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

# print() is used so the '\n' separators show up as real line breaks.
print(getText('demo.docx'))

Document Title
A plain paragraph with some bold and some italic
Heading, level 1
Intense quote
first item in unordered list
first item in ordered list



**Run Attributes**

In [30]:
import docx

# Re-open demo.docx so the styling changes in the following cells start from
# the file on disk.
doc = docx.Document("demo.docx")
# Last expression of the cell: displays the first paragraph's text.
doc.paragraphs[0].text

'Document Title'

In [31]:
# Displays the style object currently applied to the first paragraph.
doc.paragraphs[0].style

_ParagraphStyle('Title') id: 2883600526816

In [32]:
# Restyle the first paragraph by assigning a style name.
doc.paragraphs[0].style = 'Normal'

In [33]:
# Text of the second paragraph, whose runs are restyled in the next cells.
doc.paragraphs[1].text

'A plain paragraph with some bold and some italic'

In [34]:
# Display the text of the first four runs of the second paragraph as a tuple.
(
    doc.paragraphs[1].runs[0].text,
    doc.paragraphs[1].runs[1].text,
    doc.paragraphs[1].runs[2].text,
    doc.paragraphs[1].runs[3].text,
)

('A plain paragraph with', ' some ', 'bold', ' and some ')

In [35]:
# Apply a character style to run 0 of the second paragraph.
# NOTE(review): 'QuoteChar' looks like a style id rather than a display name;
# python-docx may warn that lookup by id is deprecated. Confirm whether
# 'Quote Char' is the intended name in demo.docx.
doc.paragraphs[1].runs[0].style = 'QuoteChar'



In [36]:
# Underline run 1 of the second paragraph.
doc.paragraphs[1].runs[1].underline = True

In [37]:
# Underline run 3 of the second paragraph.
doc.paragraphs[1].runs[3].underline = True

In [38]:
# Save under a new filename rather than back to demo.docx.
doc.save('restyled.docx')

**Writing Word Documents**

In [39]:
# Document() with no argument starts a new document rather than opening a file.
doc = docx.Document()
doc.add_paragraph('Hello world!')
doc.save('helloworld.docx')

In [41]:
# Build a three-paragraph document and save it as multipleParagraphs.docx.
doc = docx.Document()
# The second argument to add_paragraph() is the paragraph style name.
doc.add_paragraph('Hello, world!', 'Title')
# Keep the returned paragraph objects so more text can be appended to them.
paraObj1 = doc.add_paragraph('This is a second paragraph.')
paraObj2 = doc.add_paragraph('This is a yet another paragraph.')  # not used again in this cell
# add_run() appends text to the existing second paragraph.
paraObj1.add_run(' This text is being added to the second paragraph.')
doc.save('multipleParagraphs.docx')

**Adding Headings**

In [42]:
# Create a document containing one heading for each level 0 through 4,
# labelled 'Header <level>', and save it as headings.docx.
doc = docx.Document()
for headingLevel in range(5):
    doc.add_heading(f'Header {headingLevel}', headingLevel)
doc.save('headings.docx')

**Adding Line and Page Breaks**

In [43]:
# Create a two-page document: a page break is appended to the run of the
# first paragraph, so the second paragraph starts on a new page.
doc = docx.Document()
firstPara = doc.add_paragraph('This is on the first page!')
# Use the documented home of the enum (docx.enum.text) instead of reaching it
# through docx.text.run, which exposes it only because that module imports it.
firstPara.runs[0].add_break(docx.enum.text.WD_BREAK.PAGE)
doc.add_paragraph('This is on the second page!')
doc.save('twoPage.docx')

**Adding Pictures**

In [44]:
# Insert zophie.png with an explicit width (1 inch) and height (4 cm), then
# save the document as zophie.docx.
doc = docx.Document()
pictureWidth = docx.shared.Inches(1)
pictureHeight = docx.shared.Cm(4)
doc.add_picture('zophie.png', width=pictureWidth, height=pictureHeight)
doc.save('zophie.docx')

**Create PDFs from Word Documents**

In [None]:
# This script runs on Windows only, and you must have Word installed.
import os
import win32com.client # install with "pip install pywin32==224"
import docx

# Raw strings keep the backslash literal ('\z' is not a valid escape sequence
# and triggers a warning on recent Python versions). The paths are made
# absolute because Word runs as a separate process and does not share this
# notebook's working directory.
wordFilename = os.path.abspath(r'Chapter15\zophie.docx')
pdfFilename = os.path.abspath(r'Chapter15\zophie.pdf')

doc = docx.Document()
# Code to create Word document goes here.
doc.save(wordFilename)

wdFormatPDF = 17 # Word's numeric code for PDFs.
wordObj = win32com.client.Dispatch('Word.Application')
try:
    docObj = wordObj.Documents.Open(wordFilename)
    try:
        docObj.SaveAs(pdfFilename, FileFormat=wdFormatPDF)
    finally:
        # Close the document even if the export fails.
        docObj.Close()
finally:
    # Always shut Word down so a failed run does not leave a background
    # Word process behind.
    wordObj.Quit()