## Working with Microsoft Word using python-docx
### for more information, visit: https://python-docx.readthedocs.io/en/latest/

In [20]:
#!pip install python-docx


In [8]:
from docx import Document

document = Document('sample.docx')

#print(document.paragraphs)

for p in document.paragraphs:
    print(p.text)


Template for Preparation of Papers for IEEE Sponsored Conferences & Symposia
Frank Anderson, Sam B. Niles, Jr., and Theodore C. Donald, Member, IEEE
Abstract—These instructions give you guidelines for preparing papers for IEEE conferences. Use this document as a template if you are using Microsoft Word 6.0 or later. Otherwise, use this document as an instruction set. Instructions about final paper and figure submissions in this document are for IEEE journals; please use this document as a “template” to prepare your manuscript. For submission guidelines, follow instructions on paper submission system as well as the Conference website. Do not delete the blank line immediately above the abstract; it sets the footnote at the bottom of this column.
INTRODUCTION
T
HIS document is a template for Microsoft Word versions 6.0 or later. If you are reading a paper version of this document, please download the electronic file, ieeeconf_letter.dot (for letter sized paper: 8.5” x 11”) or ieeeconf_A4

## Working with Adobe PDF using PyPDF2
### for more information, visit: https://realpython.com/pdf-python/

In [21]:
#!pip install pypdf2



In [57]:
from PyPDF2 import PdfFileReader, PdfFileWriter

def extract_information(pdf_path):
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        page = pdf.getPage(0)
        page.rotateClockwise(90)
        full_text = page.extractText()
    
    pdfWriter = PdfFileWriter()
    pdfWriter.addPage(page)
    with open('rotate_pages.pdf', 'wb') as fh:
        pdfWriter.write(fh)
    

    txt = f"""
    Information about {pdf_path}: 

    Author: {information.author}
    Creator: {information.creator}
    Producer: {information.producer}
    Subject: {information.subject}
    Title: {information.title}
    Number of pages: {number_of_pages}
    """
    
    
    print(txt)
    print(full_text)
    
    return information

if __name__ == '__main__':
    path = 'sample.pdf'
    extract_information(path)
    
    


    Information about sample.pdf: 

    Author: None
    Creator: Rave (http://www.nevrona.com/rave)
    Producer: Nevrona Designs
    Subject: None
    Title: None
    Number of pages: 2
    
 A Simple PDF File  This is a small demonstration .pdf file -  just for use in the Virtual Mechanics tutorials. More text. And more  text. And more text. And more text. And more text.  And more text. And more text. And more text. And more text. And more  text. And more text. Boring, zzzzz. And more text. And more text. And  more text. And more text. And more text. And more text. And more text.  And more text. And more text.  And more text. And more text. And more text. And more text. And more  text. And more text. And more text. Even more. Continued on page 2 ...


## Working with character encodings and HTML

In [51]:
from urllib import request
url = "https://electrek.co/2019/09/10/tesla-new-assembly-line-fremont-factory-model-y-production/"
html = request.urlopen(url).read().decode('utf8')
html[:60]

'<!DOCTYPE html>\n<html lang="en-US">\n\t<head>\n\t\t\t\t<meta charse'

In [54]:
from nltk import word_tokenize
from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'html.parser').get_text()
#print(raw)
tokens = word_tokenize(raw)
tokens

['Tesla',
 'is',
 'working',
 'on',
 '5th',
 'assembly',
 'line',
 'at',
 'Fremont',
 'factory',
 'ahead',
 'of',
 'Model',
 'Y',
 'production',
 '-',
 'Electrek',
 '(',
 'function',
 '(',
 'w',
 ',',
 'd',
 ',',
 's',
 ',',
 'l',
 ',',
 'i',
 ')',
 '{',
 'w',
 '[',
 'l',
 ']',
 '=w',
 '[',
 'l',
 ']',
 '||',
 '[',
 ']',
 ';',
 'w',
 '[',
 'l',
 ']',
 '.push',
 '(',
 '{',
 "'gtm.start",
 "'",
 ':',
 'new',
 'Date',
 '(',
 ')',
 '.getTime',
 '(',
 ')',
 ',',
 'event',
 ':',
 "'gtm.js",
 "'",
 '}',
 ')',
 ';',
 'var',
 'f=d.getElementsByTagName',
 '(',
 's',
 ')',
 '[',
 '0',
 ']',
 ',',
 'j=d.createElement',
 '(',
 's',
 ')',
 ',',
 'dl=l',
 '!',
 "='dataLayer",
 "'",
 '?',
 "'",
 '&',
 "l='+l",
 ':',
 "''",
 ';',
 'j.async=true',
 ';',
 'j.src=',
 "'https",
 ':',
 '//www.googletagmanager.com/gtm.js',
 '?',
 "id='+i+dl",
 ';',
 'f.parentNode.insertBefore',
 '(',
 'j',
 ',',
 'f',
 ')',
 ';',
 '}',
 ')',
 '(',
 'window',
 ',',
 'document',
 ',',
 "'script",
 "'",
 ',',
 "'dataLayer",
 "'

In [43]:
"résumé".encode("utf-8")

b'r\xc3\xa9sum\xc3\xa9'

In [44]:
b"r\xc3\xa9sum\xc3\xa9".decode("utf-8")

'résumé'

In [45]:
"El Niño".encode("utf-8")

b'El Ni\xc3\xb1o'

In [46]:
b'El Ni\xc3\xb1o'.decode("utf-8")

'El Niño'

## Working with Python requests & JSON libraries

In [35]:
#!pip install requests
import requests, json
from pprint import pprint

In [56]:
response = requests.get('https://api.opencorporates.com/companies/nl/17087985')
#response.content

output = json.loads(response.text)

output

{'api_version': '0.4.8',
 'results': {'company': {'name': 'Bover B.V.',
   'company_number': '17087985',
   'jurisdiction_code': 'nl',
   'incorporation_date': None,
   'dissolution_date': None,
   'company_type': 'Besloten Vennootschap',
   'registry_url': 'https://www.kvk.nl/zoeken/handelsregister/#!uitgebreid-zoeken&handelsnaam=&kvknummer=17087985&straat=&postcode=&huisnummer=&plaats=&hoofdvestiging=true&rechtspersoon=true&nevenvestiging=false&zoekvervallen=1&zoekuitgeschreven=1&start=0&initial=0&searchfield=uitgebreidzoeken',
   'branch': None,
   'branch_status': None,
   'inactive': False,
   'current_status': 'Active',
   'created_at': '2011-01-12T21:50:57+00:00',
   'updated_at': '2019-09-02T15:31:37+00:00',
   'retrieved_at': '2019-08-10T01:17:34+00:00',
   'opencorporates_url': 'https://opencorporates.com/companies/nl/17087985',
   'source': {'publisher': 'Kamer van Koophandel (KvK)',
    'url': 'https://www.kvk.nl/zoeken/handelsregister/#!uitgebreid-zoeken&handelsnaam=&kvknu

## Working with Webhose.io API service

In [60]:
import webhoseio, os

In [61]:
webhoseio.config(token=os.environ['WEBHOSE_TOKEN'])
query_params = {
        "q": "site_category:media spam_score:>0.8 organization:Boeing language:english",
        "ts": "1565566757439",
        "sort": "published"
    }

### Query Webhose for a set of 100 posts/feeds (be careful how many times you make this call)

In [62]:
# be careful how many times you make this call
# get the first batch
output = webhoseio.query("filterWebContent", query_params)



In [65]:
feeds = []
for item in output['posts']:
    feeds.append(item)
    print(item['title'], item['url'])

print(len(feeds))



Firefighters battle to contain more than 130 blazes across NSW and Queensland | Australia http://omgili.com/ri/.wHSUbtEfZSJlbPFf2vcZQ7DQwnenE_ns8VukOdqMCKEQ.848e4P3cf4RJuSGPI.tEYIpEE5R8JFRVVjjw4JAI1YeN5ZJH3xS5ja.vvFmY0YtGbt6A9Na2bl8jKuMdTZFM45JZ1Vlx9n_xStl4aZ0ghP0zIOp.QmlBLL7gD0EffE5hP77j3dMQ--
Oman Air unveils transformation plan http://omgili.com/ri/.wHSUbtEfZSGRyTST0EeZOBofaAmEYJJTWYEhITLnB9d95odvmOr1czG8ROTKIH6bhU2mMJy4wbDWh3c1aC1ClJsD3u38tSd4.LkVxcUC0NAEJy5xafSKw--
Hexcel’s HexAM Additive Manufacturing Approved By Boeing For Commercial Aircraft Platforms http://omgili.com/ri/.wHSUbtEfZRucCBe1xUuQ861gsxCsOXqYgbtxS59pnErCxOO61qfpVnaMhK18_uai8rsAy39eVpvVixyN_O1nUzBpYoP6j0yYOaGuVcLZxUDRQF8lTIbCSippwbbHdGGGlKQ394p9UZWn9cSpCSeeGFLdpt7rXMhNzH0fQCCF.gjwb3lrvkON9.TCsI.fyjM
QA to launch direct flight to Luanda http://omgili.com/ri/jHIAmI4hxg9VT5wBnN8v6UGJiac1dcc_CZwvBvxwklhQ0gYyJR_730enFpn.GY9YCbImtJ8o.50-
Qatar Airways to launch direct flights to Luanda from March 29, 2020 http://omgili.co

### Crawl additional batches of articles

In [66]:
count = 2
while count > 0:    
    output = webhoseio.get_next()
    for item in output['posts']:
        feeds.append(item)
    count -= 1
print(len(feeds))



105


In [67]:
import json
with open("ps5430_boeing.json", "w") as myfile:
    for feed in feeds:
        line = json.dumps(feed)
        myfile.write(line)
        myfile.write("\n")

In [68]:
json_data=open("ps5430_boeing.json").readlines()
feeds_read_from_file = []
for line in json_data:
    feeds_read_from_file.append(json.loads(line))
print(len(feeds_read_from_file))

105


In [69]:
for feed in feeds_read_from_file:
    print(feed['title'])

Firefighters battle to contain more than 130 blazes across NSW and Queensland | Australia
Oman Air unveils transformation plan
Hexcel’s HexAM Additive Manufacturing Approved By Boeing For Commercial Aircraft Platforms
QA to launch direct flight to Luanda
Qatar Airways to launch direct flights to Luanda from March 29, 2020
Boeing suspends testing of 777X aircraft after cargo door fails
Latécoère: Filing of the H1 2019 Financial Report
The world's largest underwater theme park opens in Bahrain
Used Serviceable Material (USM) Latest Innovation, Size,
Oman Air cancels more than 300 flights in Sept. due to grounding of Boeing 737 Max planes - IBTimes India
Graybar Joins Arizona State University Center for
Oman Air cancels 300 flights amid ongoing Boeing 737 Max suspension
Obituary: Cletus Ervan Bubel | Obituaries | magicvalley.com
APAC ‘sees biggest demand for airline staff’ | Yemen News
Delta flies to Nassau until Dorian makes it turn around | wcnc.com
Emirates SkyCargo wins special recogn

In [12]:
for feed in feeds_read_from_file[:5]:
    print(feed['title'], feed['published'])
#print(feeds_read_from_file[0]['title'])

Scoot set to add capacity with 16 Airbus 321neo jets, Transport 2019-07-30T15:07:00.000+03:00
New York State Teachers Retirement System Decreases Position in Boeing Co (NYSE:BA) 2019-07-30T05:46:00.000+03:00
As Boeing targets October, US says no timeline for 737 MAX 2019-07-30T05:10:00.000+03:00
DGCA suspends flying licences of 2 SpiceJet pilots for one year 2019-07-30T04:54:00.000+03:00
Aircraft lessor BOC Aviation expects delayed delivery of up to 30 jets 2019-07-30T04:08:00.000+03:00
