In [None]:
from IPython.display import display, HTML 

import html
import difflib
import os.path
import pickle
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

In [None]:
# Read-only OAuth scopes requested for the Docs and Drive APIs.
# If this list is changed, delete token.pickle so that the cached
# credentials are discarded and the authorization flow runs again.
SCOPES = [
    'https://www.googleapis.com/auth/documents.readonly',
    'https://www.googleapis.com/auth/drive.readonly',
]

In [None]:
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
# NOTE(security): pickle.load can execute arbitrary code while loading, so
# only ever point this at a token.pickle that this notebook wrote itself.
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        creds = pickle.load(token)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        # Expired but refreshable: renew silently, no browser round-trip.
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        # port=0 asks the OS for any free port. The previously hard-coded
        # port 6666 is on the browsers' blocked-port list (6665-6669), so the
        # OAuth redirect back to the local server could be refused.
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

# API clients shared by the rest of the notebook.
doc_service = build('docs', 'v1', credentials=creds)
drive_service = build('drive', 'v3', credentials=creds)

# Shortcut to the Docs "documents" resource used to fetch document bodies.
documents = doc_service.documents()

In [None]:
# Find the Drive folder that holds the risk assessments. Trashed folders are
# excluded so that a deleted folder with the same name is never picked up.
q = """
mimeType='application/vnd.google-apps.folder'
and name='Covid-19 risk assessments'
and trashed=false
"""
folder = drive_service.files().list(q=q).execute()
folder_matches = folder.get('files', [])
if not folder_matches:
    # Fail with an explanation instead of a bare IndexError on [0].
    raise ValueError(
        "No Drive folder named 'Covid-19 risk assessments' was found")
# If several folders share the name, the first one returned is used.
folder_id = folder_matches[0]['id']

In [None]:
# List every (non-trashed) file directly inside the folder, following
# nextPageToken so that folders with more than one page of results are
# handled instead of aborting.
q = '"%s" in parents and trashed=false' % folder_id
child_files = []
page_token = None
while True:
    children = drive_service.files().list(q=q, pageToken=page_token).execute()
    # incompleteSearch means Drive could not search everything; pagination
    # cannot recover from that, so still refuse to continue.
    if children.get('incompleteSearch'):
        raise ValueError("I've lost some of the files!")
    child_files.extend(children.get('files', []))
    page_token = children.get('nextPageToken')
    if not page_token:
        break

# Map file name -> file id. Files sharing a name collapse to the last one.
file_ids = {child['name']: child['id'] for child in child_files}


In [None]:
# Download the document resource for each listed file, keyed by file name.
files = {}
for doc_name, doc_id in file_ids.items():
    files[doc_name] = documents.get(documentId=doc_id).execute()

In [None]:
def read_paragraph_element(element):
    """Returns the text in the given ParagraphElement.

        Args:
            element: a ParagraphElement from a Google Doc.

        Returns:
            The content string of the element's text run, or '' when the
            element has no text run or the text run has no content.
    """
    text_run = element.get('textRun')
    if not text_run:
        return ''
    # A text run without a 'content' key would otherwise yield None, which
    # breaks callers that concatenate the result onto a string.
    return text_run.get('content') or ''


def _collect_structural_text(elements):
    """Concatenates the raw (unescaped) text of a list of Structural Elements.

        Recurses into table cells and tables of contents, whose text lives in
        nested Structural Elements.

        Args:
            elements: a list of Structural Elements (None is treated as empty).

        Returns:
            A single string with the text of all elements in document order.
    """
    parts = []
    for value in elements or []:
        if 'paragraph' in value:
            for elem in value['paragraph'].get('elements', []):
                # `or ''` guards against a paragraph element reader that
                # returns None for a text run without content.
                parts.append(read_paragraph_element(elem) or '')
        elif 'table' in value:
            # The text in table cells are in nested Structural Elements and
            # tables may be nested.
            for row in value['table'].get('tableRows', []):
                for cell in row.get('tableCells', []):
                    parts.append(_collect_structural_text(cell.get('content')))
        elif 'tableOfContents' in value:
            # The text in the TOC is also in a Structural Element.
            parts.append(
                _collect_structural_text(value['tableOfContents'].get('content')))
    return ''.join(parts)


def read_structural_elements(elements):
    """Reads a document's text from a list of Structural Elements, where text
        may be in nested elements, and returns it as HTML-escaped lines.

        The recursion happens on plain strings in a private helper; escaping
        and line splitting are applied exactly once here. Previously the
        function recursed into itself and tried to add its own list result to
        a string, which raised TypeError for any table or table of contents.

        Args:
            elements: a list of Structural Elements.

        Returns:
            A list of strings: the text with &, < and > HTML-escaped, split
            on newline characters.
    """
    return html.escape(_collect_structural_text(elements), quote=False).split('\n')

In [None]:
# Extract each document's lines, then dump them to a file named after the
# document in the current working directory.
text = {k: read_structural_elements(v['body']['content']) for k, v in files.items()}
for name, entries in text.items():
    # Text mode already translates '\n' to the platform line ending, so
    # writing os.linesep would produce '\r\r\n' on Windows. The encoding is
    # pinned to UTF-8 so non-ASCII document text does not depend on the
    # platform default codec.
    with open(name, 'w', encoding='utf-8') as fd:
        fd.writelines(x + '\n' for x in entries)

In [None]:
# Pool every line found in any document, leaving out lines that contain the
# name of the document they came from.
all_entries = set()
for name, entries in text.items():
    for line in entries:
        if name not in line:
            all_entries.add(line)

# Report, per document, the pooled lines that do not occur in it.
for name, entries in text.items():
    missing = all_entries.difference(entries)
    if not missing:
        continue
    print('%s is missing:' % name)
    for line in missing:
        print(line)
    print()

In [None]:
# Side-by-side HTML diff of two of the documents.
differ = difflib.HtmlDiff(tabsize=4)
# The extracted lines are already HTML-escaped, and HtmlDiff escapes &, < and
# > itself. Undo the first escape so those characters are not shown as
# literal entities (e.g. "&amp;") in the rendered table.
diff = differ.make_table(
    [html.unescape(line) for line in text['Drake']],
    [html.unescape(line) for line in text['Chaundra']],
    fromdesc='Drake',
    todesc='Chaundra',
)
display(HTML(diff))