## Parse ProQuest Metadata
Python function to parse newspaper articles downloaded from ProQuest Global Newsstream as .txt files in batches of 100.
Created by Cody Hennesy and David Naughton, University of Minnesota, Twin Cities, Libraries.

In [69]:
## Step 1: import libraries
import os
import re
import sys
import csv
import glob

The function below takes a directory of .txt files exported from ProQuest as its argument. Include the trailing slash in the path, as in the example below.
For example: `parsePQ("txt_input/")`

It assumes you will run it from a directory that contains an `output/` folder; one CSV file is written there for each input .txt file.

In [67]:
# line of underscores that ProQuest places between articles in a .txt export
sep = "____________________________________________________________"

# metadata fields copied into each CSV row; add field names here as needed
FIELDNAMES = ['Title', 'Publication title', 'Publication year', 'Document URL', 'Full text']


def _parse_doc(doc, fieldnames):
    """Extract the wanted "Field: value" pairs from one article record.

    Args:
        doc: text of a single article, already stripped of surrounding whitespace.
        fieldnames: field labels to keep; every other label is ignored.

    Returns:
        dict mapping each kept field label to its value. Fields that do not
        appear in the record are simply absent. If a label occurs more than
        once, the last occurrence wins.
    """
    metadata = {}

    # A record is a series of chunks separated by blank lines. The first chunk
    # is the bare article title (it has no "Field: " label), so it is skipped.
    # NOTE(review): a value ends at the first blank line, so a 'Full text' made
    # of several blank-line-separated paragraphs would keep only the first one
    # -- confirm against real exports.
    for chunk in doc.split('\n\n')[1:]:
        # split on the FIRST ": " only, so values may themselves contain colons
        key, found, value = chunk.partition(': ')
        if found and key in fieldnames:
            metadata[key] = value

    return metadata


## function that takes a directory of .txt files from ProQuest as input
def parsePQ(path, output_dir="output/", encoding=None):
    """Convert every ProQuest .txt export in a directory into a CSV file.

    Each input file is split into articles on the module-level ``sep`` line,
    and one CSV row per article is written with the columns in ``FIELDNAMES``.
    The CSV is named after the input file (extension and spaces removed).

    Args:
        path: directory containing the .txt files, with or without a trailing
            slash (e.g. "txt_input/").
        output_dir: directory the CSV files are written to; created if it does
            not exist. Defaults to "output/" under the current directory.
        encoding: text encoding used to read the .txt files and write the CSV
            files (e.g. "utf-8"). None uses the platform default.

    Returns:
        None. Prints "Writing <file id>" once per input file.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # sorted so files are processed (and reported) in a predictable order
    for filename in sorted(glob.glob(os.path.join(path, "*.txt"))):

        # Base name without directory, extension, or spaces, used to name the
        # output file. str.strip(path) is NOT used here: it removes a set of
        # characters from both ends rather than a prefix, which can eat part
        # of (or all of) the file name.
        file_id = os.path.splitext(os.path.basename(filename))[0].replace(" ", "")

        with open(filename, 'r', encoding=encoding) as in_file:
            text = in_file.read()

        # split into single articles; the first piece is empty / leading
        # whitespace and the last piece is the copyright notice, so drop both
        docs = text.split(sep)[1:-1]

        out_path = os.path.join(output_dir, file_id + '.csv')
        with open(out_path, 'w', newline='', encoding=encoding) as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
            writer.writeheader()

            for doc in docs:
                doc = doc.strip()

                # skip any empty docs
                if doc == "":
                    continue

                # fields missing from the record are written as empty cells
                writer.writerow(_parse_doc(doc, FIELDNAMES))

        print("Writing", file_id)

In [68]:
# Run the parser on every .txt export in txt_input/; it prints one "Writing <file id>" line per input file
parsePQ("txt_input/")

Writing ProQuestDocuments-2019-07-29
Writing ProQuestDocuments-2019-07-14
Writing ProQuestDocuments-2019-07-29(1)
