# ESOC Research Document In-Processing

## Document Conversion


All documents came in the form of ".doc" files, but in order to use the more agile docx2txt Python library, a conversion to ".docx" files was necessary:

In [2]:
journals_location = "/home/zayne/PycharmProjects/ESOC/all_journals"


def docToDocx(journals_location):
    """Convert every ".doc" file in a folder to ".docx" using LibreOffice.

    Runs ``soffice --headless --convert-to docx`` once per matching file.
    Files whose names do not end in ".doc" are left alone.

    Args:
        journals_location: Folder holding the ".doc" journals. The parameter
            shadows the module-level constant of the same name, so the
            function works on whatever folder the caller passes in.

    Returns:
        None. The exit status of each ``soffice`` call is not inspected.
    """
    for filename in os.listdir(journals_location):
        if filename.endswith('.doc'):
            # os.listdir returns bare names, so build the full path; otherwise
            # soffice only finds the file when the cwd is the journal folder.
            source = os.path.join(journals_location, filename)
            # --outdir keeps the ".docx" next to its source instead of
            # dropping it in whatever the current working directory is.
            subprocess.call(['soffice', '--headless', '--convert-to', 'docx',
                             '--outdir', journals_location, source])

Once in this format, each of the newly created ".docx" files could be read into Python and split into the individual journal entries contained within the overall time-period journal:

In [3]:
def allDocxToCsv():
    """Run journalEntries() on every ".docx" file in journals_location.

    Changes the process working directory to journals_location first,
    because journalEntries() is handed bare filenames and both reads the
    ".docx" and writes the ".csv" relative to the current directory.

    Returns:
        None. Prints a completion message when all files are processed.
    """
    os.chdir(journals_location)
    for filename in os.listdir(journals_location):
        if filename.endswith('.docx'):
            journalEntries(filename)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("All .docx to .csv")

The "journalEntries()" method uses regular expressions to separate each journal document into the individual entries.

In [4]:
import subprocess
import os
import docx2txt
import re
import operator
import pandas as pd

def journalEntries(filename):
    """Split one journal ".docx" into entries and write them to a ".csv".

    The document text is cut at every line that holds a five-digit journal
    code (optionally preceded by up to three non-digit characters). Each
    entry is turned into one row by entryToCsv() and the journal code is
    stored in the first column. The result is written next to the input,
    with the extension replaced by ".csv" and the column names as first row.

    Args:
        filename: Path of the ".docx" file, resolved against the current
            working directory.

    Returns:
        None. The output is the ".csv" file written to disk.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("{} being converted to csv.".format(filename))
    # Import the docx
    journal_all = docx2txt.process(filename)
    # Regular expression that finds the journal-code line opening each entry
    re_entry = re.compile(r"\n\D{0,3}\d{5}\n")
    # Collect journal codes
    journal_codes = re_entry.findall(journal_all)
    # Text before the first code is the document header; drop it
    journal_texts = re_entry.split(journal_all)[1:]
    # First row of the output carries the column names
    output = [["Journal Code", "Unified Command", "Region", "Province", "Type of Engagement", "Reference", "Report RN", "Date", "Summary Report", "Enemy Side", "Government Side", "Civilian Side", "Firearms Gained", "Firearms Losses", "Items Recovered / Loss", "Other Details", "Action Taken"]]
    for code, text in zip(journal_codes, journal_texts):
        # Split on "\n" to agree with the entry regex above (os.linesep is
        # "\r\n" on Windows and would never match). Build a real list, since
        # entryToCsv indexes the blocks and filter() is lazy on Python 3.
        blocks = [block for block in text.split("\n") if block]
        # Call method to split each entry into csv rows
        out = entryToCsv(blocks)
        # Add journal code
        out[0] = code.strip("\n")
        # Append to overall entry array
        output.append(out)
    # Turn array into pandas dataframe for ease of export
    df = pd.DataFrame(output)
    # Swap only the final extension; str.replace would rewrite every ".docx"
    # in the name and would leave a name without ".docx" unchanged, making
    # the csv overwrite the source document.
    csv_name = os.path.splitext(filename)[0] + ".csv"
    df.to_csv(csv_name, header=False, index=False, encoding='utf-8')

Each journal entry follows the basic format depicted in the image below: <img src="ESOC_Journal_Template.png"> The "journalEntries()" method calls the "entryToCsv()" method, which uses additional regular expressions and Python string manipulation to turn each entry's set of blocks into a row of a csv with the data organized into the desired columns.

In [5]:
# Sub-headings that may follow the summary, in output-column order: the
# heading at position k is written to row column 9 + k.
_DETAIL_HEADINGS = ("Enemy Side:", "Government Side:", "Civilian Side:", "Firearms Gains:",
                    "Firearms Losses:", "Items Recovered / Loss: ", "Other Details:", "Action Taken:")


def entryToCsv(blocks):
    """Turn the text blocks of one journal entry into a 17-column csv row.

    Row layout: 0 journal code (left as None for the caller to fill),
    1 unified command, 2 region, 3 province, 4 type of engagement,
    5 reference, 6 report RN, 7 date, 8 summary report, 9-16 the
    sub-heading sections in the order of _DETAIL_HEADINGS.

    Args:
        blocks: Sequence of non-empty text lines of a single entry. It is
            not modified.

    Returns:
        A list of exactly 17 items; columns with no data hold None and an
        unparseable date is recorded as "~".

    Raises:
        IndexError: If the entry has fewer than three blocks, or one of the
            first two blocks has no "(" separating its two fields.
    """
    # One slot per output column. The original allocated 16 and only reached
    # 17 as a side effect of a slice assignment, so rows could come out
    # shorter, longer or shifted depending on the reference line.
    row = [None] * 17
    # Data cleaning for stray parenthesis blocks; work on a copy so the
    # caller's list is untouched.
    blocks = [block for block in blocks if block != ")"]

    # Unified Command and Region: "<command>(<region>)"
    commReg = blocks[0].split("(", 1)
    row[1] = commReg[0]
    row[2] = commReg[1].strip(")")

    # Province and Type of Engagement: "<province>(<engagement>)"
    provEng = blocks[1].split("(", 1)
    row[3] = provEng[0]
    row[4] = provEng[1].strip(")")

    # Reference details. No trailing "|" in the pattern: an empty alternative
    # makes re.split cut between every character on Python 3.7+.
    refDetails = re.split(r'Ref: |Report RN: |dtd', blocks[2])[1:]
    # Everything but the last piece is [Reference, Report RN]; assign by
    # position so a missing or extra field cannot shift the other columns.
    refFields = refDetails[:-1]
    if len(refFields) > 0:
        row[5] = refFields[0]
    if len(refFields) > 1:
        row[6] = refFields[1]

    # Get and clean date. Fall back to the raw line when no marker matched,
    # so the diagnostic below shows what failed to parse.
    dateSource = refDetails[-1] if refDetails else blocks[2]
    dateMatch = re.search(r'\d{2} \D{3} \d{4}', dateSource)
    if dateMatch:
        row[7] = dateMatch.group(0)
    else:
        print("Error", dateSource)
        row[7] = "~"

    # Return if no main report (need to investigate cases)
    if len(blocks) < 4:
        return row

    # Write Summary Report; return if only a main report
    # (need to investigate cases)
    row[8] = blocks[3]
    if len(blocks) < 5:
        return row

    # Block 4 continues the summary only when it contains none of the
    # sub-headings. (The original used any(), which is true for nearly every
    # block and so appended the first sub-heading to the summary.)
    if all(heading not in blocks[4] for heading in _DETAIL_HEADINGS):
        row[8] = row[8] + blocks[4]

    # Locate the sub-headings that are present, ordered by position
    present = sorted((blocks.index(heading), heading)
                     for heading in _DETAIL_HEADINGS if heading in blocks)

    for position, (start, heading) in enumerate(present):
        if position + 1 < len(present):
            # Section runs up to the next present sub-heading
            nextStart = present[position + 1][0]
            detailText = "/".join(blocks[start + 1:nextStart])
        else:
            # Last present sub-heading: collect to the end of the entry.
            # NOTE(review): joined with " " while the other sections use "/";
            # kept as in the original, confirm whether that is intended.
            detailText = " ".join(blocks[start + 1:])
        row[9 + _DETAIL_HEADINGS.index(heading)] = detailText
    # Return array to be written to csv row
    return row

Once this process is complete, each ".docx" document will have a corresponding ".csv" file that has its data parsed into the appropriate columns. These are then combined into a single ".csv" for ease of access and analysis:

In [6]:
def allCsvtoMaster():
    """Combine every per-journal ".csv" in journals_location into one file.

    Changes the working directory to journals_location, reads each ".csv"
    there (first row taken as the header) and writes the concatenation to
    "ESOC_All_Data_2010_2012_Raw.csv" in the same folder.

    Returns:
        None. The output is the master csv written to disk.
    """
    master_name = "ESOC_All_Data_2010_2012_Raw.csv"
    os.chdir(journals_location)
    # Skip the master file itself: it lives in the scanned folder and ends
    # in ".csv", so a second run would otherwise read it back and duplicate
    # every row. Sorting makes the row order reproducible between runs.
    frames = [pd.read_csv(filename, header=0)
              for filename in sorted(os.listdir(journals_location))
              if filename.endswith('.csv') and filename != master_name]
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated data on every iteration. concat rejects an
    # empty list, so keep the original "empty frame" result for that case.
    all_data = pd.concat(frames, ignore_index=False) if frames else pd.DataFrame()
    # Write to a master list
    all_data.to_csv(master_name, header=True, index=False, encoding='utf-8')

In [7]:
# Run the conversion: one .csv is written for every .docx in journals_location.
allDocxToCsv()

Journal-Jan 12.docx being converted to csv.
Journal-Jan 10.docx being converted to csv.
Journal-Feb 10.docx being converted to csv.
Journal-Aug 10.docx being converted to csv.
Journal-Sep 10.docx being converted to csv.
Journal-Feb 11.docx being converted to csv.
Journal-Mar 10.docx being converted to csv.
Journal-Jan 11.docx being converted to csv.
Journal-Apr 12.docx being converted to csv.
Journal-Feb 12.docx being converted to csv.
Journal-Nov 12.docx being converted to csv.
Journal-Oct 10.docx being converted to csv.
Journal-Mar 11.docx being converted to csv.
Journal-Nov 11.docx being converted to csv.
Journal-May 12.docx being converted to csv.
Journal-Dec 12.docx being converted to csv.
Journal-Oct 12.docx being converted to csv.
Journal-Dec 11.docx being converted to csv.
Journal-Mar 12.docx being converted to csv.
Journal-Apr 10.docx being converted to csv.
Journal-Jun 10.docx being converted to csv.
Journal-Jul 11.docx being converted to csv.
Journal-Jun 11.docx being conver

In [8]:
# Merge the per-journal .csv files into ESOC_All_Data_2010_2012_Raw.csv.
allCsvtoMaster()