# <img style="float: right" src="http://geosyntec.com/images/geosyntec-logo-02.png"><br> <div style='color:gray'>List of Acronyms Processor, ORG 2810</div>

-----
<b>Author: Curtis Fang</b><br>
Geosyntec ext.6155<br>
Date: 2017-06-26 (ISO 8601)

<b>QAQC: None</b><br>
Geosyntec ext.xxxx<br>
Date: yyyy-mm-dd (ISO 8601)

-----
<b>Change Log</b><br>
Modified by: <br>
Geosyntec ext.xxxx<br>
Date: yyyy-mm-dd(ISO 8601)<br>
    
Changes made:

-----
<b>Known Issues:</b><br>
MS4 <br>
Prop O (not recognized as an abbreviation, need to be added to the pattern definition) <br> 
Maybe start by importing an existing acronym database? <br>


## Import Libraries 

In [1]:
#Make sure to do 'pip install python-docx' in your conda env if not already installed

import docx
import re
import numpy as np
import pandas as pd
from pandas import Series, DataFrame 

## File Directory 

In [7]:
# Specify input file to be analyzed. Maek sure the input file is placed in the same file directory as the notebook 
input_file = 'Starkist Samoa Company SWPPP Plan 12052017.docx'

## Helper Function

In [8]:
pattern = r'([A-Z][a-zA-Z0-9+\.\&]*[A-Z0-9])\W' #This is the regex pattern that finds acronyms - this can be improved (maybe)

def getText(filename):
    # Import and consolidate all texts into a single string object
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

def defineAcro(df_acronyms, all_text):
    #Find indexs of all spaces in the string object
    space_index = []
    for m in re.finditer(' ', all_text):
        space_index.append(m.start())
    space_index = Series(space_index)
    
    # Extract acronym defintion from the string object
    for index,row in df_acronyms.iterrows():
        #Locate the acronym
        acronym_index = all_text.upper().find(u'(' + row['acronym'].upper())
        #QA/QC - Record intermediate result
        df_acronyms.set_value(index, 'acronym_index', acronym_index)
        if acronym_index == -1:
            # Handles when the acronym is not found inside a bracket
            df_acronyms.set_value(index, 'definition', '#undefined/not an acronym'.upper())
        else:
            #Find the index of the closest space to the acrnoym
            last_space_index = max(space_index[space_index<acronym_index])
            #QA/QC - Record intermediate result
            df_acronyms.set_value(index, 'last_space', last_space_index)

            # Jump over sufficient number of spaces to the left and set the starting position to traverse forward
            try:
                start_search = space_index[space_index[space_index==last_space_index].index[0]-row['length']] 
            except KeyError:
                # KeyError will be raised if the very first few words are given an ancronym. It is handled by hard
                # setting start_search index to 0
                start_search = 0
            #QA/QC - Record intermediate result
            df_acronyms.set_value(index, 'start_search', start_search)

            #Traverse left word to find the first word that starts with the first letter of acronym
            start_def = all_text[start_search:acronym_index].find(row['acronym'][0])
            if start_def == -1:
                # Handels imperfect user input where defintion is not capitalized
                start_def = all_text[start_search:acronym_index].find(row['acronym'].lower()[0])
                
            #QA/QC - Record intermediate result    
            df_acronyms.set_value(index, 'start_def', start_def+start_search)

            #Validate and extract the definition
            definition_candidate = all_text[(start_search+start_def):acronym_index].title().encode('utf-8')
            #Exclude one-word and short definition candidate
            min_length = 10
            if len(definition_candidate) < min_length or " " not in definition_candidate:
                df_acronyms.set_value(index, 'definition', '#undefined/not an acronym'.upper())
            else:
                df_acronyms.set_value(index, 'definition', definition_candidate)
    return df_acronyms

## Execution

In [11]:
#Load Document
all_text =  getText(input_file)

#Find acronyms using regex
acronyms = re.findall(pattern, all_text)
acronyms = set(acronyms)
acronyms = list(acronyms)

#Create a dataframe template in prepration for definition extraction
df_acronyms = DataFrame([acronyms,list(Series(acronyms).str.len())])
df_acronyms= df_acronyms.T
df_acronyms.columns = ['acronym', 'length']

#Extract defintion
defineAcro(df_acronyms,all_text)

Unnamed: 0,acronym,length,acronym_index,definition,last_space,start_search,start_def
0,BAT,3,-1.0,#UNDEFINED/NOT AN ACRONYM,,,
1,CORRECTIVE,10,-1.0,#UNDEFINED/NOT AN ACRONYM,,,
2,CONSIDERATIONS,14,-1.0,#UNDEFINED/NOT AN ACRONYM,,,
3,SUPPORT,7,-1.0,#UNDEFINED/NOT AN ACRONYM,,,
4,MODIFICATIONS,13,-1.0,#UNDEFINED/NOT AN ACRONYM,,,
5,SWPPP,5,-1.0,#UNDEFINED/NOT AN ACRONYM,,,
6,SIC,3,1661.0,Standard Industrial Classification,1660.0,1625.0,1626.0
7,COC,3,24044.0,Chain Of Custody,24043.0,24026.0,24027.0
8,CONTACT,7,-1.0,#UNDEFINED/NOT AN ACRONYM,,,
9,BPT,3,12096.0,Based Effluent Limits,12095.0,12062.0,12074.0


In [10]:
#Save acrnoyms to a csv
df_acronyms.to_csv('output/Acronyms_StarKist_full.csv')
df_acronyms.to_csv('output/Acronyms_StarKist_clean.csv',columns=['acronym','definition'],index=False)

In [6]:
sub_df = df_acronyms[df_acronyms['acronym_index']<>-1 ]
sub_df[sub_df['definition']=='#UNDEFINED/NOT AN ACRONYM']

Unnamed: 0,acronym,length,acronym_index,definition,last_space,start_search,start_def


## Export script to html

In [7]:
!jupyter nbconvert --to HTML AcronymsProcessor_Py34_Master.ipynb

[NbConvertApp] Converting notebook AcronymsProcessor_Py34.ipynb to HTML
[NbConvertApp] Writing 263720 bytes to AcronymsProcessor_Py34.html
