In [1]:

%pprint
import os
import os.path as osp
import sys

# Make the sibling py folder importable. os and osp are imported here (not only in the
# next cell) because this cell runs first on a fresh kernel and uses both names.
py_folder = osp.join(os.pardir, 'py')
if (py_folder not in sys.path): sys.path.insert(1, py_folder)

Pretty printing has been turned OFF


In [2]:

import os
from xml.sax.saxutils import escape

from IPython.display import clear_output

from FRVRS import (nu, osp, re, walk, sep)


# Get APA References

We previously downloaded all the documents from https://nextcentury.atlassian.net/wiki/spaces/ITMC/pages/2991849482/Domain+Documents, converted them to TXT files, and stored them in the Domain_Knowledge folder.

In [3]:

# Look for the author name in the ENT phrase and set the ENT type to PERSON if it's there
# NOTE: author_names_set is unpickled here, but it is built and stored by the last cell of
# this notebook, so that cell must have been run at least once before this one
domain_doc_ners_df = nu.load_data_frames(domain_doc_ners_df='domain_doc_ners_df')['domain_doc_ners_df']
author_names_set = nu.load_object('author_names_set')

# Each text column is paired with the type column that is overwritten when a name is found in it
text_to_type_columns = {'nlp_word': 'nlp_type', 'ent_phrase': 'ent_type'}

# Drop blank names: an empty alternative would match every row and tag everything as PERSON
author_names_list = sorted(author_name for author_name in author_names_set if author_name.strip())
if author_names_list:
    
    # Escape each name so it is matched literally, and fold all names into one alternation
    # so each column is scanned once instead of once per author
    authors_regex = re.compile('|'.join(re.escape(author_name) for author_name in author_names_list))
    for text_column, type_column in text_to_type_columns.items():
        mask_series = domain_doc_ners_df[text_column].map(lambda x: bool(authors_regex.search(str(x))))
        domain_doc_ners_df.loc[mask_series, type_column] = 'PERSON'
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

Lewy G ent_phrase ent_type
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv



## Let us remove the names of people

In [149]:

# Walk the ingested documents and keep every TXT file that contains an APA-style reference
dk_folder = '../data/Domain_Knowledge'

# A REFERENCES heading: the word on its own (not preceded by a letter) with nothing but
# non-letters after it up to the end of the line
references_regex = re.compile('(?<![a-zA-Z])REFERENCES[^a-zA-Z\r\n]*$', re.MULTILINE | re.IGNORECASE)

# An APA reference line: optional list numbering, a comma-separated run of authors
# (captured as one group), then a terminator such as a period, "eds.", a digit or "et al."
numbering_pattern = r'^(?:\d+\.\s+)?'
author_pattern = r'(?:de )?[A-Z][A-Za-z-]+(?:\s+[A-Z]{1,2}|,\s+(?:[A-Z]\.){1,2})(?:\s+JR|\s+3RD)?'
terminator_pattern = r'(?:\.|,? ?eds\.| \d|,? et al\.)'
apa_reference_regex = re.compile(
    numbering_pattern + '((?:' + author_pattern + r',\s*)*(?:' + author_pattern + '))' + terminator_pattern,
    re.MULTILINE
)

file_paths_list = []
for sub_directory, directories_list, files_list in walk(dk_folder):
    for file_name in files_list:
        
        # Only the converted TXT documents are examined
        if not file_name.endswith('.txt'): continue
        file_path = osp.join(sub_directory, file_name)
        with open(file_path, 'r', encoding=nu.encoding_type) as f: text = f.read()
        
        # Remember the file if any line in it looks like an APA reference
        if apa_reference_regex.search(text) is not None: file_paths_list.append(file_path)

In [150]:

# Create a Notepad++ session file to examine all REFERENCES lists together
# The session XML is assembled as prefix + file name + (infix + file name)... + suffix,
# where the prefix and infix both stop right after the opening quote of a filename attribute
session_xml_prefix = '''<?xml version="1.0" encoding="UTF-8" ?>
<NotepadPlus>
    <Session activeView="0">
        <mainView activeIndex="16">
            <File firstVisibleLine="0" xOffset="0" scrollWidth="2152" startPos="0" endPos="0" selMode="0" offset="0" wrapCount="1" lang="None'''
session_xml_prefix += ''' (Normal Text)" encoding="-1" userReadOnly="no" filename="'''
path_prefix_to = 'C:\\Users\\DaveBabbitt\\Documents\\GitHub\\itm-analysis-reporting\\data\\Domain_Knowledge\\'
path_prefix_from = '/mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/data/Domain_Knowledge/'
session_xml_infix = '''" backupFilePath="" tabColourId="-1" mapFirstVisibleDisplayLine="-1" mapFirstVisibleDocLine="-1" mapLastVisibleDocLine="-1" mapNbLine="-1"'''
session_xml_infix += ''' mapHigherPos="-1" mapWidth="-1" mapHeight="-1" mapKByteInDoc="512" mapWrapIndentMode="-1" mapIsWrap="no" />
            <File firstVisibleLine="0" xOffset="0" scrollWidth="2152" startPos="0" endPos="0" selMode="0" offset="0" wrapCount="1" lang="None (Normal Text)"'''
session_xml_infix += ''' encoding="-1" userReadOnly="no" filename="'''
session_xml_suffix = '''" backupFilePath="" tabColourId="-1" mapFirstVisibleDisplayLine="-1" mapFirstVisibleDocLine="-1" mapLastVisibleDocLine="-1" mapNbLine="-1"'''
session_xml_suffix += ''' mapHigherPos="-1" mapWidth="-1" mapHeight="-1" mapKByteInDoc="512" mapWrapIndentMode="-1" mapIsWrap="no" />
        </mainView>
        <subView activeIndex="0" />
    </Session>
</NotepadPlus>
'''
open_files_list = []
for file_path in file_paths_list:
    
    # Translate the WSL path into the Windows path that Notepad++ will open
    path_str = osp.abspath(file_path).replace(path_prefix_from, path_prefix_to).replace(sep, '\\')
    
    # The path lands inside an XML attribute value, so escape &, <, > and " to keep the
    # session file well-formed (an ampersand in a file name would otherwise break it)
    open_files_list.append(escape(path_str.strip(), {'"': '&quot;'}))
file_path = '../saves/notepad_sessions/notepad_session_of_references.xml'

# Create the output folder if needed so that open() does not fail on a fresh checkout
os.makedirs(osp.dirname(file_path), exist_ok=True)
with open(file_path, 'w', encoding=nu.encoding_type) as f:
    print(session_xml_prefix + session_xml_infix.join(open_files_list) + session_xml_suffix, file=f)

In [160]:

# Create a set of author's names from the REFERENCES sections
# A references section is taken to end at the first line that starts with one of these words
section_end_regex = re.compile(
    r'^(CHAPTER|APPENDIX|GLOSSARY|TABLE|Attachment|Required|Related|URL)',
    re.MULTILINE
)

# Compiled once here rather than on every reference string
initials_regex = re.compile(r',\s+((?:[A-Z]\.){1,2})')
whitespace_regex = re.compile(r'\s+')
comma_regex = re.compile(r',\s*')

author_names_set = set()
for file_path in file_paths_list:
    with open(file_path, 'r', encoding=nu.encoding_type) as f: text = f.read()
    
    # Splitting on the REFERENCES headings leaves the text before the first heading at
    # index 0; every later piece is the text that follows one heading
    for reference in references_regex.split(text)[1:]:
        
        # Cut the section off where the next chapter/appendix/etc. begins
        reference = section_end_regex.split(reference)[0]
        
        # findall returns the regex's single capture group: the comma-separated author run
        for authors_str in apa_reference_regex.findall(reference):
            
            # Turn "Surname, A.B." into "Surname AB" so the remaining commas only separate authors
            authors_str = initials_regex.sub(r' \1', authors_str).replace('.', '')
            authors_str = whitespace_regex.sub(' ', authors_str).strip()
            authors_str = comma_regex.sub(',', authors_str)
            author_names_set.update(authors_str.split(','))

In [173]:

# Merge the names harvested from the REFERENCES sections with two hard-coded batches of
# extra names, then pickle both the merged set and its sorted list
# NOTE(review): 'Thrive', 'Pessimism' and 'Aldine' do not look like "Surname Initials"
# author names - confirm they are intentional
extra_author_batches = (
    [
        'Fan D', 'Liu X', 'Lim N', 'Han B', 'Ring A', 'Koff G', 'Song Z', 'Tops M', 'Loo CM', 'Zika S', 'Sin NL', 'Jha AP', 'West J', 'Lim BR', 'Hoge C',
        'Kane R', 'Otte C', 'Thrive', 'Chi SH', 'Roy MJ', 'Rose R', 'Wang C', 'Rhea A', 'Fear NT', 'Haack M', 'Booth B', 'Ford JL', 'Bell DB', 'Spera C',
        'Koren D', 'Motl RW', 'Mohr DC', 'Dovey H', 'Cohen A', 'Cohen S', 'Taft CT', 'Tsai AC', 'Ryan DM', 'Dahn JR', 'Wade NG', 'Ramel J', 'Ware WB',
        'Geher K', 'Wong PT', 'Moore D', 'Cohen F', 'Zhang Y', 'Syme SL', 'Crow JR', 'Kruse K', 'Hoge CW', 'Thom NJ', 'Scott T', 'Bell RJ', 'Segal MW',
        'Draper P', 'Kubik MY', 'Clark JC', 'Parker S', 'Rippen N', 'Wills TA', 'Rentz ED', 'Jones DH', 'House JS', 'Gibbs DA', 'Chang TL', 'Wardle J',
        'Marmot M', 'Berman J', 'Uutela A', 'Adler JM', 'Sanna LJ', 'Knoops L', 'White RF', 'Clark MS', 'Adler AB', 'Farley D', 'Klein EM', 'Burton T',
        'Bowen GL', 'Goff BSN', 'Loomis D', 'Bryant C', 'Burke RJ', 'Erbes CR', 'Newman S', 'Lyons JA', 'Chang EC', 'Yang K-M', 'Brown BA', 'Heeren T',
        'Jacka FN', 'Engel CC', 'McGurk D', 'Britt TW', 'Levin JS', 'Martin D', 'Moodie S', 'Reker GT', 'Kiang PN', 'Simon CR', 'Fritz HL', 'Dritsa M',
        'Norman D', 'Eaton KM', 'Pasco JA', 'Dolan SL', 'Davis SR', 'Smith SE', 'Robbins C', 'Witters D', 'Kemeny ME', 'Orthner D', 'Deckman T', 'Arnold AL',
        'Oswald AJ', 'Steptoe A', 'Jaycox LH', 'Linley PA', 'Gaynes BN', 'Fennell M', 'DeWall CN', 'Reifman A', 'Asbury ET', 'Messer SC', 'Trainor S',
        'Pessimism', 'Ursano RJ', 'Panuzio J', 'Folkman S', 'Nelson JP', 'Bliese PD', 'Casteel C', 'Koenig HG', 'Stubbe JH', 'Schumm WR', 'Martin SL',
        'Hassmen P', 'Penedo FJ', 'Kinney JM', 'Amoroso P', 'Johnson P', 'Holden SL', 'Blakely T', 'Schmid CH', 'Castro CA', 'Uchino BN', 'Thoits PA',
        'Koivula N', 'Martin JA', 'Agazio JG', 'Soeken KL', 'Sutton GW', 'Berger SS', 'Carver CS', 'Bannuru R', 'Stites DP', 'Rostrup N', 'Fritts MJ',
        'Chandra A', 'Morton RK', 'LaChina M', 'Carter KN', 'Padden DL', 'Johnson J', 'Salami SO', 'Zegans LS', 'Deuster PA', 'Schaffer J', 'Francis JL',
        'Hamilton S', 'Kinicki AJ', 'Scheier MF', 'Ryckman RM', 'Kearney KA', 'Dishman RK', 'DeGraff AH', 'Cotting DI', 'Mancini JA', 'Compton JS',
        'Collings S', 'de Moor MH', 'Merrill JC', 'Stanley EA', 'Wanberg CR', 'Da Costa D', 'Connors RA', 'Johnson DC', 'Anderson S', 'Helgeson V',
        'Sweeney PJ', 'Ellison CG', 'Reisbig AM', 'Riviere LA', 'Lazarus RS', 'Bushman BJ', 'Patrick JH', 'Sherrod DR', 'Iversen AC', 'Proctor SP',
        'Harrell MC', 'Hufford DJ', 'Kennedy MC', 'Boomsma DI', 'Griffith J', 'Polusny MA', 'Renshaw KD', 'Ecklund CJ', 'McNally ST', 'Peacock EJ',
        'de Bruin E', 'Davison SL', 'Koffman RL', 'Metzner HL', 'Mykletun A', 'Kaufman JS', 'van Dam HA', 'de Geus EJ', 'Collins RC', 'Castro CAE',
        'Berkman LF', 'Galovski T', 'Orthner DK', 'Frankel BG', 'Burrell LM', 'O’Donnell K', 'Grimsley RN', 'Hamilton KR', 'Winefield A', 'Magruder KM',
        'Chatters LM', 'Dimiceli EE', 'Sonnentag S', 'Campbell JC', 'Gailliot MT', 'Pietrucha A', 'Delfabbro P', 'Johnston SL', 'McLeland KC',
        'Greenberg N', 'Grunberg NE', 'Morrison AB', 'Matthews KA', 'Pressman SD', 'Cacioppo JT', 'Golinelli D', 'Marshall AD', 'Kupelnick B',
        'Marshall SW', 'Manzanera R', 'Macdermid S', 'Birnbaum AS', 'Dickinson JM', 'Crebolder HF', 'McKee-Ryan F', 'Chatterjee A', 'Rodrigues CS',
        'Leardmann CA', 'Mansfield AJ', 'Pargament KI', 'Schoomaker E', 'Castaneda LW', 'Fiksenbaum L', 'Morrissey JP', 'McCarroll JE', 'D’Zurilla TJ',
        'Blackwell SE', 'Lopez-Zafra E', 'Steinhardt MA', 'Chamberlain K', 'Vasterling JJ', 'Durand-Bush N', 'Segerstrom SC', 'Hershfield HE',
        'Dainer-Best J', 'Mullington JM', 'Baldacchino D', 'Lucier-Greer M', 'Pulido-Martos M', 'Montero-Marin J', 'Piva Demarzo MM', 'Blanchflower DG',
        'Stewart-Brown S', 'Lara-Cinisomo S', 'Garcia-Campayo J', 'van den Borne BH', 'Alvarez de Mon M', 'Augusto-Landa JM', 'van der Horst FG',
        'Rius-Ottenheim N', 'Kiecolt-Glaser JK', 'Maydeu-Olivares A', 'Armistead-Jehle P', 'Schulte-van Maaren YW'
    ],
    [
        'Arend AC', 'Beck RJ', 'Smart JK', 'Franz DR', 'Sidell FR', 'Krepon M', 'Gordon MR', 'Trainor BE', 'Levie HS', 'Feilchenfeld EH', 'Smoke R',
        'Claude IL Jr', 'Bishop WW Jr', 'Singh N', 'Poulet DC', 'Phillipson C', 'Marsden ME', 'Burrelli DF', 'Segal MW', 'Treadwell ME', 'Redhead CS',
        'Brown D', 'Smith AM', 'Coser LA', 'Segal DR', 'Moskos CC', 'Lerner M', 'Stanley SC', 'Aldine', 'Cooper EC', 'Tarantola D', 'Mechanic D',
        'Pressman SD', 'Bryant C', 'Ford JL', 'McCarroll JE', 'Marshall AD', 'Loomis D', 'Marshall SW', 'Stites DP', 'Ecklund CJ'
    ],
)

# The union removes names that appear in both batches or were already harvested
author_names_set = set(author_names_set).union(*extra_author_batches)
authors_list = sorted(author_names_set)
nu.store_objects(author_names_set=author_names_set, authors_list=authors_list)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/author_names_set.pkl
Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/authors_list.pkl
