# OrgSci

In [531]:
class Section:
    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content
        
    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False
    
    def __repr__(self):
        return self.content
    
    def add_child(self, new_child):
        self.children.append(new_child)
        
    def has_child(self):
        return len(self.children) != 0
    
    def set_parent(self, new_parent):
        self.parent = new_parent
    
    def extend(self, content):
        self.content += f" {content}"
    
    def backtrack_add(self, content, size):
        curr = self
        
        while curr.size <= size:
            curr = curr.parent
        
        parent = curr
        cs = Section(content, size)
        parent.add_child(cs)
        cs.set_parent(parent)
                
        return cs
    def print_contents(self):
        if len(self.children) == 0:
            return self.content
        
        return self.content + "\n" + " \n\n ".join([child.print_contents() for child in self.children])
    
import pprint
import fitz
from fitz import Rect
import re
import pandas as pd



def get_sections(doc):

    main_section = Section("", 100)
    main_section   

    prev_size = 100
    curr_section = main_section
    nest = {}

    for page in doc:
        rect = Rect(page.rect.x0 + 40, page.rect.y0 + 60, page.rect.x1 - 40, page.rect.y1 - 40) 
        
        dict = page.get_text("dict", clip = rect)

        blocks = dict["blocks"]
        for block in blocks:
            if "lines" in block.keys():
                spans = block['lines']
                for span in spans:
                    data = span['spans']
                    for lines in data:
                        cur_size = round(lines['size'], 1)

                        # Manual Override for References
                        if lines['text'].strip() in ("Acknowledgements", "References", "Appendix", "Endnotes"):
                            cur = Section(lines['text'], cur_size)
                            curr_section = main_section.children[-1].children[-1]
                            curr_section.add_child(cur)
                            cur.set_parent(curr_section)
                            curr_section = cur

                            prev_size = round(lines['size'], 1)

                        elif cur_size > prev_size:
                            curr_section = curr_section.backtrack_add(lines['text'], cur_size)
                            prev_size = curr_section.size

                        elif cur_size == prev_size:
                            curr_section.extend(lines['text'])

                        else:  
                            cur = Section(lines['text'], cur_size)   
                            curr_section.add_child(cur)
                            cur.set_parent(curr_section)
                            curr_section = cur
                            prev_size = round(lines['size'], 1)

    # orgsci                            
    return main_section.children[-1].children[-1].children

def make_sections_dataframe(path):
    doc = fitz.open(path) # open a document
    
    # Get sections
    sections = get_sections(doc)
    sections = preprocess_sections(sections)
    
    content_nest = {}

    for section in sections:
        content_nest[section.content] = [section.print_contents()]

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc.name
    return sections, sections_df

def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    references_clean = text_preprocess_for_reference_matching(sections[-1].print_contents())

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            item 
            for group in 
                [  
                    [c.strip()                             # Remove whitespace
                     for c in citation[1:-1].split(',')    # Remove '(' and ')'
                     if any(char.isdigit() for char in c)] # Remove any that doesn't have digits (year)

                    for citation in in_text_citations
                ]
            for item in group 
        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references_dirty = " ".join(references_dirty.split())
    references = re.sub("([0-9]|html|\))\s?\.", r"\g<0>\n", references_dirty)

    # Make list of references
    pattern = r"[A-ZÆØÅæøå][ÆØÅæøåA-Za-z]+.*[A-Z]{1,3},? .*\(\d{4}\).*[html|\d|\)]\."
    references_clean = re.findall(pattern, references)
    return references_clean
    

def get_in_text_citations(text):
    IN_TEXT_CITATION_REGEX = r"\([\w\s.,]+\s\d{3,4}\s?\)"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if " and " in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split(" and ")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if f"({year})" in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def convert_pdf_to_dataframes(path) -> (pd.DataFrame, pd.DataFrame):
    """Returns (sections_df, references_df)"""
    sections, sections_df = make_sections_dataframe(path)
    references_df = make_references_dataframe(sections, sections_df)
    return sections_df, references_df

def preprocess_sections(sections):
    add_new_section = False
    
    # Preprocess sections
    abstract_section = Section("Abstract. ", 0)
    abstract_index = sections.index(abstract_section)
    abstract_text = sections.pop(abstract_index + 1)
    sections[abstract_index].add_child(abstract_text)
    
    sections_tmp = sections.copy()
    earliest_index = 1000
    txt = ""
    
    # Fit all text that belongs in paragraph into one "Introduction" paragraph
    for idx, section in enumerate(sections_tmp):
        if len(section.content) > 100:
            add_new_section = True
            if min(earliest_index, idx) != earliest_index:
                earliest_index = idx
            sections.remove(section)
            txt += section.content
            
    
    if add_new_section:
        new_section = Section("Introduction", 20)
        new_section.add_child(Section(txt, 15))
        sections[earliest_index] = new_section

    sections = sections[abstract_index:]
    return sections

import fitz
import os

ORGSCI_PATH = "../data/orgsci"
orgsci_pdfs = [f for f in os.listdir(ORGSCI_PATH) if f.endswith('pdf')]
for i in range(len(orgsci_pdfs)):
    path = os.path.join(ORGSCI_PATH, orgsci_pdfs[i])
    sections_df, references_df = convert_pdf_to_dataframes(path)
    abstract_text = sections_df.iloc[0].item()
    abstract, keywords = abstract_text.split("Keywords")
    cleaned_keywords = [keyword.strip() for keyword in keywords.split("•")]
    new_row = pd.DataFrame({"text":str(cleaned_keywords)}, index = ["Keywords"])
    sections_df = pd.concat([new_row, sections_df])


Unnamed: 0,text
Keywords,"[': organizational vocabularies', 'organizatio..."
Abstract.,Abstract.\n The mechanisms by which social net...
Introduction,Introduction\nOrganizational life is made up o...
Data and Methods,Data and Methods\nEmpirical Setting \n\n We ex...
Results,Results\nTable 1 displays descriptive statis...
Discussion,"Discussion\nBut words are things, and a small ..."
Endnotes,Endnotes\n1 \n\n We started the analysis from...
References,References\nAbbott A (1988) The System of Pro...


In [532]:
sections_df

Unnamed: 0,text
Abstract.,Abstract.\n The mechanisms by which social net...
Introduction,Introduction\nOrganizational life is made up o...
Data and Methods,Data and Methods\nEmpirical Setting \n\n We ex...
Results,Results\nTable 1 displays descriptive statis...
Discussion,"Discussion\nBut words are things, and a small ..."
Endnotes,Endnotes\n1 \n\n We started the analysis from...
References,References\nAbbott A (1988) The System of Pro...


# Annurev

In [505]:
class Section:
    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content
        
    def __eq__(self, other):
        if type(other) == str:
            return self.content.strip() == other
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False
    
    def __repr__(self):
        return self.content
    
    def add_child(self, new_child):
        self.children.append(new_child)
        
    def has_child(self):
        return len(self.children) != 0
    
    def set_parent(self, new_parent):
        self.parent = new_parent
    
    def extend(self, content):
        self.content += f" {content}"
    
    def backtrack_add(self, content, size):
        curr = self
        
        while curr.size <= size:
            curr = curr.parent
        
        parent = curr
        cs = Section(content, size)
        parent.add_child(cs)
        cs.set_parent(parent)
                
        return cs
    def print_contents(self):
        if len(self.children) == 0:
            return self.content
        
        return self.content + "\n" + " \n\n ".join([child.print_contents() for child in self.children])


def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    
    references_clean = text_preprocess_for_reference_matching(sections[-1].print_contents())

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            item 
            for group in 
                [  
                    [c.strip()                             # Remove whitespace
                     for c in citation[1:-1].split(',')    # Remove '(' and ')'
                     if any(char.isdigit() for char in c)] # Remove any that doesn't have digits (year)

                    for citation in in_text_citations
                ]
            for item in group 
        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references_dirty = " ".join(references_dirty.split())
    references = re.sub("([0-9]|html|\))\s?\.", r"\g<0>\n", references_dirty)

    # Make list of references
    pattern = r"[A-ZÆØÅæøå][ÆØÅæøåA-Za-z]+.*[A-Z]{1,3},? .*\(\d{4}\).*[html|\d|\)]\."
    references_clean = re.findall(pattern, references)
    return references_clean
    

def get_in_text_citations(text):
    IN_TEXT_CITATION_REGEX = r"\([\w\s.,]+\s\d{3,4}\s?\)"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if " and " in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split(" and ")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if f"({year})" in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def convert_pdf_to_dataframes(path) -> (pd.DataFrame, pd.DataFrame):
    """Returns (sections_df, references_df)"""
    sections, sections_df = make_sections_dataframe(path)
    references_df = make_references_dataframe(sections, sections_df)
    return sections_df, references_df 

In [506]:
import pprint
import fitz
from fitz import Rect
import re
import pandas as pd



def get_sections(doc):

    main_section = Section("", 100)

    prev_size = 100
    curr_section = main_section
    nest = {}

    for page in doc:
        if page.number not in (doc[-1].number, doc[-2].number):
            if page.number == 0:
                rect = Rect(page.rect.x0 + 20, page.rect.y0 + 250, page.rect.x1 - 20, page.rect.y1 - 20) 
            else:
                rect = Rect(page.rect.x0 + 20, page.rect.y0 + 20, page.rect.x1 - 20, page.rect.y1 - 20) 
            
            dict = page.get_text("blocks", clip = rect)

            blocks = dict["blocks"]
            for block in blocks:
                if "lines" in block.keys():
                    spans = block['lines']
                    for span in spans:
                        data = span['spans']
                        for lines in data:
                            cur_size = round(lines['size'], 1)

                            # Manual Override for References
                            if lines['text'].strip() in ("Acknowledgements", "References", "Appendix", "Endnotes"):
                                if lines['text'].strip() == "References":
                                    print_flag = True
                                cur = Section(lines['text'], cur_size)
                                curr_section = main_section.children[-1].children[-1]
                                curr_section.add_child(cur)
                                cur.set_parent(curr_section)
                                curr_section = cur

                                prev_size = round(lines['size'], 1)

                            elif cur_size > prev_size:
                                curr_section = curr_section.backtrack_add(lines['text'], cur_size)
                                prev_size = curr_section.size

                            elif cur_size == prev_size:
                                curr_section.extend(lines['text'])

                            else:  
                                cur = Section(lines['text'], cur_size)   
                                curr_section.add_child(cur)
                                cur.set_parent(curr_section)
                                curr_section = cur
                                prev_size = round(lines['size'], 1)

    return main_section

def preprocess_sections(sections):
    # Preprocess sections
    abstract_section = Section("Abstract", 0)

    abstract_index = sections.index(abstract_section)
    abstract_text = sections.pop(abstract_index + 1)
    sections[abstract_index].add_child(abstract_text)

    sections = sections[abstract_index:]
    
    return sections

def make_sections_dataframe(path):
    doc = fitz.open(path) # open a document
    
    # Get sections
    sections = get_sections(doc)
    sections = preprocess_sections(sections)
    
    content_nest = {}

    for section in sections:
        content_nest[section.content] = [section.print_contents()]

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc.name
    return sections, sections_df

def get_sections(doc):

    main_section = Section("", 100)

    prev_size = 100
    curr_section = main_section
    nest = {}

    for page in doc:
        if page.number not in (doc[-1].number, doc[-2].number):
            rect = Rect(page.rect.x0 + 20, page.rect.y0 + 20, page.rect.x1 - 20, page.rect.y1 - 30)
            
            dict = page.get_text("dict", clip = rect)
            blocks = dict["blocks"]
            for block in blocks:
                if "lines" in block.keys():
                    spans = block['lines']
                    for span in spans:
                        data = span['spans']
                        for lines in data:
                            cur_size = round(lines['size'], 2)

                            # Manual Override for References
                            if lines['text'].strip() in ("Abstract", "Keywords", "LITERATURE CITED"):

                                cur = Section(lines['text'], cur_size)
                                curr_section = main_section.children[-1].children[-1]
                                curr_section.add_child(cur)
                                cur.set_parent(curr_section)
                                curr_section = cur

                                prev_size = round(lines['size'], 2)

                            elif cur_size > prev_size:
                                curr_section = curr_section.backtrack_add(lines['text'], cur_size)
                                prev_size = curr_section.size

                            elif cur_size == prev_size:
                                curr_section.extend(lines['text'])

                            else:  
                                cur = Section(lines['text'], cur_size)   
                                curr_section.add_child(cur)
                                cur.set_parent(curr_section)
                                curr_section = cur
                                prev_size = round(lines['size'], 2)


    final_sections = main_section.children[-1].children
    
    if len(final_sections) > 1:
        return final_sections[-2].children + final_sections[-1].children
    else:
        return final_sections[-1].children

def preprocess_sections(sections):
    # Preprocess sections
    first_section = Section("Keywords", 0)
    first_index = sections.index(first_section)
    sections = sections[first_index:]
    
    return sections

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references = " ".join(references_dirty.split())
    pattern = "[A-Z][A-Za-z, ]+[A-Z]{1,3}\. \d{4}\."
    references_clean = re.findall(pattern, references)
    
    for idx, ref in enumerate(references_clean):
        if idx == len(references_clean) - 1:
            # All the way to the end
            references_clean[idx] = references[references.find(ref):]
        else:
            next_ref = references_clean[idx+1]
            references_clean[idx] = references[references.find(ref):references.find(next_ref)]

    return references_clean

def get_in_text_citations(text):
    IN_PARANTHESES_CITATION_REGEX = r"\([&\w\s., ]+\s\d{3,4}\)"
    AND_PATTERN = "\S+ & \S+ \(\d{3,4}\)"
    ONE_PATTERN = "[A-Z]\S+ \(\d{3,4}\)"
    ET_AL_PATTERN = "[A-Z][a-z] et al. \(\d{3,4}\)"
    IN_TEXT_CITATION_REGEX = f"{IN_PARANTHESES_CITATION_REGEX}|{AND_PATTERN}|{ONE_PATTERN}|{ET_AL_PATTERN}"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if "&" in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split("&")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if year in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    
    references_text = sections[sections.index("LITERATURE CITED")].print_contents()
    references_clean = text_preprocess_for_reference_matching(references_text)
    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [

            item 

            for group in 

            [
                re.sub(r"\(|\)|see also|’s", "", item).split(",")
                for item in in_text_citations
            ]

            for item in group

        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    return references_df

In [507]:
import os
import pprint
from IPython.display import display

annurev_path = "../data/annurev-orgpsych"
annurev_pdfs = [f for f in os.listdir(annurev_path) if f.endswith('pdf')]
for i in range(len(annurev_pdfs)):
    sections_df, references_df = convert_pdf_to_dataframes(os.path.join(annurev_path, annurev_pdfs[i]))
    display(sections_df)
    display(references_df)

Unnamed: 0,text
Keywords,"Keywords\nSHRM, HPWS"
Abstract,Abstract\nThis article provides an overview of...
INTRODUCTION,INTRODUCTION\nAs ﬁrms seek to compete using al...
HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT: WHERE WE HAVE BEEN,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
The Era of Conceptual Models,The Era of Conceptual Models\nFollowing these ...
The Era of Empirical Examination,"The Era of Empirical Examination\nHowever, for..."
The Era of Empirical Critiques,The Era of Empirical Critiques\nAs the volume ...
WHERE WE ARE TODAY,WHERE WE ARE TODAY\nSHRM research has grown bo...
Theoretical Foundations,Theoretical Foundations\nWright & McMahan (199...
Strategic Human Resource Management and Firm Performance,Strategic Human Resource Management and Firm P...


Unnamed: 0,reference,section
0,"Wright PM, McMahan GC. 1992. Theoretical persp...","INTRODUCTION,Theoretical Foundations,Fit and F..."
1,Kaufman BE. 2014. The historical development o...,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
2,Wiley Foulkes FK. 1975. The expanding role of ...,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
3,"Press Beer M, Spector B, Lawrence PR, Mills DQ...",HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
4,Dyer L. 1985. Strategic human resources manage...,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
...,...,...
65,"Huselid MA, Becker BE. 2011. Bridging macro an...",Increasing Integration with Strategy Researchers
66,"Wright PM, Coff R, Moliterno TP. 2014. Strateg...",Increasing Integration with Strategy Researchers
67,"Molloy JC, Ployhart RE, Wright PM. 2010. The m...",Increasing Integration with Strategy Researchers
68,"Nyberg AJ, Wright PM. 2015. 50 years of human ...",Increasing Integration with Strategy Researchers


Unnamed: 0,text
Keywords,"Keywords\ncollective turnover, turnover rates,..."
Abstract,Abstract\nThis review builds from the last maj...
INTRODUCTION,INTRODUCTION\nCollective turnover represents “...
DEFINITIONS AND CONSTRUCT ISSUES,DEFINITIONS AND CONSTRUCT ISSUES\nAuthors have...
THEORETICAL ADVANCEMENTS,THEORETICAL ADVANCEMENTS\nA key research objec...
CHOOSING A COLLECTIVE TURNOVER MEASURE,CHOOSING A COLLECTIVE TURNOVER MEASURE\nThe tr...
Context-Emergent Turnover Theory,Context-Emergent Turnover Theory\nBeginning wi...
Turnover Capacity,Turnover Capacity\nHausknecht & Holwerda’s (20...
RECENT EMPIRICAL CONTRIBUTIONS,RECENT EMPIRICAL CONTRIBUTIONS\nShortly after ...
Meta-Analyses,Meta-Analyses\nTwo of the four meta-analytic s...


Unnamed: 0,reference,section
0,"Hausknecht JP, Trevor CO. 2011. Collective tur...","Abstract,INTRODUCTION,THEORETICAL ADVANCEMENTS..."
1,"Heavey AL, Holwerda JA, Hausknecht JP. 2013. C...","INTRODUCTION,DEFINITIONS AND CONSTRUCT ISSUES,..."
2,"Hom PW, Lee TW, Shaw JD, Hausknecht JP. 2017. ...",INTRODUCTION
3,"March JG, Simon HA. 1958. Organizations . Camb...",INTRODUCTION
4,"Mitchell TW, Holtom BC, Lee TW, Sablynski CJ, ...",INTRODUCTION
5,"LITERATURE CITED Bartunek JM, Huang Z, Walsh I...",INTRODUCTION
6,"Felps W, Mitchell TR, Hekman DR, Lee TW, Holto...","INTRODUCTION,Measure Process (Research Questio..."
7,"Morgeson FP, Hofmann DA. 1999. The structure a...",DEFINITIONS AND CONSTRUCT ISSUES
8,"Nyberg AJ, Ployhart RE. 2013. Context-Emergent...","DEFINITIONS AND CONSTRUCT ISSUES,THEORETICAL A..."
9,"Hausknecht JP, Holwerda JA. 2013. When does em...","DEFINITIONS AND CONSTRUCT ISSUES,THEORETICAL A..."


Unnamed: 0,text
Keywords,"Keywords\nperformance ratings, performance man..."
Abstract,Abstract\nThis article reviews the history of ...
INTRODUCTION,INTRODUCTION\nNo other talent management syste...
PERFORMANCE EVALUATION,PERFORMANCE EVALUATION\nThe early history of P...
Performance Evaluation Challenges,Performance Evaluation Challenges\nUnderlying ...
Summary and Next Steps for Performance Evaluation Research and Practice,Summary and Next Steps for Performance Evaluat...
SUMMARY: PERFORMANCE EVALUATION,SUMMARY: PERFORMANCE EVALUATION\nBelow is a su...
PERFORMANCE MANAGEMENT,"PERFORMANCE MANAGEMENT\nWith ﬂatter, leaner or..."
New Approaches to Performance Management,New Approaches to Performance Management\nThe ...
Streamlining the Formal Performance Management System,Streamlining the Formal Performance Management...


Unnamed: 0,reference,section
0,"Leary RS, Meyrowitz MM. 2012. Practice Guideli...",INTRODUCTION
1,"Routledge DeNisi AS, Murphy KR. 2017. Performa...",INTRODUCTION
2,CEB. 2012. Driving breakthrough performance in...,"INTRODUCTION,PERFORMANCE MANAGEMENT,Summary an..."
3,"Buckingham M, Goodall A. 2015. Reinventing per...","INTRODUCTION,New Approaches to Performance Man..."
4,"Culbert SA, Rout L. 2010. Get Rid of the Perfo...","INTRODUCTION,PERFORMANCE MANAGEMENT,Streamlini..."
...,...,...
98,"Heath D, Heath C. 2010. Switch: How to Change ...",Successful Implementation of Performance Manag...
99,Cohen D. 2005. The Heart of Change Field Guide...,Successful Implementation of Performance Manag...
100,Bock L. 2015. Work Rules! Insights from Google...,Successful Implementation of Performance Manag...
101,Pink DH. 2009. Drive: The Surprising Truth Abo...,Summary and Next Steps for Performance Managem...


Unnamed: 0,text
Keywords,"Keywords\norganizational citizenship behavior,..."
Abstract,"Abstract\nFor decades, the accepted view in or..."
INTRODUCTION,"INTRODUCTION\nBy the early 1970s, the broad co..."
THE FORESHADOWING OF ORGANIZATIONAL CITIZENSHIP BEHAVIOR IN EARLIER THEORIES OF ORGANIZATION,THE FORESHADOWING OF ORGANIZATIONAL CITIZENSHI...
ORGANIZATIONAL CITIZENSHIP BEHAVIOR: 1983–2005,ORGANIZATIONAL CITIZENSHIP BEHAVIOR: 1983–2005...
ORGANIZATIONAL CITIZENSHIP BEHAVIOR: 2005–2016 Culture and Organizational Citizenship Behavior,ORGANIZATIONAL CITIZENSHIP BEHAVIOR: 2005–2016...
Focused Research on Speciﬁc Forms of Organizational Citizenship Behavior,Focused Research on Speciﬁc Forms of Organizat...
Organizational Citizenship Behavior and Organizational Effectiveness,Organizational Citizenship Behavior and Organi...
Job Satisfaction Versus Personality,Job Satisfaction Versus Personality\nAs noted ...
The Justice Motif,The Justice Motif\nOnce the idea of job satisf...


Unnamed: 0,reference,section
0,"Cherrington DJ, Reitz HJ, Scott WE. 1971. Effe...",INTRODUCTION
1,"Lawler EL III, Porter LW. 1967. The effect of ...",INTRODUCTION
2,"Gannon M, Noon JP. 1971. Management’s critical...",INTRODUCTION
3,"Press Smith CA, Organ DW, Near JP. 1983. Organ...",INTRODUCTION
4,Barnard CI. 1938. The Functions of the Executi...,THE FORESHADOWING OF ORGANIZATIONAL CITIZENSHI...
5,"Roethlisberger FJ, Dickson WJ. 1939. Managemen...",THE FORESHADOWING OF ORGANIZATIONAL CITIZENSHI...
6,"Maynes TD, Podsakoff PM. 2014. An examination ...",THE FORESHADOWING OF ORGANIZATIONAL CITIZENSHI...
7,Krebs DJ. 1970. Altruism: an examination of th...,THE FORESHADOWING OF ORGANIZATIONAL CITIZENSHI...
8,"Cialdini RB, Kenrick DT. 1976. Altruism as hed...",THE FORESHADOWING OF ORGANIZATIONAL CITIZENSHI...
9,Cohen S. 1980. Aftereffects of stress on human...,THE FORESHADOWING OF ORGANIZATIONAL CITIZENSHI...


Unnamed: 0,text
Keywords,"Keywords\nsocial networks, structural holes, b..."
Abstract,Abstract\nThis review of social network analys...
INTRODUCTION,INTRODUCTION\nIt has been almost 100 years sin...
SOCIAL NETWORK FOUNDATIONS,SOCIAL NETWORK FOUNDATIONS\nA social network i...
OLD NEW DIRECTIONS,OLD NEW DIRECTIONS\nI would be remiss if I did...
STRUCTURAL HOLES AND BROKERAGE,STRUCTURAL HOLES AND BROKERAGE\nStructural hol...
Beyond the Triad,Beyond the Triad\nAlthough almost all the stru...
Alters as Brokers,"Alters as Brokers\nPerhaps more importantly, m..."
NATURE OF TIES,NATURE OF TIES\nI have thus far focused on str...
Strength of Ties,Strength of Ties\nI begin with strength of tie...


Unnamed: 0,reference,section
0,"Roethlisberger FJ, Dixon WJ. 1939. Management ...",INTRODUCTION
1,Moreno JL. 1934. Who Shall Survive?: A New App...,INTRODUCTION
2,"Borgatti SP, Everett MG, Freeman LC. 2002. UCI...",INTRODUCTION
3,Burt RS. 1992. Structural Holes: The Social St...,"INTRODUCTION,SOCIAL NETWORK FOUNDATIONS,OLD NE..."
4,Coleman JS. 1990. Foundations of Social Theory...,"INTRODUCTION,SOCIAL NETWORK FOUNDATIONS,STRUCT..."
...,...,...
109,"Methot JR, Lepine JA, Podsakoff NP, Christian ...",Multiplex Ties
110,"Routledge Casciaro T, Lobo MS. 2005. Competent...",Multiplex Ties
111,"Levin DZ, Walter J, Murnighan JK. 2011. Dorman...",Dormant Ties
112,Press Burt RS. 2002. Bridge decay. Soc. Netw. ...,Dormant Ties


Unnamed: 0,text
Keywords,"Keywords\nsocial networks, personality, cognit..."
Abstract,Abstract\nSocial networks involve ties (and th...
INTRODUCTION,INTRODUCTION\nSocial network research in organ...
TWO APPROACHES TO INTEGRATION,TWO APPROACHES TO INTEGRATION\nEmerging from t...
Structure Dominates,Structure Dominates\nMuch social network resea...
Bringing People Back In,Bringing People Back In\nSocial networks invol...
DEBATES,"DEBATES\nGiven its diverse origins, social net..."
Strength of Ties,Strength of Ties\nTie strength was brought to ...
Open versus Closed Networks,Open versus Closed Networks\nJust as researche...
LEVELS OF ANALYSIS,LEVELS OF ANALYSIS\nSocial network theory and ...


Unnamed: 0,reference,section
0,"Borgatti SP, Mehra A, Brass DJ, Labianca G. 20...",INTRODUCTION
1,Brands RA. 2013. Cognitive social structures i...,"INTRODUCTION,Bringing People Back In,CONCLUSION"
2,"Burt RS, Kilduff M, Tasselli S. 2013. Social n...","INTRODUCTION,Bringing People Back In,Individua..."
3,"Carter DR, DeChurch LA, Braun MT, Contractor N...","INTRODUCTION,PRACTICAL IMPLICATIONS"
4,"Fang R, Landis B, Zhang Z, Anderson MH, Shaw J...","INTRODUCTION,Structure Dominates,Bringing Peop..."
...,...,...
111,Smith JE. 2006. Social yet creative: the role ...,PRACTICAL IMPLICATIONS
112,"Fernandez RM, Sosa ML. 2005. Gendering the job...",PRACTICAL IMPLICATIONS
113,"Balkundi P, Kilduff M, Harrison DA. 2011. Cent...",PRACTICAL IMPLICATIONS
114,Landis B. 2016. Personality and social network...,CONCLUSION


# DEBUGGING

In [469]:
references_dictionary = {}
references_text = sections[sections.index("LITERATURE CITED")].print_contents()
references_clean = text_preprocess_for_reference_matching(references_text)
for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
    in_text_citations = get_in_text_citations(text.item())
    cleaned_in_text_citations = [
        
        item 
        
        for group in 
    
        [
            re.sub(r"\(|\)|see also|’s", "", item).split(",")
            for item in in_text_citations
        ]
        
        for item in group
        
    ]
    # print(cleaned_in_text_citations)
    author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
    # pprint.pprint(author_year_pairs)
    # print("===")
    references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)
    
references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
references_df

Unnamed: 0,reference,section
0,"Wright PM, McMahan GC. 1992. Theoretical persp...","INTRODUCTION,Theoretical Foundations,Fit and F..."
1,Kaufman BE. 2014. The historical development o...,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
2,Wiley Foulkes FK. 1975. The expanding role of ...,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
3,"Press Beer M, Spector B, Lawrence PR, Mills DQ...",HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
4,Dyer L. 1985. Strategic human resources manage...,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
5,"Miles RE, Snow CC. 1984. Designing strategic h...",The Era of Conceptual Models
6,"Baird L, Meshoulam I. 1988. Managing two ﬁts o...",The Era of Conceptual Models
7,Snell SA. 1992. Control theory in strategic hu...,The Era of Empirical Examination
8,"Snell SA, Dean JW. 1992. Integrated manufactur...","The Era of Empirical Examination,INTERNATIONAL..."
9,Huselid MA. 1995. The impact of human resource...,"The Era of Empirical Examination,Theoretical F..."
