# OrgSci

In [234]:
class Section:
    """Node of a tree of text sections.

    Attributes (all set in ``__init__``):
        content:  text held by this node.
        size:     size value the node was created with; callers in this file
                  pass the rounded span size reported by the PDF library.
        children: child ``Section`` objects, in insertion order.
        parent:   parent node, ``None`` until ``set_parent`` is called.
    """

    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content

    def __eq__(self, other):
        """Two sections are equal when their whitespace-stripped content matches."""
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False

    def __repr__(self):
        return self.content

    def add_child(self, new_child):
        """Append ``new_child`` to ``children``; the child's parent link is not touched."""
        self.children.append(new_child)

    def has_child(self):
        """Return True when this node has at least one child."""
        return bool(self.children)

    def set_parent(self, new_parent):
        self.parent = new_parent

    def extend(self, content):
        """Append ``content`` to this node's text, separated by one space."""
        self.content += f" {content}"

    def backtrack_add(self, content, size):
        """Create a section and attach it to the nearest ancestor with a larger size.

        Starting at this node, climb while the node's size is ``<= size``.
        The climb stops at the root (a node without parent) even when the
        root is not larger, so an oversized span is attached to the root
        instead of raising ``AttributeError`` on ``None.size``.

        Returns:
            The newly created ``Section``.
        """
        curr = self

        while curr.size <= size and curr.parent is not None:
            curr = curr.parent

        cs = Section(content, size)
        curr.add_child(cs)
        cs.set_parent(curr)

        return cs

    def print_contents(self):
        """Return this node's text followed by the text of all descendants.

        A leaf returns just its content; otherwise the children's output is
        joined with " \\n\\n " and placed after the content and a newline.
        """
        if not self.children:
            return self.content

        return self.content + "\n" + " \n\n ".join(child.print_contents() for child in self.children)
    
import pprint
import fitz
from fitz import Rect
import re
import pandas as pd



def get_sections(doc):
    """Walk every text span of ``doc`` and arrange the text into a ``Section`` tree.

    Each span's rounded size is compared with the size of the previously
    handled span:

    * larger  -> a new section is opened higher up the tree via
      ``Section.backtrack_add``;
    * equal   -> the text is appended to the current section;
    * smaller -> the text becomes a child of the current section.

    Spans whose stripped text is "Acknowledgements", "References", "Appendix"
    or "Endnotes" bypass the comparison and are attached under the last
    grandchild of the root.

    Returns:
        The children of the last grandchild of the root section.
    """
    root = Section("", 100)
    active = root
    last_size = 100
    forced_headings = ("Acknowledgements", "References", "Appendix", "Endnotes")

    for page in doc:
        # Read only the page area inset by 40 on the left/right/bottom and 60 on top
        clip_box = Rect(page.rect.x0 + 40, page.rect.y0 + 60, page.rect.x1 - 40, page.rect.y1 - 40)
        layout = page.get_text("dict", clip = clip_box)

        for block in layout["blocks"]:
            # Blocks without a "lines" entry carry no text spans
            if "lines" not in block.keys():
                continue

            for line in block['lines']:
                for span in line['spans']:
                    span_size = round(span['size'], 1)
                    span_text = span['text']

                    if span_text.strip() in forced_headings:
                        # Manual override: always hang these headings off the
                        # last grandchild of the root
                        anchor = root.children[-1].children[-1]
                        heading = Section(span_text, span_size)
                        anchor.add_child(heading)
                        heading.set_parent(anchor)
                        active = heading
                        last_size = span_size
                    elif span_size > last_size:
                        active = active.backtrack_add(span_text, span_size)
                        last_size = active.size
                    elif span_size == last_size:
                        active.extend(span_text)
                    else:
                        nested = Section(span_text, span_size)
                        active.add_child(nested)
                        nested.set_parent(active)
                        active = nested
                        last_size = span_size

    # orgsci
    return root.children[-1].children[-1].children

def make_sections_dataframe(path):
    """Parse the PDF at ``path`` into sections plus a one-column DataFrame.

    Args:
        path: location of the PDF, passed to ``fitz.open``.

    Returns:
        ``(sections, sections_df)`` where ``sections`` is the list produced by
        ``preprocess_sections`` and ``sections_df`` is indexed by each
        section's ``content`` with a single ``"text"`` column holding that
        section's ``print_contents()``. ``sections_df.name`` is set to the
        document's name.
    """
    # The context manager closes the document once the text has been pulled
    # out; the Section objects only hold plain strings and numbers.
    with fitz.open(path) as doc:
        sections = preprocess_sections(get_sections(doc))
        doc_name = doc.name

    content_nest = {section.content: [section.print_contents()] for section in sections}

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc_name
    return sections, sections_df

def make_references_dataframe(sections, sections_df):
    """Map every matched reference to the sections that cite it.

    The reference list is taken from the last entry of ``sections``. Every row
    of ``sections_df`` except the last one is scanned for in-text citations.

    Returns:
        DataFrame with a ``reference`` column and a ``section`` column holding
        the comma-joined section names in which the reference was cited.
    """
    reference_list = text_preprocess_for_reference_matching(sections[-1].print_contents())
    cited_in = {}

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        fragments = []
        for citation in get_in_text_citations(text.item()):
            # Drop the surrounding '(' and ')' and split the group on commas
            for piece in citation[1:-1].split(','):
                # Pieces without any digit cannot carry a year
                if any(char.isdigit() for char in piece):
                    fragments.append(piece.strip())

        author_year_pairs = [pair for pair in map(process_citations, fragments) if pair is not None]
        cited_in = find_citation_matches(author_year_pairs, reference_list, cited_in, location)

    joined = {reference: [",".join(locations)] for reference, locations in cited_in.items()}
    return pd.DataFrame(joined, index = ["section"]).T.reset_index(names = "reference")

def text_preprocess_for_reference_matching(references_text):
    """Split the text of a reference section into individual reference strings.

    Steps:
      1. collapse all whitespace (including newlines) to single spaces;
      2. insert a newline after every digit, "html" or ")" that is followed by
         an optional whitespace character and a period;
      3. collect every per-line match of the reference pattern.

    Returns:
        List of matched reference strings (empty when nothing matches).
    """
    # str.split() with no argument already treats newlines as whitespace
    references_dirty = " ".join(references_text.split())
    # Raw strings: "\)", "\s" and "\." are invalid escapes in a normal literal
    references = re.sub(r"([0-9]|html|\))\s?\.", r"\g<0>\n", references_dirty)

    # Make list of references
    # NOTE(review): "[html|\d|\)]" is a character class (any of h, t, m, l, |,
    # a digit or ")"), not an alternation; kept as-is to preserve matching.
    pattern = r"[A-ZÆØÅæøå][ÆØÅæøåA-Za-z]+.*[A-Z]{1,3},? .*\(\d{4}\).*[html|\d|\)]\."
    return re.findall(pattern, references)
    

def get_in_text_citations(text):
    """Return every parenthesised group in ``text`` that ends with a 3-4 digit number.

    Inside the parentheses only word characters, whitespace, "." and "," are
    accepted before the final number.
    """
    citation_matcher = re.compile(r"\([\w\s.,]+\s\d{3,4}\s?\)")
    return citation_matcher.findall(text)

def process_citations(citation:str) -> (list,str):
    """Split one in-text citation into ``(authors, year)``.

    Shapes recognised, checked in this order:
      1. two names joined by " and "   -> ``((first, second), year)``
      2. "<name> et al. <year>"        -> ``([name], year)``
      3. "<name ...> <year>"           -> ``([name], year)``

    Returns:
        The ``(authors, year)`` pair, or ``None`` when the text cannot be
        split into a name part and a year part (blank input, a single token,
        or an " and " form with no second name). Callers in this file drop
        ``None`` results.
    """
    # case 1: 2 authors
    if " and " in citation:
        tokens = citation.split()
        year = tokens[-1]
        names_split = " ".join(tokens[:-1]).split(" and ")
        if len(names_split) < 2:
            # e.g. "Smith and 2001": the year swallowed the second name slot
            return None
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())

    # case 3: 1 author
    tokens = citation.split()
    if len(tokens) < 2:
        # Blank text or a lone token cannot hold both a name and a year
        return None
    author = " ".join(tokens[:-1])
    return ([author.strip()], tokens[-1].strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    """Record ``location`` under every reference matched by a citation.

    A reference matches a citation when it contains "(<year>)" and every
    author string of the citation. ``data`` maps reference text to the list
    of locations citing it; each location is stored at most once.

    Returns:
        The updated ``data`` dictionary.
    """
    for authors, year in author_year_pairs:
        year_tag = f"({year})"
        for reference in full_references:
            if year_tag not in reference:
                continue
            if not all(author in reference for author in authors):
                continue

            recorded = data.get(reference, [])
            if location not in recorded:
                data[reference] = recorded + [location]
    return data

def convert_pdf_to_dataframes(path) -> (pd.DataFrame, pd.DataFrame):
    """Run the full pipeline on the PDF at ``path``.

    Returns:
        ``(sections_df, references_df)``
    """
    section_nodes, sections_table = make_sections_dataframe(path)
    return sections_table, make_references_dataframe(section_nodes, sections_table)

def preprocess_sections(sections):
    """Attach the abstract text to its heading and fold long entries into an "Introduction".

    ``sections`` is modified in place (pop / remove / item assignment) and a
    slice starting at the "Abstract. " entry is returned.

    Raises:
        ValueError: from ``list.index`` when no entry equals the
            "Abstract. " probe.
    """
    add_new_section = False
    
    # Locate the abstract heading. The probe's size (0) is only a placeholder;
    # per Section.__eq__ in this file, equality compares stripped content.
    abstract_section = Section("Abstract. ", 0)
    abstract_index = sections.index(abstract_section)
    # The entry right after the heading is moved underneath the heading.
    abstract_text = sections.pop(abstract_index + 1)
    sections[abstract_index].add_child(abstract_text)
    
    # Iterate over a snapshot because `sections` is mutated inside the loop.
    sections_tmp = sections.copy()
    earliest_index = 1000
    txt = ""
    
    # Entries whose content exceeds 100 characters are treated as body text:
    # they are removed from `sections` and their text is concatenated.
    for idx, section in enumerate(sections_tmp):
        if len(section.content) > 100:
            add_new_section = True
            # Keeps the smallest snapshot index seen so far
            if min(earliest_index, idx) != earliest_index:
                earliest_index = idx
            # list.remove drops the first entry that compares equal
            sections.remove(section)
            txt += section.content
            
    
    if add_new_section:
        new_section = Section("Introduction", 20)
        new_section.add_child(Section(txt, 15))
        # NOTE(review): this overwrites whatever now sits at `earliest_index`
        # (indices shifted after the removals) rather than inserting, and raises
        # IndexError if that index is past the end; confirm this is intended.
        sections[earliest_index] = new_section

    # NOTE(review): `abstract_index` was computed before the removals above;
    # verify it still points at the abstract if earlier entries were removed.
    sections = sections[abstract_index:]
    return sections

import fitz
import os

# Folder with the OrgSci PDFs, relative to the current working directory
ORGSCI_PATH = "../data/orgsci"
# Every directory entry whose name ends in "pdf"
orgsci_pdfs = [f for f in os.listdir(ORGSCI_PATH) if f.endswith('pdf')]
# NOTE(review): the last six listed files are skipped; the reason is not
# visible here. Each iteration overwrites sections_df / references_df, so only
# the result for the last processed file remains afterwards.
for i in range(len(orgsci_pdfs) - 6):
    path = os.path.join(ORGSCI_PATH, orgsci_pdfs[i])
    sections_df, references_df = convert_pdf_to_dataframes(path)


# Annurev

In [235]:
class Section:
    """Node of a tree of text sections.

    Attributes (all set in ``__init__``):
        content:  text held by this node.
        size:     size value the node was created with; callers in this file
                  pass the rounded span size reported by the PDF library.
        children: child ``Section`` objects, in insertion order.
        parent:   parent node, ``None`` until ``set_parent`` is called.
    """

    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content

    def __eq__(self, other):
        """Compare against a string or another section.

        A string is compared with this node's stripped content; another
        section is compared by stripped content on both sides. Anything else
        is unequal.
        """
        # isinstance (rather than an exact type check) also accepts str subclasses
        if isinstance(other, str):
            return self.content.strip() == other
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False

    def __repr__(self):
        return self.content

    def add_child(self, new_child):
        """Append ``new_child`` to ``children``; the child's parent link is not touched."""
        self.children.append(new_child)

    def has_child(self):
        """Return True when this node has at least one child."""
        return bool(self.children)

    def set_parent(self, new_parent):
        self.parent = new_parent

    def extend(self, content):
        """Append ``content`` to this node's text, separated by one space."""
        self.content += f" {content}"

    def backtrack_add(self, content, size):
        """Create a section and attach it to the nearest ancestor with a larger size.

        Starting at this node, climb while the node's size is ``<= size``.
        The climb stops at the root (a node without parent) even when the
        root is not larger, so an oversized span is attached to the root
        instead of raising ``AttributeError`` on ``None.size``.

        Returns:
            The newly created ``Section``.
        """
        curr = self

        while curr.size <= size and curr.parent is not None:
            curr = curr.parent

        cs = Section(content, size)
        curr.add_child(cs)
        cs.set_parent(curr)

        return cs

    def print_contents(self):
        """Return this node's text followed by the text of all descendants.

        A leaf returns just its content; otherwise the children's output is
        joined with " \\n\\n " and placed after the content and a newline.
        """
        if not self.children:
            return self.content

        return self.content + "\n" + " \n\n ".join(child.print_contents() for child in self.children)
    
import pprint
import fitz
from fitz import Rect
import re
import pandas as pd



def get_sections(doc):
    """Walk the text spans of ``doc`` and arrange the text into a ``Section`` tree.

    The last two pages of ``doc`` are skipped. On page 0 the clip rectangle
    starts 250 below the top edge, on other pages 20; the remaining edges are
    inset by 20.

    Each span's rounded size is compared with the previous span's size:
    larger opens a new section higher up (``Section.backtrack_add``), equal
    extends the current section, smaller becomes a child of the current
    section. Spans reading "Acknowledgements", "References", "Appendix" or
    "Endnotes" are attached under the last grandchild of the root.

    Returns:
        The root ``Section``.
        NOTE(review): ``preprocess_sections`` in this cell calls list methods
        on the value it receives; confirm which tree level should be returned.
    """
    main_section = Section("", 100)

    prev_size = 100
    curr_section = main_section

    for page in doc:
        if page.number in (doc[-1].number, doc[-2].number):
            continue

        top_inset = 250 if page.number == 0 else 20
        rect = Rect(page.rect.x0 + 20, page.rect.y0 + top_inset, page.rect.x1 - 20, page.rect.y1 - 20)

        # "dict" mode yields a mapping with a "blocks" list of line/span
        # dictionaries. The former "blocks" mode returns a plain list, so
        # indexing it with ["blocks"] raised TypeError.
        page_dict = page.get_text("dict", clip = rect)

        for block in page_dict["blocks"]:
            # Blocks without a "lines" entry carry no text spans
            if "lines" not in block:
                continue

            for line in block['lines']:
                for span in line['spans']:
                    cur_size = round(span['size'], 1)

                    # Manual Override for References
                    if span['text'].strip() in ("Acknowledgements", "References", "Appendix", "Endnotes"):
                        cur = Section(span['text'], cur_size)
                        curr_section = main_section.children[-1].children[-1]
                        curr_section.add_child(cur)
                        cur.set_parent(curr_section)
                        curr_section = cur
                        prev_size = cur_size

                    elif cur_size > prev_size:
                        curr_section = curr_section.backtrack_add(span['text'], cur_size)
                        prev_size = curr_section.size

                    elif cur_size == prev_size:
                        curr_section.extend(span['text'])

                    else:
                        cur = Section(span['text'], cur_size)
                        curr_section.add_child(cur)
                        cur.set_parent(curr_section)
                        curr_section = cur
                        prev_size = cur_size

    return main_section

def preprocess_sections(sections):
    """Nest the abstract text under its heading and drop everything before it.

    The entry following the "Abstract" entry is popped from ``sections`` (in
    place) and added as a child of the "Abstract" entry.

    Returns:
        The slice of ``sections`` starting at the "Abstract" entry.
    """
    start = sections.index(Section("Abstract", 0))
    abstract_body = sections.pop(start + 1)
    sections[start].add_child(abstract_body)
    return sections[start:]

def make_sections_dataframe(path):
    """Parse the PDF at ``path`` into sections plus a one-column DataFrame.

    Args:
        path: location of the PDF, passed to ``fitz.open``.

    Returns:
        ``(sections, sections_df)`` where ``sections`` is the value produced
        by ``preprocess_sections`` and ``sections_df`` is indexed by each
        section's ``content`` with a single ``"text"`` column holding that
        section's ``print_contents()``. ``sections_df.name`` is set to the
        document's name.
    """
    # The context manager closes the document once the text has been pulled
    # out; the Section objects only hold plain strings and numbers.
    with fitz.open(path) as doc:
        sections = preprocess_sections(get_sections(doc))
        doc_name = doc.name

    content_nest = {section.content: [section.print_contents()] for section in sections}

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc_name
    return sections, sections_df

def make_references_dataframe(sections, sections_df):
    """Map every matched reference to the sections that cite it.

    The reference list is taken from the last entry of ``sections``. Every row
    of ``sections_df`` except the last one is scanned for in-text citations.

    Returns:
        DataFrame with a ``reference`` column and a ``section`` column holding
        the comma-joined section names in which the reference was cited.
    """
    reference_list = text_preprocess_for_reference_matching(sections[-1].print_contents())
    cited_in = {}

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        fragments = []
        for citation in get_in_text_citations(text.item()):
            # Drop the surrounding '(' and ')' and split the group on commas
            for piece in citation[1:-1].split(','):
                # Pieces without any digit cannot carry a year
                if any(char.isdigit() for char in piece):
                    fragments.append(piece.strip())

        author_year_pairs = [pair for pair in map(process_citations, fragments) if pair is not None]
        cited_in = find_citation_matches(author_year_pairs, reference_list, cited_in, location)

    joined = {reference: [",".join(locations)] for reference, locations in cited_in.items()}
    return pd.DataFrame(joined, index = ["section"]).T.reset_index(names = "reference")

def text_preprocess_for_reference_matching(references_text):
    """Split the text of a reference section into individual reference strings.

    Steps:
      1. collapse all whitespace (including newlines) to single spaces;
      2. insert a newline after every digit, "html" or ")" that is followed by
         an optional whitespace character and a period;
      3. collect every per-line match of the reference pattern.

    Returns:
        List of matched reference strings (empty when nothing matches).
    """
    # str.split() with no argument already treats newlines as whitespace
    references_dirty = " ".join(references_text.split())
    # Raw strings: "\)", "\s" and "\." are invalid escapes in a normal literal
    references = re.sub(r"([0-9]|html|\))\s?\.", r"\g<0>\n", references_dirty)

    # Make list of references
    # NOTE(review): "[html|\d|\)]" is a character class (any of h, t, m, l, |,
    # a digit or ")"), not an alternation; kept as-is to preserve matching.
    pattern = r"[A-ZÆØÅæøå][ÆØÅæøåA-Za-z]+.*[A-Z]{1,3},? .*\(\d{4}\).*[html|\d|\)]\."
    return re.findall(pattern, references)
    

def get_in_text_citations(text):
    """Return every parenthesised group in ``text`` that ends with a 3-4 digit number.

    Inside the parentheses only word characters, whitespace, "." and "," are
    accepted before the final number.
    """
    citation_matcher = re.compile(r"\([\w\s.,]+\s\d{3,4}\s?\)")
    return citation_matcher.findall(text)

def process_citations(citation:str) -> (list,str):
    """Split one in-text citation into ``(authors, year)``.

    Shapes recognised, checked in this order:
      1. two names joined by " and "   -> ``((first, second), year)``
      2. "<name> et al. <year>"        -> ``([name], year)``
      3. "<name ...> <year>"           -> ``([name], year)``

    Returns:
        The ``(authors, year)`` pair, or ``None`` when the text cannot be
        split into a name part and a year part (blank input, a single token,
        or an " and " form with no second name). Callers in this file drop
        ``None`` results.
    """
    # case 1: 2 authors
    if " and " in citation:
        tokens = citation.split()
        year = tokens[-1]
        names_split = " ".join(tokens[:-1]).split(" and ")
        if len(names_split) < 2:
            # e.g. "Smith and 2001": the year swallowed the second name slot
            return None
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())

    # case 3: 1 author
    tokens = citation.split()
    if len(tokens) < 2:
        # Blank text or a lone token cannot hold both a name and a year
        return None
    author = " ".join(tokens[:-1])
    return ([author.strip()], tokens[-1].strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    """Record ``location`` under every reference matched by a citation.

    A reference matches a citation when it contains "(<year>)" and every
    author string of the citation. ``data`` maps reference text to the list
    of locations citing it; each location is stored at most once.

    Returns:
        The updated ``data`` dictionary.
    """
    for authors, year in author_year_pairs:
        year_tag = f"({year})"
        for reference in full_references:
            if year_tag not in reference:
                continue
            if not all(author in reference for author in authors):
                continue

            recorded = data.get(reference, [])
            if location not in recorded:
                data[reference] = recorded + [location]
    return data

def convert_pdf_to_dataframes(path) -> (pd.DataFrame, pd.DataFrame):
    """Run the full pipeline on the PDF at ``path``.

    Returns:
        ``(sections_df, references_df)``
    """
    section_nodes, sections_table = make_sections_dataframe(path)
    return sections_table, make_references_dataframe(section_nodes, sections_table)

In [482]:
def get_sections(doc):
    """Walk the text spans of ``doc`` and arrange the text into a ``Section`` tree.

    The last two pages of ``doc`` are skipped. Text is read from each page
    inset by 20 on the left/top/right and 30 at the bottom.

    Each span's size (rounded to two decimals) is compared with the previous
    span's size: larger opens a new section higher up
    (``Section.backtrack_add``), equal extends the current section, smaller
    becomes a child of the current section. Spans reading "Abstract",
    "Keywords" or "LITERATURE CITED" are attached under the last grandchild
    of the root.

    Returns:
        With more than one entry under the root's last child: the children of
        the last two entries, concatenated. Otherwise the children of the
        only entry.
    """
    root = Section("", 100)
    active = root
    last_size = 100
    forced_headings = ("Abstract", "Keywords", "LITERATURE CITED")

    for page in doc:
        if page.number in (doc[-1].number, doc[-2].number):
            continue

        clip_box = Rect(page.rect.x0 + 20, page.rect.y0 + 20, page.rect.x1 - 20, page.rect.y1 - 30)
        layout = page.get_text("dict", clip = clip_box)

        for block in layout["blocks"]:
            # Blocks without a "lines" entry carry no text spans
            if "lines" not in block.keys():
                continue

            for line in block['lines']:
                for span in line['spans']:
                    span_size = round(span['size'], 2)
                    span_text = span['text']

                    if span_text.strip() in forced_headings:
                        # Manual override: always hang these headings off the
                        # last grandchild of the root
                        anchor = root.children[-1].children[-1]
                        heading = Section(span_text, span_size)
                        anchor.add_child(heading)
                        heading.set_parent(anchor)
                        active = heading
                        last_size = span_size
                    elif span_size > last_size:
                        active = active.backtrack_add(span_text, span_size)
                        last_size = active.size
                    elif span_size == last_size:
                        active.extend(span_text)
                    else:
                        nested = Section(span_text, span_size)
                        active.add_child(nested)
                        nested.set_parent(active)
                        active = nested
                        last_size = span_size

    top_level = root.children[-1].children

    if len(top_level) > 1:
        return top_level[-2].children + top_level[-1].children
    return top_level[-1].children

def preprocess_sections(sections):
    """Drop every entry that precedes the "Keywords" section.

    Returns:
        The slice of ``sections`` starting at the first entry equal to a
        "Keywords" section; ``list.index`` raises ``ValueError`` when there
        is none.
    """
    start = sections.index(Section("Keywords", 0))
    return sections[start:]

def text_preprocess_for_reference_matching(references_text):
    """Split the text of a reference section into individual reference strings.

    After collapsing all whitespace to single spaces, every
    "<Names> <INITIALS>. <yyyy>." heading found by the pattern marks the start
    of a reference. Each reference runs from its heading up to the next
    heading; the last one runs to the end of the text.

    Slices are cut at the match offsets themselves. Looking the heading text
    up again with ``str.find`` returns its first occurrence, which is wrong
    when the same heading text also appears inside an earlier reference
    (e.g. "Lee K. 1999." inside "Smith J, Lee K. 1999.").

    Returns:
        List of reference strings in document order (empty when no heading
        is found).
    """
    # str.split() with no argument already treats newlines as whitespace
    references = " ".join(references_text.split())
    # Raw string: "\." and "\d" are invalid escapes in a normal literal
    pattern = r"[A-Z][A-Za-z, ]+[A-Z]{1,3}\. \d{4}\."

    starts = [match.start() for match in re.finditer(pattern, references)]
    ends = starts[1:] + [len(references)]
    return [references[begin:end] for begin, end in zip(starts, ends)]

def get_in_text_citations(text):
    """Return every in-text citation found in ``text``.

    Four forms are recognised:
      * a parenthesised group ending in a 3-4 digit number, e.g. "(Kaufman 2014)";
      * "<A> & <B> (<year>)";
      * "<Name> (<year>)" where the name starts with a capital letter;
      * "<Name> et al. (<year>)" where the name starts with a capital letter.

    Returns:
        List of matched substrings in order of appearance.
    """
    # Raw strings: "\S", "\(" and "\d" are invalid escapes in normal literals
    in_parentheses = r"\([\w\s.,]+\s\d{3,4}\s?\)"
    two_authors = r"\S+ & \S+ \(\d{3,4}\)"
    single_author = r"[A-Z]\S+ \(\d{3,4}\)"
    # "[A-Z]\S+" accepts surnames of any length (the former "[A-Z][a-z]" only
    # matched two-letter names); the period after "al" is matched literally.
    et_al = r"[A-Z]\S+ et al\. \(\d{3,4}\)"

    combined = "|".join((in_parentheses, two_authors, single_author, et_al))
    return re.findall(combined, text)

def process_citations(citation:str) -> (list,str):
    """Split one in-text citation into ``(authors, year)``.

    Shapes recognised, checked in this order:
      1. two names joined by "&"       -> ``((first, second), year)``
      2. "<name> et al. <year>"        -> ``([name], year)``
      3. "<name ...> <year>"           -> ``([name], year)``

    Returns:
        The ``(authors, year)`` pair, or ``None`` when the text cannot be
        split into a name part and a year part (blank input, a single token,
        or an "&" that only occurs in the last token). Callers in this file
        drop ``None`` results.
    """
    # case 1: 2 authors
    if "&" in citation:
        tokens = citation.split()
        year = tokens[-1]
        names_split = " ".join(tokens[:-1]).split("&")
        if len(names_split) < 2:
            # The "&" sat in the last token, so there is no pair of names
            return None
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())

    # case 3: 1 author
    tokens = citation.split()
    if len(tokens) < 2:
        # Blank text or a lone token cannot hold both a name and a year
        return None
    author = " ".join(tokens[:-1])
    return ([author.strip()], tokens[-1].strip())
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    """Record ``location`` under every reference matched by a citation.

    A reference matches a citation when it contains the year string and every
    author string of the citation. ``data`` maps reference text to the list
    of locations citing it; each location is stored at most once.

    Returns:
        The updated ``data`` dictionary.
    """
    for authors, year in author_year_pairs:
        for reference in full_references:
            if year not in reference:
                continue
            if not all(author in reference for author in authors):
                continue

            recorded = data.get(reference, [])
            if location not in recorded:
                data[reference] = recorded + [location]
    return data

def make_references_dataframe(sections, sections_df):
    """Map every matched reference to the sections that cite it.

    The reference list is built from the entry of ``sections`` that equals
    "LITERATURE CITED". Every row of ``sections_df`` except the last one is
    scanned for in-text citations; the cleaned citation fragments of each row
    are pretty-printed as they are processed.

    Returns:
        DataFrame with a ``reference`` column and a ``section`` column holding
        the comma-joined section names in which the reference was cited.
    """
    cited_section = sections[sections.index("LITERATURE CITED")]
    reference_list = text_preprocess_for_reference_matching(cited_section.print_contents())
    cited_in = {}

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        fragments = []
        for raw_citation in get_in_text_citations(text.item()):
            # Strip parentheses, "see also" and possessive "’s", then split on commas
            fragments.extend(re.sub(r"\(|\)|see also|’s", "", raw_citation).split(","))

        pprint.pprint(fragments)
        author_year_pairs = [pair for pair in map(process_citations, fragments) if pair is not None]
        cited_in = find_citation_matches(author_year_pairs, reference_list, cited_in, location)

    joined = {reference: [",".join(locations)] for reference, locations in cited_in.items()}
    return pd.DataFrame(joined, index = ["section"]).T.reset_index(names = "reference")

In [483]:
import os
import pprint

# Folder with the Annual Review PDFs, relative to the current working directory
annurev_path = "../data/annurev-orgpsych"
# Every directory entry whose name ends in "pdf"
annurev_pdfs = [f for f in os.listdir(annurev_path) if f.endswith('pdf')]
# Each iteration overwrites sections / sections_df / references_df, so only the
# result for the last processed file remains afterwards.
for i in range(len(annurev_pdfs)):
    # NOTE(review): the document is never closed explicitly.
    doc = fitz.open(os.path.join(annurev_path, annurev_pdfs[i])) # open a document
    sections = get_sections(doc)
    sections = preprocess_sections(sections)
    
    # Section heading -> one-element list with the section's full text
    content_nest = {}

    for section in sections:
        content_nest[section.content] = [section.print_contents()]

    # One row per section, single "text" column
    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    references_df = make_references_dataframe(sections, sections_df)

[]
[]
['Wright & McMahan 1992', 'p. 298']
['Kaufman 2014',
 'Foulkes 1975',
 'Beer et al. 1984',
 ' Fombrun et al. 1984',
 'Dyer 1985']
['Miles & Snow 1984',
 'Baird & Meshoulam 1988',
 'Lengnick-Hall & Lengnick-Hall 1988']
['Snell 1992',
 'Snell & Dean 1992',
 'Huselid 1995',
 'MacDufﬁe 1995',
 'Delery & Doty 1996',
 'Aguinis et al. 2010',
 'Huselid 1995',
 'p. 518']
['Huselid & Becker 2000',
 'Huselid & Becker 2000',
 'p. 851',
 'Wall & Wood 2005',
 'p. 454']
[]
['Wright & McMahan 1992',
 'Wright et al. 1994',
 ' Huselid 1995',
 ' Jiang et al. 2012',
 ' Messersmith et al. 2011',
 'Barney & Wright 1998',
 'Chadwick & Dabu 2008',
 'Barney & Clark 2007',
 'Becker 1964',
 'Lepak & Snell 1999',
 'Becker 1964',
 'Chadwick & Dabu 2008',
 'Blau 1964',
 'Kehoe & Wright 2013']
['Huselid 1995',
 'Wright et al. 1998',
 'Becker & Gerhart 1996',
 'Arthur 1994',
 'Huselid 1995',
 'Appelbaum et al. 2001',
 'Su & Wright 2011']
['Ostroff & Bowen 2000']
['Nadler & Tushman 1980',
 'Wright & Snell 1998',

# DEBUGGING

In [469]:
# Inline copy of the body of make_references_dataframe for step-by-step
# inspection; it reads `sections` and `sections_df` left behind by an earlier cell.
references_dictionary = {}
# Relies on Section.__eq__ accepting a plain string
references_text = sections[sections.index("LITERATURE CITED")].print_contents()
references_clean = text_preprocess_for_reference_matching(references_text)
# The last row of sections_df is excluded from the scan
for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
    in_text_citations = get_in_text_citations(text.item())
    # Strip parentheses, "see also" and possessive "’s", split each citation
    # group on commas, and flatten the pieces into one list
    cleaned_in_text_citations = [
        
        item 
        
        for group in 
    
        [
            re.sub(r"\(|\)|see also|’s", "", item).split(",")
            for item in in_text_citations
        ]
        
        for item in group
        
    ]
    # print(cleaned_in_text_citations)
    # Fragments that process_citations cannot parse come back as None and are dropped
    author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
    # pprint.pprint(author_year_pairs)
    # print("===")
    references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)
    
# One row per matched reference; "section" holds the comma-joined citing sections
references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
references_df

Unnamed: 0,reference,section
0,"Wright PM, McMahan GC. 1992. Theoretical persp...","INTRODUCTION,Theoretical Foundations,Fit and F..."
1,Kaufman BE. 2014. The historical development o...,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
2,Wiley Foulkes FK. 1975. The expanding role of ...,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
3,"Press Beer M, Spector B, Lawrence PR, Mills DQ...",HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
4,Dyer L. 1985. Strategic human resources manage...,HISTORY OF STRATEGIC HUMAN RESOURCE MANAGEMENT...
5,"Miles RE, Snow CC. 1984. Designing strategic h...",The Era of Conceptual Models
6,"Baird L, Meshoulam I. 1988. Managing two ﬁts o...",The Era of Conceptual Models
7,Snell SA. 1992. Control theory in strategic hu...,The Era of Empirical Examination
8,"Snell SA, Dean JW. 1992. Integrated manufactur...","The Era of Empirical Examination,INTERNATIONAL..."
9,Huselid MA. 1995. The impact of human resource...,"The Era of Empirical Examination,Theoretical F..."
