# OrgSci

In [688]:
class Section:
    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content
        
    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False
    
    def __repr__(self):
        return self.content
    
    def add_child(self, new_child):
        self.children.append(new_child)
        
    def has_child(self):
        return len(self.children) != 0
    
    def set_parent(self, new_parent):
        self.parent = new_parent
    
    def extend(self, content):
        self.content += f" {content}"
    
    def backtrack_add(self, content, size):
        curr = self
        
        while curr.size <= size:
            curr = curr.parent
        
        parent = curr
        cs = Section(content, size)
        parent.add_child(cs)
        cs.set_parent(parent)
                
        return cs
    def print_contents(self):
        if len(self.children) == 0:
            return self.content
        
        return self.content + "\n" + " \n\n ".join([child.print_contents() for child in self.children])


def get_sections(doc):

    main_section = Section("", 100)
    main_section   

    prev_size = 100
    curr_section = main_section
    nest = {}

    for page in doc:
        rect = Rect(page.rect.x0 + 40, page.rect.y0 + 60, page.rect.x1 - 40, page.rect.y1 - 40) 
        
        dict = page.get_text("dict", clip = rect)

        blocks = dict["blocks"]
        for block in blocks:
            if "lines" in block.keys():
                spans = block['lines']
                for span in spans:
                    data = span['spans']
                    for lines in data:
                        cur_size = round(lines['size'], 1)

                        # Manual Override for References
                        if lines['text'].strip() in ("Acknowledgements", "References", "Appendix", "Endnotes"):
                            cur = Section(lines['text'], cur_size)
                            curr_section = main_section.children[-1].children[-1]
                            curr_section.add_child(cur)
                            cur.set_parent(curr_section)
                            curr_section = cur

                            prev_size = round(lines['size'], 1)

                        elif cur_size > prev_size:
                            curr_section = curr_section.backtrack_add(lines['text'], cur_size)
                            prev_size = curr_section.size

                        elif cur_size == prev_size:
                            curr_section.extend(lines['text'])

                        else:  
                            cur = Section(lines['text'], cur_size)   
                            curr_section.add_child(cur)
                            cur.set_parent(curr_section)
                            curr_section = cur
                            prev_size = round(lines['size'], 1)

    # orgsci                            
    return main_section.children[-1].children[-1].children

def make_sections_dataframe(path):
    doc = fitz.open(path) # open a document
    
    # Get sections
    sections = get_sections(doc)
    sections = preprocess_sections(sections)
    
    content_nest = {}

    for section in sections:
        content_nest[section.content] = [section.print_contents()]

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc.name
    return sections, sections_df

def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    references_clean = text_preprocess_for_reference_matching(sections[-1].print_contents())

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            item 
            for group in 
                [  
                    [c.strip()                             # Remove whitespace
                     for c in citation[1:-1].split(',')    # Remove '(' and ')'
                     if any(char.isdigit() for char in c)] # Remove any that doesn't have digits (year)

                    for citation in in_text_citations
                ]
            for item in group 
        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references_dirty = " ".join(references_dirty.split())
    references = re.sub("([0-9]|html|\))\s?\.", r"\g<0>\n", references_dirty)

    # Make list of references
    pattern = r"[A-ZÆØÅæøå][ÆØÅæøåA-Za-z]+.*[A-Z]{1,3},? .*\(\d{4}\).*[html|\d|\)]\."
    references_clean = re.findall(pattern, references)
    return references_clean
    

def get_in_text_citations(text):
    IN_TEXT_CITATION_REGEX = r"\([\w\s.,]+\s\d{3,4}\s?\)"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if " and " in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split(" and ")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if f"({year})" in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def convert_pdf_to_dataframes(path) -> (pd.DataFrame, pd.DataFrame):
    """Returns (sections_df, references_df)"""
    sections, sections_df = make_sections_dataframe(path)
    references_df = make_references_dataframe(sections, sections_df)
    return sections_df, references_df

def preprocess_sections(sections):
    add_new_section = False

    # Preprocess sections
    abstract_section = Section("Abstract. ", 0)
    abstract_index = sections.index(abstract_section)
    abstract_text = sections.pop(abstract_index + 1)
    sections[abstract_index].add_child(abstract_text)

    sections_tmp = sections.copy()
    earliest_index = 1000
    txt = ""
    # pprint.pprint(sections[abstract_index:])
    # Fit all text that belongs in paragraph into one "Introduction" paragraph
    for idx, section in enumerate(sections_tmp):
        if len(section.content) > 200:
            print(section)
            add_new_section = True
            if min(earliest_index, idx) != earliest_index:
                earliest_index = idx
            sections.remove(section)
            txt += section.content

    if add_new_section:
        new_section = Section("Introduction", 20)
        new_section.add_child(Section(txt, 15))
        sections[earliest_index] = new_section

    sections = sections[abstract_index:]

    return sections

import fitz
import os

ORGSCI_PATH = "../data/orgsci"
orgsci_pdfs = [f for f in os.listdir(ORGSCI_PATH) if f.endswith('pdf')]
for i in range(len(orgsci_pdfs)):
    path = os.path.join(ORGSCI_PATH, orgsci_pdfs[2])
    sections_df, references_df = convert_pdf_to_dataframes(path)
    abstract_text = sections_df.iloc[0].item()
    abstract, keywords = abstract_text.split("Keywords")
    cleaned_keywords = [keyword.strip() for keyword in keywords.split("•")]
    new_row = pd.DataFrame({"text":str(cleaned_keywords)}, index = ["Keywords"])
    sections_df = pd.concat([new_row, sections_df])

sections_df


Unnamed: 0,text
Keywords,"[': status', 'status-quality coupling', 'netwo..."
Abstract.,Abstract.\n Previous research has demonstrated...
Introduction,Introduction\nResearch on social networks has ...
Social Status and Network-Broadening,Social Status and Network-Broadening\nResearch...
A Critical Enabler: Belief in Status-Quality Coupling,A Critical Enabler: Belief in Status-Quality C...
Network-Broadening Behavior and Network Size,Network-Broadening Behavior and Network Size\n...
Research Overview,Research Overview\nWe tested our hypotheses ac...
Pilot Study: A Preliminary Demonstration of the Interaction Effect of Status X Coupling on Network Size in the GSS,Pilot Study: A Preliminary Demonstration of th...
Study 1: Status is Positively Associated with Network-Broadening,Study 1: Status is Positively Associated with ...
Study 2: Belief in Status-Quality Coupling Moderates the Relationship Between Status and Network-Broadening,Study 2: Belief in Status-Quality Coupling Mod...


# Annurev

In [505]:
class Section:
    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content
        
    def __eq__(self, other):
        if type(other) == str:
            return self.content.strip() == other
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False
    
    def __repr__(self):
        return self.content
    
    def add_child(self, new_child):
        self.children.append(new_child)
        
    def has_child(self):
        return len(self.children) != 0
    
    def set_parent(self, new_parent):
        self.parent = new_parent
    
    def extend(self, content):
        self.content += f" {content}"
    
    def backtrack_add(self, content, size):
        curr = self
        
        while curr.size <= size:
            curr = curr.parent
        
        parent = curr
        cs = Section(content, size)
        parent.add_child(cs)
        cs.set_parent(parent)
                
        return cs
    def print_contents(self):
        if len(self.children) == 0:
            return self.content
        
        return self.content + "\n" + " \n\n ".join([child.print_contents() for child in self.children])


def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    
    references_clean = text_preprocess_for_reference_matching(sections[-1].print_contents())

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            item 
            for group in 
                [  
                    [c.strip()                             # Remove whitespace
                     for c in citation[1:-1].split(',')    # Remove '(' and ')'
                     if any(char.isdigit() for char in c)] # Remove any that doesn't have digits (year)

                    for citation in in_text_citations
                ]
            for item in group 
        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references_dirty = " ".join(references_dirty.split())
    references = re.sub("([0-9]|html|\))\s?\.", r"\g<0>\n", references_dirty)

    # Make list of references
    pattern = r"[A-ZÆØÅæøå][ÆØÅæøåA-Za-z]+.*[A-Z]{1,3},? .*\(\d{4}\).*[html|\d|\)]\."
    references_clean = re.findall(pattern, references)
    return references_clean
    

def get_in_text_citations(text):
    IN_TEXT_CITATION_REGEX = r"\([\w\s.,]+\s\d{3,4}\s?\)"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if " and " in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split(" and ")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if f"({year})" in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def convert_pdf_to_dataframes(path) -> (pd.DataFrame, pd.DataFrame):
    """Returns (sections_df, references_df)"""
    sections, sections_df = make_sections_dataframe(path)
    references_df = make_references_dataframe(sections, sections_df)
    return sections_df, references_df 

In [None]:
import pprint
import fitz
from fitz import Rect
import re
import pandas as pd

def preprocess_sections(sections):
    # Preprocess sections
    abstract_section = Section("Abstract", 0)

    abstract_index = sections.index(abstract_section)
    abstract_text = sections.pop(abstract_index + 1)
    sections[abstract_index].add_child(abstract_text)

    sections = sections[abstract_index:]
    
    return sections

def make_sections_dataframe(path):
    doc = fitz.open(path) # open a document
    
    # Get sections
    sections = get_sections(doc)
    sections = preprocess_sections(sections)
    
    content_nest = {}

    for section in sections:
        content_nest[section.content] = [section.print_contents()]

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc.name
    return sections, sections_df

def get_sections(doc):

    main_section = Section("", 100)

    prev_size = 100
    curr_section = main_section
    nest = {}

    for page in doc:
        if page.number not in (doc[-1].number, doc[-2].number):
            rect = Rect(page.rect.x0 + 20, page.rect.y0 + 20, page.rect.x1 - 20, page.rect.y1 - 30)
            
            dict = page.get_text("dict", clip = rect)
            blocks = dict["blocks"]
            for block in blocks:
                if "lines" in block.keys():
                    spans = block['lines']
                    for span in spans:
                        data = span['spans']
                        for lines in data:
                            cur_size = round(lines['size'], 2)

                            # Manual Override for References
                            if lines['text'].strip() in ("Abstract", "Keywords", "LITERATURE CITED"):

                                cur = Section(lines['text'], cur_size)
                                curr_section = main_section.children[-1].children[-1]
                                curr_section.add_child(cur)
                                cur.set_parent(curr_section)
                                curr_section = cur

                                prev_size = round(lines['size'], 2)

                            elif cur_size > prev_size:
                                curr_section = curr_section.backtrack_add(lines['text'], cur_size)
                                prev_size = curr_section.size

                            elif cur_size == prev_size:
                                curr_section.extend(lines['text'])

                            else:  
                                cur = Section(lines['text'], cur_size)   
                                curr_section.add_child(cur)
                                cur.set_parent(curr_section)
                                curr_section = cur
                                prev_size = round(lines['size'], 2)


    final_sections = main_section.children[-1].children
    
    if len(final_sections) > 1:
        return final_sections[-2].children + final_sections[-1].children
    else:
        return final_sections[-1].children

def preprocess_sections(sections):
    # Preprocess sections
    first_section = Section("Keywords", 0)
    first_index = sections.index(first_section)
    sections = sections[first_index:]
    
    return sections

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references = " ".join(references_dirty.split())
    pattern = "[A-Z][A-Za-z, ]+[A-Z]{1,3}\. \d{4}\."
    references_clean = re.findall(pattern, references)
    
    for idx, ref in enumerate(references_clean):
        if idx == len(references_clean) - 1:
            # All the way to the end
            references_clean[idx] = references[references.find(ref):]
        else:
            next_ref = references_clean[idx+1]
            references_clean[idx] = references[references.find(ref):references.find(next_ref)]

    return references_clean

def get_in_text_citations(text):
    IN_PARANTHESES_CITATION_REGEX = r"\([&\w\s., ]+\s\d{3,4}\)"
    AND_PATTERN = "\S+ & \S+ \(\d{3,4}\)"
    ONE_PATTERN = "[A-Z]\S+ \(\d{3,4}\)"
    ET_AL_PATTERN = "[A-Z][a-z] et al. \(\d{3,4}\)"
    IN_TEXT_CITATION_REGEX = f"{IN_PARANTHESES_CITATION_REGEX}|{AND_PATTERN}|{ONE_PATTERN}|{ET_AL_PATTERN}"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if "&" in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split("&")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if year in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    
    references_text = sections[sections.index("LITERATURE CITED")].print_contents()
    references_clean = text_preprocess_for_reference_matching(references_text)
    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [

            item 

            for group in 

            [
                re.sub(r"\(|\)|see also|’s", "", item).split(",")
                for item in in_text_citations
            ]

            for item in group

        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    return references_df

In [None]:
# import os
# import pprint
# from IPython.display import display

# annurev_path = "../data/annurev-orgpsych"
# annurev_pdfs = [f for f in os.listdir(annurev_path) if f.endswith('pdf')]
# for i in range(len(annurev_pdfs)):
#     sections_df, references_df = convert_pdf_to_dataframes(os.path.join(annurev_path, annurev_pdfs[i]))

## DEBUGGING

In [538]:
references_dictionary = {}
references_text = sections[sections.index("LITERATURE CITED")].print_contents()
references_clean = text_preprocess_for_reference_matching(references_text)
for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
    in_text_citations = get_in_text_citations(text.item())
    cleaned_in_text_citations = [
        
        item 
        
        for group in 
    
        [
            re.sub(r"\(|\)|see also|’s", "", item).split(",")
            for item in in_text_citations
        ]
        
        for item in group
        
    ]
    # print(cleaned_in_text_citations)
    author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
    # pprint.pprint(author_year_pairs)
    # print("===")
    references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)
    
references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")


# AOM

In [12]:
AOM_HEADER_SIZE = 9.96

In [215]:
import pprint
import fitz
from fitz import Rect
import re
import pandas as pd

class Section:
    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content
        
    def __eq__(self, other):
        if type(other) == str:
            return self.content.strip() == other
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False
    
    def __repr__(self):
        return self.content
    
    def add_child(self, new_child):
        self.children.append(new_child)
        
    def has_child(self):
        return len(self.children) != 0
    
    def set_parent(self, new_parent):
        self.parent = new_parent
    
    def extend(self, content):
        self.content += f" {content}"
    
    def backtrack_add(self, content, size):
        curr = self
        
        while curr.size <= size:
            curr = curr.parent
        
        parent = curr
        cs = Section(content, size)
        parent.add_child(cs)
        cs.set_parent(parent)
                
        return cs
    def print_contents(self):
        if len(self.children) == 0:
            return self.content
        
        return self.content + "\n" + " \n\n ".join([child.print_contents() for child in self.children])
    
def preprocess_sections(sections):
    # Preprocess sections
    abstract_section = Section("Abstract", 0)

    abstract_index = sections.index(abstract_section)
    abstract_text = sections.pop(abstract_index + 1)
    sections[abstract_index].add_child(abstract_text)

    sections = sections[abstract_index:]
    
    return sections

def make_sections_dataframe(path):
    doc = fitz.open(path) # open a document
    
    # Get sections
    sections = get_sections(doc)
    sections = preprocess_sections(sections)
    
    content_nest = {}

    for section in sections:
        content_nest[section.content] = [section.print_contents()]

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc.name
    return sections, sections_df



def preprocess_sections(sections):
    # Preprocess sections
    first_section = Section("Keywords", 0)
    first_index = sections.index(first_section)
    sections = sections[first_index:]
    
    return sections


def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    
    references_clean = text_preprocess_for_reference_matching(sections[-1].print_contents())

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            item 
            for group in 
                [  
                    [c.strip()                             # Remove whitespace
                     for c in citation[1:-1].split(',')    # Remove '(' and ')'
                     if any(char.isdigit() for char in c)] # Remove any that doesn't have digits (year)

                    for citation in in_text_citations
                ]
            for item in group 
        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references = " ".join(references_dirty.split())
    pattern = "[A-Z][A-Za-z, ]+[A-Z]{1,3}\. \d{4}\."
    references_clean = re.findall(pattern, references)
    
    for idx, ref in enumerate(references_clean):
        if idx == len(references_clean) - 1:
            # All the way to the end
            references_clean[idx] = references[references.find(ref):]
        else:
            next_ref = references_clean[idx+1]
            references_clean[idx] = references[references.find(ref):references.find(next_ref)]

    return references_clean
    

def get_in_text_citations(text):
    IN_TEXT_CITATION_REGEX = r"\([\w\s.,]+\s\d{3,4}\s?\)"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if "&" in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split("&")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if f"({year})" in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data



In [286]:
def structure_doc_by_size_and_font(doc):
    first_page_fonts = {}
    rest_fonts = {}
    seqs = []
    prev_size, prev_font = 0,0
    for page in doc:      
        d = page.get_text("dict")
        blocks = d["blocks"]
        for block in blocks:
            if "lines" in block.keys():
                spans = block['lines']
                for span in spans:
                    data = span['spans']
                    for lines in data:                            

                        cur_size = round(lines['size'], 2)
                        cur_font = lines['font'].split("+")[0]

                        key = (cur_font, cur_size)
                        # print(lines['text'], key)

                        if cur_size == prev_size and cur_font == prev_font:
                            latest_item = rest_fonts[key][-1]
                            
                            if page.number == 0:
                                first_page_fonts[key][-1] = latest_item + " " + lines['text']  
                                
                            rest_fonts[key][-1] = latest_item + " " + lines['text']  
                            seqs[-1] = seqs[-1] + " " + lines['text']
                            
                        else:   
                            if page.number == 0:
                                first_page_fonts[key] = first_page_fonts.get(key, []) + [lines['text']]
                            rest_fonts[key] = rest_fonts.get(key, []) + [lines['text']]
                            seqs.append(lines['text'])

                        prev_size = cur_size
                        prev_font = cur_font
    
    sorted_first_page_fonts = dict(sorted(first_page_fonts.items(), key = lambda x: x[0][1], reverse = True))
    sorted_rest_fonts = dict(sorted(rest_fonts.items(), key = lambda x: x[0][1], reverse = True))
    return seqs, sorted_first_page_fonts, sorted_rest_fonts

def get_headers(fonts):
    """Returns list of text (headers) that has size equal to AOM-standard headers"""
    for key, val in fonts.items():
        font, size = key
        if size == AOM_HEADER_SIZE:
            return val[1:]
    return None


def get_abstract(first_page_fonts):
    first_page_fonts = dict(reversed(first_page_fonts.items()))
    dict_items = first_page_fonts.items()
    for idx, ((font, font_size), blocks) in enumerate(dict_items):
        # Get item right before authors
        if font_size == AOM_HEADER_SIZE:
            return {"Abstract": list(dict_items)[idx-1][1][0]}
            
    return first_page_fonts

    

def get_text_nest(seqs, starting_text_nest, pdf_headers):
    cur_header = "Intro"
    for sequence in seqs:
        if sequence in pdf_headers:
            starting_text_nest[sequence] = ""
            cur_header = sequence
        else:
            starting_text_nest[cur_header] = starting_text_nest.get(cur_header, "") + " " + sequence      
    return starting_text_nest

def get_sections(doc):
    seqs, first_page_fonts, rest_fonts = structure_doc_by_size_and_font(doc)
    starting_text_nest = get_abstract(first_page_fonts)
    pdf_headers = get_headers(rest_fonts)
    text_nest = get_text_nest(seqs, starting_text_nest, pdf_headers)
    return text_nest

def make_sections_dataframe(doc):
    text_nest = get_sections(doc)
    sections_df = pd.DataFrame(text_nest, index = ["text"]).T
    sections_df.name = doc.name
    return text_nest, sections_df

def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if year in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data
def process_citations(citation_group:str) -> (list,str):
    citations = citation_group.split(";")
    for citation in citations:
        try:
            # case 1: & 
            if "&" in citation:
                tokens = citation.split(",")
                year = tokens[-1]
                names = ",".join(tokens[:-1])
                names = names.replace("&", ",")
                names_split = names.split(",")
                return ([name.strip() for name in names_split if name.strip() not in ("", "e.g.")], year.strip())

            # case 2: et al
            if "et al." in citation:
                citation = citation.replace("et al.", "")
                tokens = citation.split(",")
                return ([token.strip() for token in tokens[:-1] if token.strip() != "" ], tokens[-1].strip())

            # case 3: 1 author
            else:
                if "(" in citation:
                    author, year = citation.split()
                    return ([author], year[1:-1])
                else:
                    citation_split = citation.split(",")
                    return ([citation_split[-2]], citation_split[-1])
        except:
            return ([""], "")
                
                

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references = " ".join(references_dirty.split())
    pattern = "[A-Z][a-z]+, [A-Z]*[A-Za-z,\-’&.ˇ ]*[A-Z]{1,3}\.\s\d{4}\."
    references_clean = re.findall(pattern, references)
    
    for idx, ref in enumerate(references_clean):
        if idx == len(references_clean) - 1:
            # All the way to the end
            references_clean[idx] = references[references.find(ref):]
        else:
            next_ref = references_clean[idx+1]
            references_clean[idx] = references[references.find(ref):references.find(next_ref)]

    return references_clean

def make_references_dataframe(text_nest, sections_df):
    references_dictionary = {}
    references_clean = text_preprocess_for_reference_matching(text_nest["REFERENCES"])
    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            citation 
            if citation[0] != "("
            else citation[1:-1]
            for citation in in_text_citations 

        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))        
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df
    

def get_in_text_citations(text):
    IN_PARANTHESES_CITATION_REGEX = r"\([&\w\s.,\-; ]+\s\d{3,4}\)"
    AND_PATTERN = "\S+ & \S+ \(\d{3,4}\)"
    ONE_PATTERN = "[A-Z]\S+ \(\d{3,4}\)"
    ET_AL_PATTERN = "[A-Z][a-z] et al. \(\d{3,4}\)"
    IN_TEXT_CITATION_REGEX = f"{IN_PARANTHESES_CITATION_REGEX}|{AND_PATTERN}|{ONE_PATTERN}|{ET_AL_PATTERN}"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def convert_pdf_to_dataframes(doc):
    """Returns (sections_df, references_df)"""
    sections, sections_df = make_sections_dataframe(doc)
    references_df = make_references_dataframe(sections, sections_df)
    return sections_df, references_df 

In [293]:
import os
import pprint
import fitz
from IPython.display import display

aom_path = "../data/aom"
aom_pdfs = [f for f in os.listdir(aom_path) if f.endswith('pdf')]

fonts = {}
path = os.path.join(aom_path, aom_pdfs[5])
doc = fitz.open(path) # open a document
print(doc)
display(*convert_pdf_to_dataframes(doc))

Document('../data/aom/Ray_2023_AOMReview_Emergence Theory the role of social capital.pdf')


Unnamed: 0,text
Abstract,The value of human capital resources (HCR) is ...
Intro,HUMAN CAPITAL RESOURCES EMERGENCE THEORY: THE...
THEORETICAL BACKGROUND Human Capital Resources,"HCR emanates from human capital (Becker, 1964..."
Emergence,Emergence is predicated on the idea that a wh...
Emergence-Enabling States,"Currently, literature relies on explaining HC..."
Social Processes in HCR Emergence,Social interactions are a critical definition...
HCR EMERGENCE THEORY,Our aim is to explain precisely how the struc...
Central Assumptions,We build HCR emergence theory using three sim...
FIGURE 1 Overview of HCR Emergence Theory,Individual Employee 3 Individual KSAOs Indivi...
Emergence Mechanisms,Social interactions are a central feature of ...


Unnamed: 0,reference,section
0,"Call, M. L., Nyberg, A. J., Ployhart, R. E., &...","Intro,Inputs and Outputs of the HCR Emergence ..."
1,"Ployhart, R. E., & Moliterno, T. P. 2011. Emer...","Intro,Emergence-Enabling States,Central Assump..."
2,"Brymer, R. A., & Hitt, M. A. 2019. Agonistic r...","Intro,Emergence,DISCUSSION"
3,"Cannella, A. A., Jr., & Sy, V. A. 2019. Human ...","Intro,THEORETICAL BACKGROUND Human Capital Res..."
4,"Eckardt, R., & Jiang, K. 2019. Human capital r...","Intro,THEORETICAL BACKGROUND Human Capital Res..."
...,...,...
117,"Gerhart, B., & Feng, J. 2021. The resource-bas...",Limitations and Future Research
118,"Oldroyd, J. B., & Morris, S. S. 2012. Catching...",Limitations and Future Research
119,"Kehoe, R. R., Lepak, D. P., & Bentley, F. S. 2...",Limitations and Future Research
120,"Kang, S. C., Oldroyd, J. B., Morris, S. S., & ...",Limitations and Future Research


# ASQ

In [202]:
def make_sections_dataframe(doc):
    text_nest = get_sections(doc)
    sections_df = pd.DataFrame(text_nest, index = ["text"]).T
    sections_df.name = doc.name
    return text_nest, sections_df

def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if year in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data
def process_citations(citation_group:str) -> (list,str):
    citations = citation_group.split(";")
    for citation in citations:
        print(citation)
        try:
            # case 1: & 
            if " and " in citation:
                tokens = citation.split(",")
                year = tokens[-1]
                names = ",".join(tokens[:-1])
                names = names.replace(" and ", ",")
                names_split = names.split(",")
                print(names_split)
                return ([name.strip() for name in names_split if name.strip() not in ("", "e.g.")], year.strip())

            # case 2: et al
            if "et al." in citation:
                citation = citation.replace("et al.", "")
                tokens = citation.split(",")
                return ([token.strip() for token in tokens[:-1] if token.strip() != "" ], tokens[-1].strip())

            # case 3: 1 author
            else:
                if "(" in citation:
                    author, year = citation.split()
                    return ([author], year[1:-1])
                else:
                    citation_split = citation.split(",")
                    return ([citation_split[-2]], citation_split[-1])
        except:
            return ([""], "")
                
                

def remove_prefix(citation):
    is_parantheses = False
    for idx, char in enumerate(citation):
        if char == "." and citation[idx - 1].islower() and not is_parantheses:
            return citation[idx+2:]
        if char == "(":
            is_parantheses = True
        if char == ")":
            is_parantheses = False
            
    return citation

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references = " ".join(references_dirty.split())
    pattern = "[A-Z][A-Za-z,\-’.ˇ() ]+ \d{4} "
    references_clean = list(map(remove_prefix, re.findall(pattern, references)))

    for idx, ref in enumerate(references_clean):
        if idx == len(references_clean) - 1:
            # All the way to the end
            references_clean[idx] = references[references.find(ref):]
        else:
            next_ref = references_clean[idx+1]
            references_clean[idx] = references[references.find(ref):references.find(next_ref)]

    return references_clean

def make_references_dataframe(text_nest, sections_df):
    references_dictionary = {}
    references_clean = text_preprocess_for_reference_matching(text_nest["REFERENCES"])
    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        # print(location, "\n")
        in_text_citations = get_in_text_citations(text.item())
        # print(in_text_citations)
        cleaned_in_text_citations = [
            citation 
            if citation[0] != "("
            else citation[1:-1]
            for citation in in_text_citations 

        ]
        author_year_pairs_nested = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))  
        author_year_pairs = [item for group in author_year_pairs_nested for item in group]
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)
        # print()

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    references_df
    

def get_in_text_citations(text):
    IN_PARANTHESES_CITATION_REGEX = r"\([&\w\s.,\-; ]+\s\d{3,4}\)"
    AND_PATTERN = "\S+ and \S+ \(\d{3,4}\)"
    ONE_PATTERN = "[A-Z]\S+ \(\d{3,4}\)"
    ET_AL_PATTERN = "[A-Z][a-z] et al. \(\d{3,4}\)"
    IN_TEXT_CITATION_REGEX = f"{IN_PARANTHESES_CITATION_REGEX}|{AND_PATTERN}|{ONE_PATTERN}|{ET_AL_PATTERN}"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def convert_pdf_to_dataframes(doc):
    """Returns (sections_df, references_df)"""
    sections, sections_df = make_sections_dataframe(doc)
    references_df = make_references_dataframe(sections, sections_df)
    return sections_df, references_df 

In [203]:
ABSTRACT_KEY = ('AdvPSA35F', 10.0)
HEADERS_KEY = ('AdvP2A83', 10.0)

In [199]:
def structure_doc_by_size_and_font(doc):
    # first_page_fonts = {}
    rest_fonts = {}
    seqs = []
    prev_size, prev_font = 0,0
    for page in doc:      
        d = page.get_text("dict")
        blocks = d["blocks"]
        for block in blocks:
            if "lines" in block.keys():
                spans = block['lines']
                for span in spans:
                    data = span['spans']
                    for lines in data:                            

                        cur_size = round(lines['size'], 2)
                        cur_font = lines['font'].split("+")[0]

                        key = (cur_font, cur_size)
                        # print(lines['text'], key)

                        if cur_size == prev_size and cur_font == prev_font:
                            latest_item = rest_fonts[key][-1]
                            
                            # if page.number == 0:
                                # first_page_fonts[key][-1] = latest_item + " " + lines['text']  
                                
                            rest_fonts[key][-1] = latest_item + " " + lines['text']  
                            seqs[-1] = seqs[-1] + " " + lines['text']
                            
                        else:   
                            # if page.number == 0:
                                # first_page_fonts[key] = first_page_fonts.get(key, []) + [lines['text']]
                            rest_fonts[key] = rest_fonts.get(key, []) + [lines['text']]
                            seqs.append(lines['text'])

                        prev_size = cur_size
                        prev_font = cur_font
    
    # sorted_first_page_fonts = dict(sorted(first_page_fonts.items(), key = lambda x: x[0][1], reverse = True))
    sorted_rest_fonts = sorted(rest_fonts.items(), key = lambda x: x[0][1], reverse = True)
    return seqs, sorted_rest_fonts

def get_headers(fonts):
    """Returns list of text (headers) that has size equal to AOM-standard headers"""
    first_part = []
    second_part = []
    for key, val in fonts:
        if key == ABSTRACT_KEY:
            first_part = val
        if key == HEADERS_KEY:
            second_part = val
    return first_part[:2] + second_part + first_part[2:]

def find_earliest_uppercase_index(s):
    for i, char in enumerate(s):
        if char.isalpha() and char.upper() == char:
            return i
    return len(s)

def get_text_nest(seqs, starting_text_nest, pdf_headers):
    cur_header = "Other"
    for sequence in seqs[1:]:
        if sequence in pdf_headers:
            starting_text_nest[sequence] = ""
            cur_header = sequence
        else:
            if cur_header.startswith("Keyword"):
                earliest_idx = find_earliest_uppercase_index(sequence)
                keyword_part = sequence[:earliest_idx]
                intro_part = sequence[earliest_idx:]
                starting_text_nest[cur_header] = starting_text_nest.get(cur_header, "") + " " + keyword_part   
                cur_header = "Introduction"
                starting_text_nest[cur_header] = starting_text_nest.get(cur_header, "") + " " + intro_part   
            else:
                starting_text_nest[cur_header] = starting_text_nest.get(cur_header, "") + " " + sequence   
            
    return starting_text_nest

def get_sections(doc):
    seqs, fonts = structure_doc_by_size_and_font(doc)
    pdf_headers = get_headers(fonts)
    text_nest = get_text_nest(seqs, {}, pdf_headers)
    return text_nest

def make_sections_dataframe(doc):
    text_nest = get_sections(doc)
    sections_df = pd.DataFrame(text_nest, index = ["text"]).T
    sections_df.name = doc.name
    return text_nest, sections_df

In [200]:
import os
import pprint
import fitz
import pandas as pd
import re
from IPython.display import display

aom_path = "../data/asq"
aom_pdfs = [f for f in os.listdir(aom_path) if f.endswith('pdf')]

fonts = {}
path = os.path.join(aom_path, aom_pdfs[1])
doc = fitz.open(path) # open a document
print(doc)

text_nest, sections_df = make_sections_dataframe(doc)


Document('../data/asq/Feldberg_2022_ASQ_The Task Bind_ Explaining Gender Differences in Managerial Tasks and Performance.pdf')


In [210]:
def process_citations(citation_group:str) -> (list,str):
    citations = citation_group.split(";")
    results = []
    for citation in citations:
        try:
            # case 1: & 
            if " and " in citation:
                tokens = citation.split(",")
                year = tokens[-1]
                names = ",".join(tokens[:-1])
                names = names.replace(" and ", ",")
                names_split = names.split(",")
                results.append(([name.strip() for name in names_split if name.strip() not in ("", "e.g.")], year.strip()))

            # case 2: et al
            if "et al." in citation:
                citation = citation.replace("et al.", "")
                tokens = citation.split(",")
                results.append(([token.strip() for token in tokens[:-1] if token.strip() != "" ], tokens[-1].strip()))

            # case 3: 1 author
            else:
                if "(" in citation:
                    author, year = citation.split()
                    results.append(([author], year[1:-1]))
                else:
                    citation_split = citation.split(",")
                    results.append(([citation_split[-2]], citation_split[-1]))
        except:
            results.append(([""], ""))
    
    return results
                

In [215]:
references_dictionary = {}
references_clean = text_preprocess_for_reference_matching(text_nest["REFERENCES"])
for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
    print(location, "\n")
    in_text_citations = get_in_text_citations(text.item())
    print(in_text_citations)
    cleaned_in_text_citations = [
        citation 
        if citation[0] != "("
        else citation[1:-1]
        for citation in in_text_citations 

    ]
    author_year_pairs_nested = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))  
    author_year_pairs = [item for group in author_year_pairs_nested for item in group]
    references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)
    print()

references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
references_df

Other 

[]

Abstract 

[]

Keywords: 

[]

Introduction 

['(Bailyn, 1987; Robinson and McIlwee, 1991; Kanter, 2008; Kuehn, 2012)', '(Singh et al., 2013; Dresden et al., 2018)', '(Berger, Cohen, and Zelditch, 1972; Bailyn, 1987; Reskin, 1993; Fletcher, 1999; Ridgeway, 2001; Cech, 2013)', '(Eagly, Makhijani, and Klonsky, 1992; Heilman, 2001; Ridgeway, 2001; Rudman and Glick, 2001; Eagly and Karau, 2002; Heilman and Eagly, 2008; Rudman et al., 2012)', '(Ellemers et al., 2004; Sheppard and Aquino, 2013)', '(Halpern, 1992; Nancarrow and Borthwick, 2005; Ashcraft, 2007)', '(Freeland and Hoey, 2018)', '(Reskin, 1993)', '(Shaw et al., 2018)', '(Freeland and Hoey, 2018)', '(Lively, 2001)', '(Truelove and Kellogg, 2016)', '(DiBenigno and Kellogg, 2014)', '(McMurray, 2011)']

CHALLENGES TO WOMEN IN MALE-DOMINATED OCCUPATIONS 

['(Reskin, 1993; Acker, 1998; Ashcraft, 2013)', '(Treiman and Hartmann, 1981; Reskin and Roos, 1990)', '(Ridgeway and Berger, 1986; Berger, Ridgeway, and Zelditch, 2002)',

Unnamed: 0,reference,section
0,"Bailyn, L. 1987 ‘‘Experiencing technical work:...","Introduction,CHALLENGES TO WOMEN IN MALE-DOMIN..."
1,"Singh, R., N. A. Fouad, M. E. Fitzpatrick, J. ...","Introduction,CHALLENGES TO WOMEN IN MALE-DOMIN..."
2,"Dresden, B. E., A. Y. Dresden, R. D. Ridge, an...","Introduction,CHALLENGES TO WOMEN IN MALE-DOMIN..."
3,"Eagly, A. H., M. G. Makhijani, and B. G. Klons...","Introduction,CHALLENGES TO WOMEN IN MALE-DOMIN..."
4,"Rudman, L. A., and P. Glick 2001 ‘‘Prescriptiv...","Introduction,CHALLENGES TO WOMEN IN MALE-DOMIN..."
...,...,...
118,"Wingfield, A. H., and R. S. Alston 2012 ‘‘The ...",CHALLENGES TO WOMEN IN MALE-DOMINATED OCCUPATI...
119,"Wrong, D. H. 1979 Power: Its Forms, Bases and ...",CHALLENGES TO WOMEN IN MALE-DOMINATED OCCUPATI...
120,"Yin, R. K. 2013 Case Study Research: Design an...",CHALLENGES TO WOMEN IN MALE-DOMINATED OCCUPATI...
121,"Zelek, B., and S. P. Phillips 2003 ‘‘Gender an...",CHALLENGES TO WOMEN IN MALE-DOMINATED OCCUPATI...


In [196]:
def make_references_dataframe(text_nest, sections_df):
    references_dictionary = {}
    references_clean = text_preprocess_for_reference_matching(text_nest["REFERENCES"])
    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            citation 
            if citation[0] != "("
            else citation[1:-1]
            for citation in in_text_citations 

        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))        
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df

In [177]:
make_references_dataframe(sections, sections_df)

Unnamed: 0,reference,section
