# OrgSci

In [688]:
class Section:
    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content
        
    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False
    
    def __repr__(self):
        return self.content
    
    def add_child(self, new_child):
        self.children.append(new_child)
        
    def has_child(self):
        return len(self.children) != 0
    
    def set_parent(self, new_parent):
        self.parent = new_parent
    
    def extend(self, content):
        self.content += f" {content}"
    
    def backtrack_add(self, content, size):
        curr = self
        
        while curr.size <= size:
            curr = curr.parent
        
        parent = curr
        cs = Section(content, size)
        parent.add_child(cs)
        cs.set_parent(parent)
                
        return cs
    def print_contents(self):
        if len(self.children) == 0:
            return self.content
        
        return self.content + "\n" + " \n\n ".join([child.print_contents() for child in self.children])
    
import pprint
import fitz
from fitz import Rect
import re
import pandas as pd



def get_sections(doc):

    main_section = Section("", 100)
    main_section   

    prev_size = 100
    curr_section = main_section
    nest = {}

    for page in doc:
        rect = Rect(page.rect.x0 + 40, page.rect.y0 + 60, page.rect.x1 - 40, page.rect.y1 - 40) 
        
        dict = page.get_text("dict", clip = rect)

        blocks = dict["blocks"]
        for block in blocks:
            if "lines" in block.keys():
                spans = block['lines']
                for span in spans:
                    data = span['spans']
                    for lines in data:
                        cur_size = round(lines['size'], 1)

                        # Manual Override for References
                        if lines['text'].strip() in ("Acknowledgements", "References", "Appendix", "Endnotes"):
                            cur = Section(lines['text'], cur_size)
                            curr_section = main_section.children[-1].children[-1]
                            curr_section.add_child(cur)
                            cur.set_parent(curr_section)
                            curr_section = cur

                            prev_size = round(lines['size'], 1)

                        elif cur_size > prev_size:
                            curr_section = curr_section.backtrack_add(lines['text'], cur_size)
                            prev_size = curr_section.size

                        elif cur_size == prev_size:
                            curr_section.extend(lines['text'])

                        else:  
                            cur = Section(lines['text'], cur_size)   
                            curr_section.add_child(cur)
                            cur.set_parent(curr_section)
                            curr_section = cur
                            prev_size = round(lines['size'], 1)

    # orgsci                            
    return main_section.children[-1].children[-1].children

def make_sections_dataframe(path):
    doc = fitz.open(path) # open a document
    
    # Get sections
    sections = get_sections(doc)
    sections = preprocess_sections(sections)
    
    content_nest = {}

    for section in sections:
        content_nest[section.content] = [section.print_contents()]

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc.name
    return sections, sections_df

def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    references_clean = text_preprocess_for_reference_matching(sections[-1].print_contents())

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            item 
            for group in 
                [  
                    [c.strip()                             # Remove whitespace
                     for c in citation[1:-1].split(',')    # Remove '(' and ')'
                     if any(char.isdigit() for char in c)] # Remove any that doesn't have digits (year)

                    for citation in in_text_citations
                ]
            for item in group 
        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references_dirty = " ".join(references_dirty.split())
    references = re.sub("([0-9]|html|\))\s?\.", r"\g<0>\n", references_dirty)

    # Make list of references
    pattern = r"[A-ZÆØÅæøå][ÆØÅæøåA-Za-z]+.*[A-Z]{1,3},? .*\(\d{4}\).*[html|\d|\)]\."
    references_clean = re.findall(pattern, references)
    return references_clean
    

def get_in_text_citations(text):
    IN_TEXT_CITATION_REGEX = r"\([\w\s.,]+\s\d{3,4}\s?\)"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if " and " in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split(" and ")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if f"({year})" in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def convert_pdf_to_dataframes(path) -> (pd.DataFrame, pd.DataFrame):
    """Returns (sections_df, references_df)"""
    sections, sections_df = make_sections_dataframe(path)
    references_df = make_references_dataframe(sections, sections_df)
    return sections_df, references_df

def preprocess_sections(sections):
    add_new_section = False

    # Preprocess sections
    abstract_section = Section("Abstract. ", 0)
    abstract_index = sections.index(abstract_section)
    abstract_text = sections.pop(abstract_index + 1)
    sections[abstract_index].add_child(abstract_text)

    sections_tmp = sections.copy()
    earliest_index = 1000
    txt = ""
    # pprint.pprint(sections[abstract_index:])
    # Fit all text that belongs in paragraph into one "Introduction" paragraph
    for idx, section in enumerate(sections_tmp):
        if len(section.content) > 200:
            print(section)
            add_new_section = True
            if min(earliest_index, idx) != earliest_index:
                earliest_index = idx
            sections.remove(section)
            txt += section.content

    if add_new_section:
        new_section = Section("Introduction", 20)
        new_section.add_child(Section(txt, 15))
        sections[earliest_index] = new_section

    sections = sections[abstract_index:]

    return sections

import fitz
import os

ORGSCI_PATH = "../data/orgsci"
orgsci_pdfs = [f for f in os.listdir(ORGSCI_PATH) if f.endswith('pdf')]
for i in range(len(orgsci_pdfs)):
    path = os.path.join(ORGSCI_PATH, orgsci_pdfs[2])
    sections_df, references_df = convert_pdf_to_dataframes(path)
    abstract_text = sections_df.iloc[0].item()
    abstract, keywords = abstract_text.split("Keywords")
    cleaned_keywords = [keyword.strip() for keyword in keywords.split("•")]
    new_row = pd.DataFrame({"text":str(cleaned_keywords)}, index = ["Keywords"])
    sections_df = pd.concat([new_row, sections_df])

sections_df


Unnamed: 0,text
Keywords,"[': status', 'status-quality coupling', 'netwo..."
Abstract.,Abstract.\n Previous research has demonstrated...
Introduction,Introduction\nResearch on social networks has ...
Social Status and Network-Broadening,Social Status and Network-Broadening\nResearch...
A Critical Enabler: Belief in Status-Quality Coupling,A Critical Enabler: Belief in Status-Quality C...
Network-Broadening Behavior and Network Size,Network-Broadening Behavior and Network Size\n...
Research Overview,Research Overview\nWe tested our hypotheses ac...
Pilot Study: A Preliminary Demonstration of the Interaction Effect of Status X Coupling on Network Size in the GSS,Pilot Study: A Preliminary Demonstration of th...
Study 1: Status is Positively Associated with Network-Broadening,Study 1: Status is Positively Associated with ...
Study 2: Belief in Status-Quality Coupling Moderates the Relationship Between Status and Network-Broadening,Study 2: Belief in Status-Quality Coupling Mod...


In [703]:
path = os.path.join(ORGSCI_PATH, orgsci_pdfs[2])
doc = fitz.open(path)
sections = get_sections(doc)
final = preprocess_sections(sections)
final[1].children

[Research on social networks has demonstrated time and time again that the size, breadth, and reach of people ’ s social networks are positively correlated with several bene ﬁ cial outcomes, including access to valuable opportunities, resources, and information (e.g., Burt 1992, Wasserman and Faust  1994 ). Con- sequently, an important ongoing research agenda for social, organizational, and network science in- volves identifying differences in the kinds of social networks people have and the origins of these dif- ferences (e.g., Hall and Wellman  1985 , Campbell et al. 1986 , McPherson et al.  2001 ). Among the many  ﬁ nd- ings from this research, one of the most common, and most robust, is that across a range of scales and contexts, high-status people tend to have larger, more expansive social networks than low-status people (e.g., Wasserman and Faust,  1994 , Brashears  2011 , Smith et al.  2012 ).,
   The motivating question for the present paper is, in a word, why? Existing researc

Social Status and Network-Broadening

# Annurev

In [505]:
class Section:
    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content
        
    def __eq__(self, other):
        if type(other) == str:
            return self.content.strip() == other
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False
    
    def __repr__(self):
        return self.content
    
    def add_child(self, new_child):
        self.children.append(new_child)
        
    def has_child(self):
        return len(self.children) != 0
    
    def set_parent(self, new_parent):
        self.parent = new_parent
    
    def extend(self, content):
        self.content += f" {content}"
    
    def backtrack_add(self, content, size):
        curr = self
        
        while curr.size <= size:
            curr = curr.parent
        
        parent = curr
        cs = Section(content, size)
        parent.add_child(cs)
        cs.set_parent(parent)
                
        return cs
    def print_contents(self):
        if len(self.children) == 0:
            return self.content
        
        return self.content + "\n" + " \n\n ".join([child.print_contents() for child in self.children])


def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    
    references_clean = text_preprocess_for_reference_matching(sections[-1].print_contents())

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            item 
            for group in 
                [  
                    [c.strip()                             # Remove whitespace
                     for c in citation[1:-1].split(',')    # Remove '(' and ')'
                     if any(char.isdigit() for char in c)] # Remove any that doesn't have digits (year)

                    for citation in in_text_citations
                ]
            for item in group 
        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references_dirty = " ".join(references_dirty.split())
    references = re.sub("([0-9]|html|\))\s?\.", r"\g<0>\n", references_dirty)

    # Make list of references
    pattern = r"[A-ZÆØÅæøå][ÆØÅæøåA-Za-z]+.*[A-Z]{1,3},? .*\(\d{4}\).*[html|\d|\)]\."
    references_clean = re.findall(pattern, references)
    return references_clean
    

def get_in_text_citations(text):
    IN_TEXT_CITATION_REGEX = r"\([\w\s.,]+\s\d{3,4}\s?\)"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if " and " in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split(" and ")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if f"({year})" in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def convert_pdf_to_dataframes(path) -> (pd.DataFrame, pd.DataFrame):
    """Returns (sections_df, references_df)"""
    sections, sections_df = make_sections_dataframe(path)
    references_df = make_references_dataframe(sections, sections_df)
    return sections_df, references_df 

In [540]:
import pprint
import fitz
from fitz import Rect
import re
import pandas as pd

def preprocess_sections(sections):
    # Preprocess sections
    abstract_section = Section("Abstract", 0)

    abstract_index = sections.index(abstract_section)
    abstract_text = sections.pop(abstract_index + 1)
    sections[abstract_index].add_child(abstract_text)

    sections = sections[abstract_index:]
    
    return sections

def make_sections_dataframe(path):
    doc = fitz.open(path) # open a document
    
    # Get sections
    sections = get_sections(doc)
    sections = preprocess_sections(sections)
    
    content_nest = {}

    for section in sections:
        content_nest[section.content] = [section.print_contents()]

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc.name
    return sections, sections_df

def get_sections(doc):

    main_section = Section("", 100)

    prev_size = 100
    curr_section = main_section
    nest = {}

    for page in doc:
        if page.number not in (doc[-1].number, doc[-2].number):
            rect = Rect(page.rect.x0 + 20, page.rect.y0 + 20, page.rect.x1 - 20, page.rect.y1 - 30)
            
            dict = page.get_text("dict", clip = rect)
            blocks = dict["blocks"]
            for block in blocks:
                if "lines" in block.keys():
                    spans = block['lines']
                    for span in spans:
                        data = span['spans']
                        for lines in data:
                            cur_size = round(lines['size'], 2)

                            # Manual Override for References
                            if lines['text'].strip() in ("Abstract", "Keywords", "LITERATURE CITED"):

                                cur = Section(lines['text'], cur_size)
                                curr_section = main_section.children[-1].children[-1]
                                curr_section.add_child(cur)
                                cur.set_parent(curr_section)
                                curr_section = cur

                                prev_size = round(lines['size'], 2)

                            elif cur_size > prev_size:
                                curr_section = curr_section.backtrack_add(lines['text'], cur_size)
                                prev_size = curr_section.size

                            elif cur_size == prev_size:
                                curr_section.extend(lines['text'])

                            else:  
                                cur = Section(lines['text'], cur_size)   
                                curr_section.add_child(cur)
                                cur.set_parent(curr_section)
                                curr_section = cur
                                prev_size = round(lines['size'], 2)


    final_sections = main_section.children[-1].children
    
    if len(final_sections) > 1:
        return final_sections[-2].children + final_sections[-1].children
    else:
        return final_sections[-1].children

def preprocess_sections(sections):
    # Preprocess sections
    first_section = Section("Keywords", 0)
    first_index = sections.index(first_section)
    sections = sections[first_index:]
    
    return sections

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references = " ".join(references_dirty.split())
    pattern = "[A-Z][A-Za-z, ]+[A-Z]{1,3}\. \d{4}\."
    references_clean = re.findall(pattern, references)
    
    for idx, ref in enumerate(references_clean):
        if idx == len(references_clean) - 1:
            # All the way to the end
            references_clean[idx] = references[references.find(ref):]
        else:
            next_ref = references_clean[idx+1]
            references_clean[idx] = references[references.find(ref):references.find(next_ref)]

    return references_clean

def get_in_text_citations(text):
    IN_PARANTHESES_CITATION_REGEX = r"\([&\w\s., ]+\s\d{3,4}\)"
    AND_PATTERN = "\S+ & \S+ \(\d{3,4}\)"
    ONE_PATTERN = "[A-Z]\S+ \(\d{3,4}\)"
    ET_AL_PATTERN = "[A-Z][a-z] et al. \(\d{3,4}\)"
    IN_TEXT_CITATION_REGEX = f"{IN_PARANTHESES_CITATION_REGEX}|{AND_PATTERN}|{ONE_PATTERN}|{ET_AL_PATTERN}"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if "&" in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split("&")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if year in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    
    references_text = sections[sections.index("LITERATURE CITED")].print_contents()
    references_clean = text_preprocess_for_reference_matching(references_text)
    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [

            item 

            for group in 

            [
                re.sub(r"\(|\)|see also|’s", "", item).split(",")
                for item in in_text_citations
            ]

            for item in group

        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    return references_df

In [542]:
# import os
# import pprint
# from IPython.display import display

# annurev_path = "../data/annurev-orgpsych"
# annurev_pdfs = [f for f in os.listdir(annurev_path) if f.endswith('pdf')]
# for i in range(len(annurev_pdfs)):
#     sections_df, references_df = convert_pdf_to_dataframes(os.path.join(annurev_path, annurev_pdfs[i]))

## DEBUGGING

In [538]:
references_dictionary = {}
references_text = sections[sections.index("LITERATURE CITED")].print_contents()
references_clean = text_preprocess_for_reference_matching(references_text)
for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
    in_text_citations = get_in_text_citations(text.item())
    cleaned_in_text_citations = [
        
        item 
        
        for group in 
    
        [
            re.sub(r"\(|\)|see also|’s", "", item).split(",")
            for item in in_text_citations
        ]
        
        for item in group
        
    ]
    # print(cleaned_in_text_citations)
    author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
    # pprint.pprint(author_year_pairs)
    # print("===")
    references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)
    
references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")


# ASQ

In [635]:
class Section:
    def __init__(self, content, size):
        self.children = []
        self.size = size
        self.parent = None
        self.content = content
        
    def __eq__(self, other):
        if type(other) == str:
            return self.content.strip() == other
        if isinstance(other, self.__class__):
            return self.content.strip() == other.content.strip()
        return False
    
    def __repr__(self):
        return self.content
    
    def add_child(self, new_child):
        self.children.append(new_child)
        
    def has_child(self):
        return len(self.children) != 0
    
    def set_parent(self, new_parent):
        self.parent = new_parent
    
    def extend(self, content):
        self.content += f" {content}"
    
    def backtrack_add(self, content, size):
        curr = self
        
        while curr.size <= size:
            curr = curr.parent
        
        parent = curr
        cs = Section(content, size)
        parent.add_child(cs)
        cs.set_parent(parent)
                
        return cs
    def print_contents(self):
        if len(self.children) == 0:
            return self.content
        
        return self.content + "\n" + " \n\n ".join([child.print_contents() for child in self.children])
    
def preprocess_sections(sections):
    # Preprocess sections
    abstract_section = Section("Abstract", 0)

    abstract_index = sections.index(abstract_section)
    abstract_text = sections.pop(abstract_index + 1)
    sections[abstract_index].add_child(abstract_text)

    sections = sections[abstract_index:]
    
    return sections

def make_sections_dataframe(path):
    doc = fitz.open(path) # open a document
    
    # Get sections
    sections = get_sections(doc)
    sections = preprocess_sections(sections)
    
    content_nest = {}

    for section in sections:
        content_nest[section.content] = [section.print_contents()]

    sections_df = pd.DataFrame(content_nest, index = ["text"]).T
    sections_df.name = doc.name
    return sections, sections_df



def preprocess_sections(sections):
    # Preprocess sections
    first_section = Section("Keywords", 0)
    first_index = sections.index(first_section)
    sections = sections[first_index:]
    
    return sections


def make_references_dataframe(sections, sections_df):
    references_dictionary = {}
    
    references_clean = text_preprocess_for_reference_matching(sections[-1].print_contents())

    for location, text in zip(sections_df.index[:-1], sections_df.values[:-1]):
        in_text_citations = get_in_text_citations(text.item())
        cleaned_in_text_citations = [
            item 
            for group in 
                [  
                    [c.strip()                             # Remove whitespace
                     for c in citation[1:-1].split(',')    # Remove '(' and ')'
                     if any(char.isdigit() for char in c)] # Remove any that doesn't have digits (year)

                    for citation in in_text_citations
                ]
            for item in group 
        ]
        author_year_pairs = list(filter(lambda x: x != None, map(process_citations, cleaned_in_text_citations)))
        references_dictionary = find_citation_matches(author_year_pairs, references_clean, references_dictionary, location)

    references_df = pd.DataFrame({k:[",".join(v)] for k,v in references_dictionary.items()}, index = ["section"]).T.reset_index(names = "reference")
    
    return references_df

def text_preprocess_for_reference_matching(references_text):
    # START searching ONCE References tag found
    references_dirty = re.sub("\n", " ", references_text)
    references_dirty = " ".join(references_dirty.split())
    references = re.sub("([0-9]|html|\))\s?\.", r"\g<0>\n", references_dirty)

    # Make list of references
    pattern = r"[A-ZÆØÅæøå][ÆØÅæøåA-Za-z]+.*[A-Z]{1,3},? .*\(\d{4}\).*[html|\d|\)]\."
    references_clean = re.findall(pattern, references)
    return references_clean
    

def get_in_text_citations(text):
    IN_TEXT_CITATION_REGEX = r"\([\w\s.,]+\s\d{3,4}\s?\)"
    return re.findall(IN_TEXT_CITATION_REGEX, text)

def process_citations(citation:str) -> (list,str):
    # case 1: 2 authors
    if " and " in citation:
        tokens = citation.split()
        year = tokens[-1]
        names = " ".join(tokens[:-1])
        names_split = names.split(" and ")
        return ((names_split[0].strip(), names_split[1].strip()), year.strip())

    # case 2: et al
    if "et al." in citation:
        tokens = citation.split("et al.")
        return ([tokens[0].strip()], tokens[1].strip())
    
    # case 3: 1 author
    else:
        split = citation.split()
        if len(split) == 1:
            return None
        else:
            tokens = citation.split()
            year = tokens[-1]
            author = " ".join(tokens[:-1])
            return ([author.strip()], year.strip())
        
        
def find_citation_matches(author_year_pairs, full_references, data, location):
    for author_year_pair in author_year_pairs:
        authors, year = author_year_pair
        for reference in full_references:
            match = True
            if f"({year})" in reference:
                for author in authors:
                    if author not in reference:
                        match = False
                if match:
                    dict_value = data.get(reference, [])
                    if dict_value == []:
                        data[reference] = []
                    if location not in dict_value:
                        data[reference] = data.get(reference, []) + [location]
            else:
                continue
    return data

def convert_pdf_to_dataframes(path) -> (pd.DataFrame, pd.DataFrame):
    """Returns (sections_df, references_df)"""
    sections, sections_df = make_sections_dataframe(path)
    references_df = make_references_dataframe(sections, sections_df)
    return sections_df, references_df 

In [636]:


def get_sections(doc):
    global fonts
    main_section = Section("", 100)

    prev_size = 100
    prev_font = ""
    curr_section = main_section
    nest = {}

    for page in doc:
        if page.number not in (doc[-1].number, doc[-2].number):
            rect = Rect(page.rect.x0 + 20, page.rect.y0 + 20, page.rect.x1 - 20, page.rect.y1 - 30)
            
            dict = page.get_text("dict", clip = rect)
            blocks = dict["blocks"]
            for block in blocks:
                if "lines" in block.keys():
                    spans = block['lines']
                    for span in spans:
                        data = span['spans']
                        for lines in data:
                            cur_size = round(lines['size'], 2)
                            cur_font = lines['font']
                            
                            
                            # print(lines['text'], lines['font'])
                            k = (lines['font'])
                            fonts[k] = fonts.get(k, []) + [lines['text']]
                            

#                             # Manual Override for References
#                             if lines['text'].strip() in ("Abstract", "Keywords", "LITERATURE CITED"):

#                                 cur = Section(lines['text'], cur_size)
#                                 curr_section = main_section.children[-1].children[-1]
#                                 curr_section.add_child(cur)
#                                 cur.set_parent(curr_section)
#                                 curr_section = cur

#                                 prev_size = round(lines['size'], 2)

                            if cur_size > prev_size:
                                curr_section = curr_section.backtrack_add(lines['text'], cur_size)
                                prev_size = curr_section.size
                                prev_font = cur_font
 
                            elif cur_size == prev_size and cur_font == prev_font:
                                curr_section.extend(lines['text'])

                            else:  
                                cur = Section(lines['text'], cur_size)   
                                curr_section.add_child(cur)
                                cur.set_parent(curr_section)
                                curr_section = cur
                                prev_size = cur_size
                                prev_font = cur_font


    final_sections = main_section.children[-1].children
    
    if len(final_sections) > 1:
        return final_sections[-2].children + final_sections[-1].children
    else:
        return final_sections[-1].children
    

('AdvOT958dcb08.B', 9.962599754333496)
('AdvOT3a8a92a3.B', 9.962599754333496)
('AdvOT3a8a92a3.B', 9.962599754333496)
('AdvOT5af5756d.B', 9.962599754333496)

In [None]:
import os
import pprint
from IPython.display import display

aom_path = "../data/aom"
aom_pdfs = [f for f in os.listdir(aom_path) if f.endswith('pdf')]
# for i in range(len(aom_pdfs)):
fonts = {}
print(doc)
path = os.path.join(aom_path, aom_pdfs[3])
doc = fitz.open(path) # open a document
sections = get_sections(doc)
pprint.pprint(fonts)


Document('../data/aom/Methot_2018_AOMReview_The Network Architecture of Human Capital- A Relational Identity Perspective.pdf')
{'AdvOT5af5756d.B': ['THE NETWORK ARCHITECTURE OF HUMAN CAPTIAL:',
                     'A RELATIONAL IDENTITY PERSPECTIVE',
                     'JESSICA R. METHOT',
                     'EMILY H. ROSADO-SOLOMON',
                     'Rutgers University',
                     'DAVID G. ALLEN',
                     'Texas Christian University and University of Warwick',
                     'Managing constellations of employee relationships is a '
                     'core competency in knowledge-',
                     'based organizations. It is timely, then, that human '
                     'resource management scholars and',
                     'practitioners are adopting an increasingly relational '
                     'view of human resources. Whereas',
                     'this burgeoning stream of research predominantly '
                     'pos