In [1]:
from bs4 import BeautifulSoup
import re
import pandas as pd

In [2]:
# Show every row when a DataFrame is displayed (None disables the row limit).
pd.options.display.max_rows = None

In [3]:
#helper function to find lines in html document with DOI and titles of articles
def has_doi(href):
    """Tell whether an ``href`` attribute value points at a chapter page.

    Meant to be passed as ``href=has_doi`` to ``soup.find_all``.

    Returns the falsy ``href`` itself (``None`` or ``""``) when there is
    nothing to inspect, otherwise the regex match for ``"chapter/"`` (truthy)
    or ``None`` when the substring does not occur.
    """
    if not href:
        return href
    return re.search("chapter/", href)

In [4]:
#main mining function returning the initial dataframe 

def mining(html_doc, year, current_page, all_pages, part):
    """Parse one saved table-of-contents page into a DataFrame of articles.

    Parameters
    ----------
    html_doc : str
        Path of the saved HTML page (copy-pasted and stored as a .doc file),
        read as UTF-8.
    year : int
        Year of publication, repeated in every row. The value 2021 also
        enables the back-matter filter described below.
    current_page, all_pages : str or int
        Number of this page and number of pages of the part. They are only
        compared (after ``int`` conversion) when ``year == 2021``.
    part : str or int
        Part of the publication, repeated in every row.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns 'Title', 'Authors',
        'Page numbers', 'DOI', 'Year of publication', 'Part of publication'.

    Raises
    ------
    ValueError
        If the page does not yield the same number of titles, author entries
        and page ranges (the columns could not be aligned).
    """
    # The context manager closes the file even when parsing fails.
    with open(html_doc, "r", encoding="utf-8") as doc:
        soup = BeautifulSoup(doc, 'html.parser')

    # Every chapter link carries the title (link text) and the DOI (in its href).
    chapter_links = soup.find_all(href=has_doi)
    titles = [link.get_text() for link in chapter_links]
    # has_doi only accepts hrefs containing "chapter/", so the split always has
    # a second piece. Reading the attribute itself (instead of slicing the
    # serialized tag) stays correct when href is not the last attribute.
    doi_str = [link["href"].split("chapter/", 1)[1] for link in chapter_links]

    # Author entries: take the text content instead of slicing the raw markup,
    # so escaped characters and extra classes cannot leak into the names.
    author_items = soup.find_all("li", class_="c-author-list__item")
    authors_str = [item.get_text() for item in author_items]

    # Page ranges of the entries that are kept as articles.
    page_numbers_str = []
    for meta in soup.find_all('div', class_="c-meta"):
        # Drops the first six characters (the "Pages " prefix) and the last one.
        page_range = meta.get_text()[6:-1]
        if 'x' in page_range:  # filtering out front matters
            continue
        bounds = page_range.split("-")
        try:
            # Only ranges whose end exceeds the start by more than one page are kept.
            if int(bounds[1]) - int(bounds[0]) > 1:
                page_numbers_str.append(page_range)
        except (IndexError, ValueError):
            # No "-" or non-numeric bounds: keep corrections and errata
            # ("C"/"E" page labels); they are removed later by title.
            if "C" in page_range or "E" in page_range:
                page_numbers_str.append(page_range)

    # Filtering out back matters, only an issue in 2021: on the last page of a
    # part the final page range does not belong to an article.
    if year == 2021 and int(current_page) == int(all_pages):
        page_numbers_str = page_numbers_str[:-1]

    # Fail with a message that names the file instead of pandas' generic
    # "arrays must be of the same length" error.
    if not (len(titles) == len(authors_str) == len(page_numbers_str)):
        raise ValueError(
            "{}: found {} titles, {} author entries and {} page ranges; "
            "expected one of each per article".format(
                html_doc, len(titles), len(authors_str), len(page_numbers_str)
            )
        )

    # Creating the column names and content for the dataframe.
    data = {
        'Title': titles,
        'Authors': authors_str,
        'Page numbers': page_numbers_str,
        'DOI': doi_str,
        'Year of publication': [year] * len(titles),
        'Part of publication': [part] * len(titles),
    }
    return pd.DataFrame(data)


In [5]:
#helper function to combine all df together
def data_together(data, year):
    """Concatenate the per-page frames of one year and save them as CSV.

    ``data`` is a list of DataFrames that are stacked with a fresh 0..n-1
    index. For 2012 the rows whose 'Title' contains "Erratum" are removed,
    for 2021 those containing "Correction to"; other years are left as is.
    The result is written to 'database_miccai_<year>.csv' in the working
    directory and returned.
    """
    combined_frame = pd.concat(data, ignore_index=True, sort=False)

    # Title fragment that marks the non-article rows of the given year.
    if year == 2012:
        unwanted = "Erratum"
    elif year == 2021:
        unwanted = "Correction to"
    else:
        unwanted = None

    if unwanted is not None:
        is_unwanted = combined_frame["Title"].str.contains(unwanted)
        rows_to_drop = combined_frame[is_unwanted].index
        # The surviving rows keep their index labels (no renumbering).
        combined_frame = combined_frame.drop(rows_to_drop)

    csv_name = 'database_miccai_' + str(year) + '.csv'
    combined_frame.to_csv(csv_name)
    return combined_frame

In [6]:
#saving all 2012 together as one with the helper function above
# MICCAI 2012: three parts, each saved as five pages, listed part by part.
miccai = ['miccai 2012 part {} page {} of 5.doc'.format(part_no, page_no)
          for part_no in range(1, 4)
          for page_no in range(1, 6)]

data = []
for element in miccai:
    # The part, page and page-count digits sit at fixed positions in the file name.
    part, current_page, all_pages = element[17:18], element[24:25], element[29:30]
    data.append(mining(element, 2012, current_page, all_pages, part))

data_together(data, 2012)

252

In [7]:
#saving all 2021 together as one with the helper function above
# MICCAI 2021: (part, number of saved pages) in the order the files are read.
pages_per_part = [(1, 4), (2, 4), (3, 4), (4, 4), (5, 5), (6, 4), (7, 5), (8, 4)]
miccai = ['miccai 2021 part {} page {} of {}.doc'.format(part_no, page_no, page_count)
          for part_no, page_count in pages_per_part
          for page_no in range(1, page_count + 1)]

data = []
for element in miccai:
    # The part, page and page-count digits sit at fixed positions in the file name.
    part, current_page, all_pages = element[17:18], element[24:25], element[29:30]
    data.append(mining(element, 2021, current_page, all_pages, part))
data_together(data, 2021)

531