In [1]:
from bs4 import BeautifulSoup
import re
import pandas as pd

In [2]:
#helper function to find lines in html document with DOI and titles of articles
def has_doi(href):
    """Filter for ``find_all(href=...)``: truthy when ``href`` contains "chapter/".

    A missing or empty ``href`` is handed back unchanged (falsy); otherwise
    the result of the regex search is returned (a match object, or ``None``
    when "chapter/" does not occur).
    """
    if not href:
        return href
    return re.search("chapter/", href)

In [7]:
def mining(html_doc, year, current_page, all_pages, part):
    """Extract article metadata from one saved table-of-contents page.

    Parameters
    ----------
    html_doc : str
        Path of the saved HTML document (per the original author: copy-pasted
        and saved as a .doc file). Read as UTF-8.
    year : int
        Year of publication; repeated on every row of the result.
    current_page, all_pages : str or int
        Index of this page and the number of pages of the part. Only compared
        (after ``int()``) when ``year == 2021``, to detect the last page.
    part : str or int
        Part of the publication; repeated on every row of the result.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns 'Title', 'Authors',
        'Page numbers', 'DOI', 'Year of publication', 'Part of publication'.

    Raises
    ------
    ValueError
        If the numbers of titles, authors, page ranges and DOIs found in the
        document differ (the columns could not be aligned row by row).
    """
    # utf-8 so that special characters (e.g. apostrophes) are read correctly;
    # the context manager closes the file as soon as it has been parsed
    with open(html_doc, "r", encoding="utf-8") as doc:
        soup = BeautifulSoup(doc, 'html.parser')

    # every tag whose href contains "chapter/" carries a title and a DOI
    chapter_links = soup.find_all(href=has_doi)
    titles = [link.get_text() for link in chapter_links]
    # the DOI is whatever follows "chapter/" in the link target; reading the
    # parsed attribute avoids depending on how the tag is serialized
    doi_str = [link["href"].split("chapter/", 1)[1] for link in chapter_links]

    # author names are the inner markup of the author list items
    authors_str = [item.decode_contents()
                   for item in soup.find_all("li", class_="c-author-list__item")]

    # page ranges live in the "c-meta" blocks
    page_numbers_str = []
    for meta in soup.find_all('div', class_="c-meta"):
        # original author's note: the slice strips the "Pages " label and the
        # surrounding whitespace -- tied to the saved page layout
        page_range = meta.get_text()[6:-1]
        if 'x' in page_range:  # filtering out front matter
            continue
        bounds = page_range.split("-")
        try:
            # only ranges where last - first is greater than 1 are kept
            keep = int(bounds[1]) - int(bounds[0]) > 1
        except (ValueError, IndexError):
            # not a plain "<int>-<int>" range: keep corrections and errata
            # (labels containing "C" or "E"); they are removed later
            keep = "C" in page_range or "E" in page_range
        if keep:
            page_numbers_str.append(page_range)

    # filtering out back matter, only an issue on the last page of a 2021 part
    if year == 2021 and int(current_page) == int(all_pages):
        page_numbers_str = page_numbers_str[:-1]

    # all four columns must line up row by row; fail with the actual counts
    counts = {'titles': len(titles),
              'authors': len(authors_str),
              'page numbers': len(page_numbers_str),
              'DOIs': len(doi_str)}
    if len(set(counts.values())) != 1:
        raise ValueError("{}: mismatched field counts {}".format(html_doc, counts))

    # year and part are constant for the whole document
    return pd.DataFrame({
        'Title': titles,
        'Authors': authors_str,
        'Page numbers': page_numbers_str,
        'DOI': doi_str,
        'Year of publication': [year] * len(titles),
        'Part of publication': [part] * len(titles),
    })


In [8]:
#helper function to combine all df together
def data_together(data, year):
    """Concatenate the per-page frames of one year, filter them and save a CSV.

    For 2012 the rows whose 'Title' contains "Erratum" are removed, for 2021
    those whose 'Title' contains "Correction to"; other years are left as is.
    The index is not renumbered after the removal. The result is written to
    'database_miccai_<year>.csv' (relative path) and also returned.
    """
    # marker identifying the rows to discard for this year, if any
    if year == 2012:
        marker = "Erratum"
    elif year == 2021:
        marker = "Correction to"
    else:
        marker = None

    merged = pd.concat(data, sort=False, ignore_index=True)

    if marker is not None:
        unwanted_rows = merged[merged["Title"].str.contains(marker)].index
        merged = merged.drop(unwanted_rows)

    csv_name = 'database_miccai_' + str(year) + '.csv'
    merged.to_csv(csv_name)
    return merged

In [9]:
#saving all 2012 together as one with the helper function above
# (part, number of saved pages) for each part of the 2012 proceedings
parts_2012 = [(1, 5), (2, 5), (3, 5)]

# The saved pages are named 'miccai 2012 part <part> page <page> of <total>.doc'.
# Generating the names and passing part/page/total directly avoids slicing the
# file name at fixed character positions, which only works for single digits.
miccai = []
data = []
for part, total in parts_2012:
    for page in range(1, total + 1):
        element = 'miccai 2012 part {} page {} of {}.doc'.format(part, page, total)
        miccai.append(element)
        # page, total and part are handed over as strings, as before
        data.append(mining(element, 2012, str(page), str(total), str(part)))

data_together(data, 2012)

Unnamed: 0,Title,Authors,Page numbers,DOI,Year of publication,Part of publication
0,Reliable Assessment of Perfusivity and Diffusi...,"M. Freiman, S. D. Voss, R. V. Mulkern, J. M. P...",1-9,10.1007/978-3-642-33415-3_1,2012.0,1
1,Multi-organ Abdominal CT Segmentation Using Hi...,"Robin Wolz, Chengwen Chu, Kazunari Misawa, Ken...",10-17,10.1007/978-3-642-33415-3_2,2012.0,1
2,Radiation-Free Drill Guidance in Interlocking ...,"Benoit Diotte, Pascal Fallavollita, Lejing Wan...",18-25,10.1007/978-3-642-33415-3_3,2012.0,1
3,Developing Essential Rigid-Flexible Outer Shea...,"Siyang Zuo, Takeshi Ohdaira, Kenta Kuwana, Yos...",26-33,10.1007/978-3-642-33415-3_4,2012.0,1
4,Surgical Gesture Classification from Video Data,"Benjamín Béjar Haro, Luca Zappella, René Vidal",34-41,10.1007/978-3-642-33415-3_5,2012.0,1
...,...,...,...,...,...,...
248,An Invariant Shape Representation Using the An...,"A. A. Joshi, S. Ashrafulla, D. W. Shattuck, H....",607-614,10.1007/978-3-642-33454-2_75,2012.0,3
249,Phase Contrast Image Restoration via Dictionar...,"Hang Su, Zhaozheng Yin, Takeo Kanade, Seungil Huh",615-622,10.1007/978-3-642-33454-2_76,2012.0,3
250,Context-Constrained Multiple Instance Learning...,"Yan Xu, Jianwen Zhang, Eric I-Chao Chang, Maod...",623-630,10.1007/978-3-642-33454-2_77,2012.0,3
251,Structural-Flow Trajectories for Unravelling 3...,"Katerina Fragkiadaki, Weiyu Zhang, Jianbo Shi,...",631-638,10.1007/978-3-642-33454-2_78,2012.0,3


In [10]:
#saving all 2021 together as one with the helper function above
# (part, number of saved pages) for each part of the 2021 proceedings
parts_2021 = [(1, 4), (2, 4), (3, 4), (4, 4), (5, 5), (6, 4), (7, 5), (8, 4)]

# The saved pages are named 'miccai 2021 part <part> page <page> of <total>.doc'.
# Generating the names and passing part/page/total directly avoids slicing the
# file name at fixed character positions, which only works for single digits.
miccai = []
data = []
for part, total in parts_2021:
    for page in range(1, total + 1):
        element = 'miccai 2021 part {} page {} of {}.doc'.format(part, page, total)
        miccai.append(element)
        # page, total and part are handed over as strings, as before
        data.append(mining(element, 2021, str(page), str(total), str(part)))
data_together(data, 2021)

Unnamed: 0,Title,Authors,Page numbers,DOI,Year of publication,Part of publication
0,Noisy Labels are Treasure: Mean-Teacher-Assist...,"Zhe Xu, Donghuan Lu, Yixin Wang, Jie Luo, Jaga...",3-13,10.1007/978-3-030-87193-2_1,2021,1
1,TransFuse: Fusing Transformers and CNNs for Me...,"Yundong Zhang, Huiye Liu, Qiang Hu",14-24,10.1007/978-3-030-87193-2_2,2021,1
2,Pancreas CT Segmentation by Predictive Phenoty...,"Yucheng Tang, Riqiang Gao, Hohin Lee, Qi Yang,...",25-35,10.1007/978-3-030-87193-2_3,2021,1
3,Medical Transformer: Gated Axial-Attention for...,"Jeya Maria Jose Valanarasu, Poojan Oza, Ilker ...",36-46,10.1007/978-3-030-87193-2_4,2021,1
4,Anatomy-Constrained Contrastive Learning for S...,"Bo Zhou, Chi Liu, James S. Duncan",47-56,10.1007/978-3-030-87193-2_5,2021,1
...,...,...,...,...,...,...
530,Weakly-Supervised Ultrasound Video Segmentatio...,"Ruiheng Chang, Dong Wang, Haiyan Guo, Jia Ding...",648-658,10.1007/978-3-030-87237-3_62,2021,8
531,Content-Preserving Unpaired Translation from S...,"Devavrat Tomar, Lin Zhang, Tiziano Portenier, ...",659-669,10.1007/978-3-030-87237-3_63,2021,8
532,Visual-Assisted Probe Movement Guidance for Ob...,"Cheng Zhao, Richard Droste, Lior Drukker, Aris...",670-679,10.1007/978-3-030-87237-3_64,2021,8
533,Training Deep Networks for Prostate Cancer Dia...,"Golara Javadi, Samareh Samadi, Sharareh Bayat,...",680-689,10.1007/978-3-030-87237-3_65,2021,8
