In [1]:
import pandas as pd
import re

In [2]:
#reading in files
df_2012 = pd.read_csv(r'C:\Users\chris\Desktop\Value-Analysis-Thesis\02-mining-inital-data\database_miccai_2012.csv')
df_2021 = pd.read_csv(r'C:\Users\chris\Desktop\Value-Analysis-Thesis\02-mining-inital-data\database_miccai_2021.csv')

In [3]:
#removing unnecessary index column
df_2012 = df_2012.drop("Unnamed: 0", axis = 1)
df_2021 = df_2021.drop("Unnamed: 0", axis = 1)

## Defining categories
Making a set of helper methods to create the dictionaries for each article with words to search for, defining the keywords and the rules, checking everything and adding a threshold check to filter out categories with insufficient information found

In [4]:
#first I create a dictionary of the words in the string
#having it as a separate method means only running it once
def create_string_dic(string):
    list_of_words = string.split(" ")
    string_dictionary = {}
    for word in list_of_words:
        word = word.lower().strip() #making all words lower case and removing all white spaces
        if word in string_dictionary: #will return true if word already is assigned a value
            string_dictionary[word] = string_dictionary.get(word) + 1
        else:
            string_dictionary[word] = 1
    return string_dictionary

In [5]:
#function that checks for keyword matches 
def check_for_keywords(string_dic, keywords_dic):
    running_counter = 0
    for word in keywords_dic.keys():
        if word in string_dic: #will add value if it exists, otherwise will give the keyword a 0
            keywords_dic.get(word).append(string_dic.get(word))
            running_counter += string_dic.get(word)
            #running_counter += 1
        else:
            keywords_dic.get(word).append(0)
        if 'category' in word: #the last keyword is the category which should
            #correspond to the running counter instead of 0 (as I am reasonably sure the word 'category: keyword_category')
            #does not occur in the strings I am searching
            keywords_dic.get(word)[-1] = (running_counter)

    return keywords_dic

In [6]:
#finding the abstracts
def find_abstract(year, place):
    abstract_list = []
    with open(place, "r", encoding = 'utf-8') as part:
        article = part.read()           
        abstracts = [i.start() for i in re.finditer("Abstract", article)]
        for index in abstracts: 
            if year == 2012 and article[index+8] != "s": #removing the "Abstracts", aka the references in 2012
                abstract_list.append(article[index:index+2000])
            elif year == 2021 and article[index+9] != "T": #removing the references in 2021 ("abstract track")
                abstract_list.append(article[index:index+2000])
    return abstract_list


In [7]:
#method creating dictionary of abstract and title to search through 
def creating_dic(year, place, df):
    #saving the abstract
    abstract_list = find_abstract(year, place)
    #creating the list of dictionaries for each article
    list_of_dic = []
    #getting the title to combine with the abstract
    titles = df['Title'].to_list()
    #combining title and abstract
    for index in range(len(titles)):
        titles[index] = titles[index] + " " + abstract_list[index]
    #creating dictionary of strings to search through - only run once(technically)
    for title in titles:
        list_of_dic.append(create_string_dic(title))
    return list_of_dic

In [8]:
dic2012 = creating_dic(2012, r"C:\Users\chris\Desktop\Value-Analysis-Thesis\01-combining-proceedings-txt\miccai_2012_full_txt.txt", df_2012)
dic2021 = creating_dic(2021, r"C:\Users\chris\Desktop\Value-Analysis-Thesis\01-combining-proceedings-txt\miccai_2021_full_txt.txt", df_2021)

In [9]:
#initialising the keyword dictionary and searching the strings for the given key words
#creating a dataframe where each column is a keyword and the value is the number of occurences of that word 
#each row corresponds to the index of the article
def generate_keyword_search_df(keywords, list_of_dic):
    initial_keywords_dic = {key:[] for key in keywords}
    for index in range(len(list_of_dic)):
        keyword_dic = (check_for_keywords(list_of_dic[index], initial_keywords_dic))
    df = pd.DataFrame(keyword_dic) 
    return df

In [10]:
#reading in the keywords file and creating the list of lowercase words to check for
def reading_keywords(place):
    with open(place, "r", encoding = 'utf-8') as part:
        string = part.read()
        string = string.lower()
        keywords = string.split('\n') 
    return keywords

In [11]:
#method for checking the rules
def check_rules(df, rules_file):
    category = 'category: ' + rules_file[:len(rules_file)-6]
    with open(rules_file, "r", encoding = 'utf-8') as part:
        string = part.read()
        string = string.lower()
        rules = string.split('\n') 
        
        for index in range(len(df)):
            for rule in rules:
                threshold = 0
                element_exists = True
                elements = rule.split(' + ')
                for i in range(len(elements)):   
                    threshold += df.loc[index, elements[i].strip()]
                    if df.loc[index, elements[i].strip()] == 0:
                        element_exists = False #if just one isn't there, set to false and do not allow rule to add one
                if threshold == len(elements) and element_exists: #meaning each element in the rule was present 
                    df.loc[index, category] +=1
                           
    return df[category]

In [12]:
#method combining the helper methods above and adding the threshold check defined below

def search_and_check(keywords_list, rules_list, dic):
    #checking the classification keywords first and their rules
    dfA = check_rules(generate_keyword_search_df(reading_keywords(keywords_list[0]), dic), rules_list[0])
    #then checking the other keywords and their rules
    dfB = check_rules(generate_keyword_search_df(reading_keywords(keywords_list[1]), dic), rules_list[1])
    
    #merging the two found dataframes
    df = pd.merge(dfA, dfB, right_index = True, left_index = True)
    
    #adding the category of the column with the most highest value found in the search
    category = df.idxmax(axis = 1)
    category.name = 'category'
    df= df.join(category)
    #doing the threshold check to add the unknown category for both columns less than two (meaning low indication of either)
    #or same value in both columns (meaning no indiation either way)
    df = threshold_check(df)
    #only the final category series is returned (to be added to original database)
    
    return df['category']




In [13]:
#helper method to ensure keyword/rule results are not arbitrarily added if not enough information 
#was found
def threshold_check(df):
    unknown = 'category: unknown'
    for i in range(len(df)):
        if df.iloc[i, 0] < 2 and df.iloc[i, 1] < 2: #checking if both columns are below 2, meaning low indication of either category
            df.loc[i, 'category'] = unknown
        elif df.iloc[i, 0] == df.iloc[i, 1]: #adding a check if they are the same, then no indication either way
            df.loc[i, 'category'] = unknown
    return df  

In [14]:
keywords_list= ['classification-keywords', 'other-keywords']
rules_list = ['classification-rules', 'other-rules']

df2012_category=search_and_check(keywords_list, rules_list, dic2012)
df2021_category=search_and_check(keywords_list, rules_list, dic2021)

In [15]:
#adding the newly found categories
df_2012 = df_2012.join(df2012_category)
df_2021 = df_2021.join(df2021_category)

In [16]:
#filtering on classification only to be saved as a csv
df2012_class = df_2012[df_2012['category'] == 'category: classification']
df2021_class = df_2021[df_2021['category'] == 'category: classification']

In [17]:
#saving updated database to csv - used in references mining and analysis notebook
df_2012.to_csv("database_miccai_2012_with_cat.csv")
df_2021.to_csv("database_miccai_2021_with_cat.csv")

## Some numbers about the category outcome

In [17]:
len(df_2012[df_2012['category'] == 'category: classification'])

90

In [19]:
len(df_2012[df_2012['category'] == 'category: other'])

101

In [20]:
len(df_2012[df_2012['category'] == 'category: unknown'])

61

In [21]:
len(df_2021[df_2021['category'] == 'category: classification'])

286

In [22]:
len(df_2021[df_2021['category'] == 'category: other'])

175

In [23]:
len(df_2021[df_2021['category'] == 'category: unknown'])

70

## Viewing the different parts of proceedings

Used to manually annotate articles

In [19]:
pd.set_option('display.max_rows', None)

In [25]:
df2021_class[df2021_class['Part of publication']==5]

Unnamed: 0,Title,Authors,Page numbers,DOI,Year of publication,Part of publication,category
255,CA-Net: Leveraging Contextual Features for Lun...,"Mingzhou Liu, Fandong Zhang, Xinwei Sun, Yizho...",23-32,10.1007/978-3-030-87240-3_3,2021,5,category: classification
257,DAE-GCN: Identifying Disease-Related Features ...,"Churan Wang, Xinwei Sun, Fandong Zhang, Yizhou...",43-52,10.1007/978-3-030-87240-3_5,2021,5,category: classification
258,Enhanced Breast Lesion Classification via Know...,"Kun Chen, Yuanfan Guo, Canqian Yang, Yi Xu, Ru...",53-63,10.1007/978-3-030-87240-3_6,2021,5,category: classification
259,Multiple Meta-model Quantifying for Medical Vi...,"Tuong Do, Binh X. Nguyen, Erman Tjiputra, Minh...",64-74,10.1007/978-3-030-87240-3_7,2021,5,category: classification
262,A Coherent Cooperative Learning Framework Base...,"Xinxin Shan, Ying Wen, Qingli Li, Yue Lu, Haib...",96-106,10.1007/978-3-030-87240-3_10,2021,5,category: classification
264,A Segmentation-Assisted Model for Universal Le...,"Fei Lyu, Baoyao Yang, Andy J. Ma, Pong C. Yuen",117-127,10.1007/978-3-030-87240-3_12,2021,5,category: classification
265,Constrained Contrastive Distribution Learning ...,"Yu Tian, Guansong Pang, Fengbei Liu, Yuanhong ...",128-140,10.1007/978-3-030-87240-3_13,2021,5,category: classification
266,Conditional Training with Bounding Map for Uni...,"Han Li, Long Chen, Hu Han, Ying Chi, S. Kevin ...",141-152,10.1007/978-3-030-87240-3_14,2021,5,category: classification
267,Focusing on Clinically Interpretable Features:...,"Chong Yin, Siqi Liu, Rui Shao, Pong C. Yuen",153-162,10.1007/978-3-030-87240-3_15,2021,5,category: classification
268,Categorical Relation-Preserving Contrastive Kn...,"Xiaohan Xing, Yuenan Hou, Hang Li, Yixuan Yuan...",163-173,10.1007/978-3-030-87240-3_16,2021,5,category: classification


In [26]:
df2021_class[df2021_class['Part of publication']==2]

Unnamed: 0,Title,Authors,Page numbers,DOI,Year of publication,Part of publication,category
69,SSLP: Spatial Guided Self-supervised Learning ...,"Jiajun Li, Tiancheng Lin, Yi Xu",3-12,10.1007/978-3-030-87196-3_1,2021,2,category: classification
72,Imbalance-Aware Self-supervised Learning for 3...,"Hongwei Li, Fei-Fei Xue, Krishna Chaitanya, Sh...",36-46,10.1007/978-3-030-87196-3_4,2021,2,category: classification
73,Self-supervised Visual Representation Learning...,"Pengshuai Yang, Zhiwei Hong, Xiaoxu Yin, Cheng...",47-57,10.1007/978-3-030-87196-3_5,2021,2,category: classification
74,Contrastive Learning with Continuous Proxy Met...,"Benoit Dufumier, Pietro Gori, Julie Victor, An...",58-68,10.1007/978-3-030-87196-3_6,2021,2,category: classification
76,Self-supervised Longitudinal Neighbourhood Emb...,"Jiahong Ouyang, Qingyu Zhao, Ehsan Adeli, Edit...",80-89,10.1007/978-3-030-87196-3_8,2021,2,category: classification
78,SimTriplet: Simple Triplet Representation Lear...,"Quan Liu, Peter C. Louis, Yuzhe Lu, Aadarsh Jh...",102-112,10.1007/978-3-030-87196-3_10,2021,2,category: classification
79,Lesion-Based Contrastive Learning for Diabetic...,"Yijin Huang, Li Lin, Pujin Cheng, Junyan Lyu, ...",113-123,10.1007/978-3-030-87196-3_11,2021,2,category: classification
80,SAR: Scale-Aware Restoration Learning for 3D T...,"Xiaoman Zhang, Shixiang Feng, Yuhang Zhou, Ya ...",124-133,10.1007/978-3-030-87196-3_12,2021,2,category: classification
82,SpineGEM: A Hybrid-Supervised Model Generation...,"Xihe Kuang, Jason Pui Yin Cheung, Xiaowei Ding...",145-154,10.1007/978-3-030-87196-3_14,2021,2,category: classification
83,Contrastive Learning of Relative Position Regr...,"Wenhui Lei, Wei Xu, Ran Gu, Hao Fu, Shaoting Z...",155-165,10.1007/978-3-030-87196-3_15,2021,2,category: classification


In [21]:
df2021_class[df2021_class['Part of publication']==3]

Unnamed: 0,Title,Authors,Page numbers,DOI,Year of publication,Part of publication,category
131,Targeted Gradient Descent: A Novel Method for ...,"Junyu Chen, Evren Asma, Chung Chan",25-35,/chapter/10.1007/978-3-030-87199-4_3,2021,3,category: classification
132,A Hierarchical Feature Constraint to Camouflag...,"Qingsong Yao, Zecheng He, Yi Lin, Kai Ma, Yefe...",36-47,/chapter/10.1007/978-3-030-87199-4_4,2021,3,category: classification
135,AlignTransformer: Hierarchical Alignment of Vi...,"Di You, Fenglin Liu, Shen Ge, Xiaoxia Xie, Jin...",72-82,/chapter/10.1007/978-3-030-87199-4_7,2021,3,category: classification
136,Continuous-Time Deep Glioma Growth Models,"Jens Petersen, Fabian Isensee, Gregor Köhler, ...",83-92,/chapter/10.1007/978-3-030-87199-4_8,2021,3,category: classification
137,Spine-Transformers: Vertebra Detection and Loc...,"Rong Tao, Guoyan Zheng",93-103,/chapter/10.1007/978-3-030-87199-4_9,2021,3,category: classification
138,Multi-view Analysis of Unregistered Medical Im...,"Gijs van Tulder, Yao Tong, Elena Marchiori",104-113,/chapter/10.1007/978-3-030-87199-4_10,2021,3,category: classification
139,Stain Mix-Up: Unsupervised Domain Generalizati...,"Jia-Ren Chang, Min-Sheng Wu, Wei-Hsiang Yu, Ch...",117-126,/chapter/10.1007/978-3-030-87199-4_11,2021,3,category: classification
141,Generative Self-training for Cross-Domain Unsu...,"Xiaofeng Liu, Fangxu Xing, Maureen Stone, Jiac...",138-148,/chapter/10.1007/978-3-030-87199-4_13,2021,3,category: classification
145,Harmonization with Flow-Based Causal Inference,"Rongguang Wang, Pratik Chaudhari, Christos Dav...",181-190,/chapter/10.1007/978-3-030-87199-4_17,2021,3,category: classification
150,Reference-Relation Guided Autoencoder with Dee...,"Dan Hu, Weiyan Yin, Zhengwang Wu, Liangjun Che...",231-240,/chapter/10.1007/978-3-030-87199-4_22,2021,3,category: classification


In [20]:
df2012_class

Unnamed: 0,Title,Authors,Page numbers,DOI,Year of publication,Part of publication,category
4,Surgical Gesture Classification from Video Data,"Benjamín Béjar Haro, Luca Zappella, René Vidal",34-41,10.1007/978-3-642-33415-3_5,2012.0,1,category: classification
7,Efficient Optic Cup Detection from Intra-image...,"Yanwu Xu, Jiang Liu, Stephen Lin, Dong Xu, Car...",58-65,10.1007/978-3-642-33415-3_8,2012.0,1,category: classification
9,Thoracic Abnormality Detection with Data Adapt...,"Yang Song, Weidong Cai, Yun Zhou, Dagan Feng",74-81,10.1007/978-3-642-33415-3_10,2012.0,1,category: classification
10,Domain Transfer Learning for MCI Conversion Pr...,"Bo Cheng, Daoqiang Zhang, Dinggang Shen",82-90,10.1007/978-3-642-33415-3_11,2012.0,1,category: classification
12,Incremental Kernel Ridge Regression for the Pr...,"Binbin Pan, James J. Xia, Peng Yuan, Jaime Gat...",99-106,10.1007/978-3-642-33415-3_13,2012.0,1,category: classification
13,Fuzzy Multi-class Statistical Modeling for Eff...,"Jose George, Kathleen Vunckx, Elke Van de Cast...",107-114,10.1007/978-3-642-33415-3_14,2012.0,1,category: classification
14,Structure and Context in Prostatic Gland Segme...,"Kien Nguyen, Anindya Sarkar, Anil K. Jain",115-123,10.1007/978-3-642-33415-3_15,2012.0,1,category: classification
15,Quantitative Characterization of Trabecular Bo...,"Yinxiao Liu, Punam K. Saha, Ziyue Xu",124-131,10.1007/978-3-642-33415-3_16,2012.0,1,category: classification
16,"Genetic, Structural and Functional Imaging Bio...","Nikhil Singh, Angela Y. Wang, Preethi Sankaran...",132-140,10.1007/978-3-642-33415-3_17,2012.0,1,category: classification
17,Robust MR Spine Detection Using Hierarchical L...,"Yiqiang Zhan, Dewan Maneesh, Martin Harder, Xi...",141-148,10.1007/978-3-642-33415-3_18,2012.0,1,category: classification
