In [1]:
import pandas as pd
import re

In [2]:
#reading in files
df_2012 = pd.read_csv(r'C:\Users\chris\Desktop\Value-Analysis-Thesis\mining inital data\database_miccai_2012.csv')
df_2021 = pd.read_csv(r'C:\Users\chris\Desktop\Value-Analysis-Thesis\mining inital data\database_miccai_2021.csv')

In [3]:
#removing unnecessary index column
df_2012 = df_2012.drop("Unnamed: 0", axis = 1)
df_2021 = df_2021.drop("Unnamed: 0", axis = 1)

In [4]:
#first I create a dictionary of the words in the string
#having it as a separate method means only running it once
def create_string_dic(string):
    list_of_words = string.split(" ")
    string_dictionary = {}
    for word in list_of_words:
        word = word.lower().strip() #making all words lower case and removing all white spaces
        if word in string_dictionary: #will return true if word already is assigned a value
            string_dictionary[word] = string_dictionary.get(word) + 1
        else:
            string_dictionary[word] = 1
    return string_dictionary

In [5]:
def check_for_keywords(string_dic, keywords_dic):
    running_counter = 0
    for word in keywords_dic.keys():
        if word in string_dic: #will add value if it exists, otherwise will give the keyword a 0
            keywords_dic.get(word).append(string_dic.get(word))
            running_counter += string_dic.get(word)
        else:
            keywords_dic.get(word).append(0)
        if 'category' in word: #the last keyword is the category which should
            #correspond to the running counter instead of 0 (as I am reasonably sure the word 'category: keyword_category')
            #does not occur in the strings I am searching
            keywords_dic.get(word)[-1] = (running_counter)

    return keywords_dic

In [6]:
#finding the abstracts
def find_abstract(year, place):
    abstract_list = []
    with open(place, "r", encoding = 'utf-8') as part:
        article = part.read()           
        abstracts = [i.start() for i in re.finditer("Abstract", article)]
        for index in abstracts: 
            if year == 2012 and article[index+8] != "s": #removing the "Abstracts", aka the references in 2012
                abstract_list.append(article[index:index+2000])
            elif year == 2021 and article[index+9] != "T": #removing the references in 2021 ("abstract track")
                abstract_list.append(article[index:index+2000])
    return abstract_list


In [7]:
def creating_dic(year, place, df):
    #saving the abstract
    abstract_list = find_abstract(year, place)
    #creating the list of dictionaries for each article
    list_of_dic = []
    #getting the title to combine with the abstract
    titles = df['Title'].to_list()
    #combining title and abstract
    for index in range(len(titles)):
        titles[index] = titles[index] + " " + abstract_list[index]
    #creating dictionary of strings to search through - only run once(technically)
    for title in titles:
        list_of_dic.append(create_string_dic(title))
    return list_of_dic

In [8]:
dic2012 = creating_dic(2012, r"C:\Users\chris\Desktop\Value-Analysis-Thesis\combining proceedings txt\miccai_2012_full_txt.txt", df_2012)
dic2021 = creating_dic(2021, r"C:\Users\chris\Desktop\Value-Analysis-Thesis\combining proceedings txt\miccai_2021_full_txt.txt", df_2021)

In [9]:
#initialising the keyword dictionary and searching the strings for the given key words
#creating a dataframe where each column is a keyword and the value is the number of occurences of that word 
#each row corresponds to the index of the article
def generate_keyword_search_df(keywords, list_of_dic):
    initial_keywords_dic = {key:[] for key in keywords}
    for index in range(len(list_of_dic)):
        keyword_dic = (check_for_keywords(list_of_dic[index], initial_keywords_dic))
    df = pd.DataFrame(keyword_dic) 
    return df

In [15]:
#reading in the keywords file and creating the list of lowercase words to check for
def reading_keywords(place):
    with open(place, "r", encoding = 'utf-8') as part:
        string = part.read()
        string = string.lower()
        keywords = string.split('\n') 
    return keywords

In [16]:
def check_rules(df, rules_file):
    category = 'category: ' + rules_file[:len(rules_file)-6]
    with open(rules_file, "r", encoding = 'utf-8') as part:
        string = part.read()
        string = string.lower()
        rules = string.split('\n') 
        for index in range(len(df)):
            for rule in rules:
                threshold = 0
                element_exists = True
                elements = rule.split(' + ')
                for i in range(len(elements)):   
                    threshold += df.loc[index, elements[i].strip()]
                    if df.loc[index, elements[i].strip()] == 0:
                        element_exists = False #if just one isn't there, set to false and do not allow rule to add one
                        break 
                if element_exists == False:
                    break #no need to look at other elements of a rule if one already isn't there
                elif threshold == len(elements) and element_exists: #meaning each element in the rule was present 
                    df.loc[index, category] +=1
                           
    return df[category]

In [17]:
def search_and_check(keywords_list, rules_list, dic):
    #checking the classification keywords first and their rules
    dfA = check_rules(generate_keyword_search_df(reading_keywords(keywords_list[0]), dic), rules_list[0])
    #then checking the other keywords and their rules
    dfB = check_rules(generate_keyword_search_df(reading_keywords(keywords_list[1]), dic), rules_list[1])
    
    #merging the two found dataframes
    df = pd.merge(dfA, dfB, right_index = True, left_index = True)
    
    #adding the category of the column with the most highest value found in the search
    category = df.idxmax(axis = 1)
    category.name = 'category'
    df= df.join(category)
    #doing the threshold check to add the unknown category for both columns less than two (meaning low indication of either)
    #or same value in both columns (meaning no indiation either way)
    df = threshold_check(df)
    #only the final category series is returned (to be added to original database)
    
    return df['category']
    #return df



In [18]:
def threshold_check(df):
    unknown = 'category: unknown'
    for i in range(len(df)):
        if df.iloc[i, 0] < 2 and df.iloc[i, 1] < 2: #checking if both columns are below 2, meaning low indication of either category
            df.loc[i, 'category'] = unknown
        elif df.iloc[i, 0] == df.iloc[i, 1]: #adding a check if they are the same, then no indication either way
            df.loc[i, 'category'] = unknown
    return df  

In [19]:
keywords_list= ['classification keywords', 'other keywords']
rules_list = ['classification rules', 'other rules']

df2012_category=search_and_check(keywords_list, rules_list, dic2012)
df2021_category=search_and_check(keywords_list, rules_list, dic2021)

In [20]:
#adding the newly found categories
df_2012 = df_2012.join(df2012_category)
df_2021 = df_2021.join(df2021_category)

In [236]:
#saving updated database to csv
#df_2012.to_csv("database_miccai_2012_with_cat.csv")
#df_2021.to_csv("database_miccai_2021_with_cat.csv")

In [21]:
df_2012

Unnamed: 0,Title,Authors,Page numbers,DOI,Year of publication,Part of publication,category
0,Reliable Assessment of Perfusivity and Diffusi...,"M. Freiman, S. D. Voss, R. V. Mulkern, J. M. P...",1-9,/chapter/10.1007/978-3-642-33415-3_1,2012.0,1,category: other
1,Multi-organ Abdominal CT Segmentation Using Hi...,"Robin Wolz, Chengwen Chu, Kazunari Misawa, Ken...",10-17,/chapter/10.1007/978-3-642-33415-3_2,2012.0,1,category: other
2,Radiation-Free Drill Guidance in Interlocking ...,"Benoit Diotte, Pascal Fallavollita, Lejing Wan...",18-25,/chapter/10.1007/978-3-642-33415-3_3,2012.0,1,category: unknown
3,Developing Essential Rigid-Flexible Outer Shea...,"Siyang Zuo, Takeshi Ohdaira, Kenta Kuwana, Yos...",26-33,/chapter/10.1007/978-3-642-33415-3_4,2012.0,1,category: unknown
4,Surgical Gesture Classification from Video Data,"Benjamín Béjar Haro, Luca Zappella, René Vidal",34-41,/chapter/10.1007/978-3-642-33415-3_5,2012.0,1,category: classification
...,...,...,...,...,...,...,...
247,An Invariant Shape Representation Using the An...,"A. A. Joshi, S. Ashrafulla, D. W. Shattuck, H....",607-614,/chapter/10.1007/978-3-642-33454-2_75,2012.0,3,category: classification
248,Phase Contrast Image Restoration via Dictionar...,"Hang Su, Zhaozheng Yin, Takeo Kanade, Seungil Huh",615-622,/chapter/10.1007/978-3-642-33454-2_76,2012.0,3,category: classification
249,Context-Constrained Multiple Instance Learning...,"Yan Xu, Jianwen Zhang, Eric I-Chao Chang, Maod...",623-630,/chapter/10.1007/978-3-642-33454-2_77,2012.0,3,category: other
250,Structural-Flow Trajectories for Unravelling 3...,"Katerina Fragkiadaki, Weiyu Zhang, Jianbo Shi,...",631-638,/chapter/10.1007/978-3-642-33454-2_78,2012.0,3,category: other


In [23]:
len(df_2012[df_2012['category'] == 'category: other'])

99

In [24]:
len(df_2012[df_2012['category'] == 'category: classification'])

91

In [25]:
len(df_2012[df_2012['category'] == 'category: unknown'])

62

In [26]:
len(df_2021[df_2021['category'] == 'category: other'])

175

In [27]:
len(df_2021[df_2021['category'] == 'category: unknown'])

71

In [28]:
len(df_2021[df_2021['category'] == 'category: classification'])

285