In [35]:
import pandas as pd
import re

In [36]:
#reading in files
df_2012 = pd.read_csv(r'C:\Users\chris\Desktop\Value-Analysis-Thesis\mining inital data\database_miccai_2012.csv')
df_2021 = pd.read_csv(r'C:\Users\chris\Desktop\Value-Analysis-Thesis\mining inital data\database_miccai_2021.csv')

In [37]:
#removing unnecessary index column
df_2012 = df_2012.drop("Unnamed: 0", axis = 1)
df_2021 = df_2021.drop("Unnamed: 0", axis = 1)

In [38]:
#first I create a dictionary of the words in the string
#having it as a separate method means only running it once
def create_string_dic(string):
    list_of_words = string.split(" ")
    string_dictionary = {}
    for word in list_of_words:
        word = word.lower() #making all words lower case
        if word in string_dictionary: #will return true if word already is assigned a value
            string_dictionary[word] = string_dictionary.get(word) + 1
        else:
            string_dictionary[word] = 1
    return string_dictionary

In [39]:
def check_for_keywords(string_dic, keywords_dic):
    running_counter = 0
    for word in keywords_dic.keys():
        if word in string_dic: #will add value if it exists, otherwise will give the keyword a 0
            keywords_dic.get(word).append(string_dic.get(word))
            running_counter += string_dic.get(word)
        else:
            keywords_dic.get(word).append(0)
        if 'category' in word: #the last keyword is the category which should
            #correspond to the running counter instead of 0 (as I am reasonably sure the word 'category: keyword_category')
            #does not occur in the strings I am searching
            keywords_dic.get(word)[-1] = (running_counter)
    
    return keywords_dic

In [40]:
#finding the abstracts
def find_abstract(year, place):
    abstract_list = []
    with open(place, "r", encoding = 'utf-8') as part:
        article = part.read()           
        abstracts = [i.start() for i in re.finditer("Abstract", article)]
        for index in abstracts: 
            if year == 2012 and article[index+8] != "s": #removing the "Abstracts", aka the references in 2012
                abstract_list.append(article[index:index+2000])
            elif year == 2021 and article[index+9] != "T": #removing the references in 2021 ("abstract track")
                abstract_list.append(article[index:index+2000])
    return abstract_list


In [41]:
#saving the abstracts
abstract_list2012 = find_abstract(2012, r"C:\Users\chris\Desktop\Value-Analysis-Thesis\combining proceedings txt\miccai_2012_full_txt.txt")
abstract_list2021 = find_abstract(2021, r"C:\Users\chris\Desktop\Value-Analysis-Thesis\combining proceedings txt\miccai_2021_full_txt.txt")

In [42]:
#getting the titles to combine with abstracts
list_of_dic2012 = []
titles2012 = df_2012['Title'].to_list()

    
list_of_dic2021 = []
titles2021 = df_2021['Title'].to_list()


In [43]:
#combining title and abstract
for index in range(len(titles2012)):
    titles2012[index] = titles2012[index] + " " + abstract_list2012[index]

for index in range(len(titles2021)):
    titles2021[index] = titles2021[index] + " " + abstract_list2021[index] 

In [44]:
#creating dictionary of strings to search through - only run once(technically)
for title in titles2012:
    list_of_dic2012.append(create_string_dic(title))
    
for title in titles2021:
    list_of_dic2021.append(create_string_dic(title))

In [45]:
#initialising the keyword dictionary and searching the strings for the given key words
#creating a dataframe where each column is a keyword and the value is the number of occurences of that word 
#each row corresponds to the index of the article
def generate_keyword_search_df(keywords, list_of_dic):
    initial_keywords_dic = {key:[] for key in keywords}
    for index in range(len(list_of_dic)):
        keyword_dic = (check_for_keywords(list_of_dic[index], initial_keywords_dic))
    df = pd.DataFrame(keyword_dic) 
    return df

In [46]:
#reading in the keywords file and creating the list of lowercase words to check for
def reading_keywords(place):
    with open(place, "r", encoding = 'utf-8') as part:
        string = part.read()
        string = string.lower()
        keywords = string.split('\n') 
    return keywords

In [47]:
#calling the method above
df_2012_keywords = generate_keyword_search_df(reading_keywords('other keywords'), list_of_dic2012)
#df_2021_keywords = generate_keyword_search_df(keywords, list_of_dic2021)

In [48]:
def check_rules(df, rules_file):
    category = 'category: ' + rules_file[:len(rules_file)-6]
    with open(rules_file, "r", encoding = 'utf-8') as part:
        string = part.read()
        string = string.lower()
        rules = string.split('\n') 
        
        for rule in rules:
            elements = rule.split(' + ')

        for index in range(len(df)):
            if df.loc[index, elements[0]] > 0 and df.loc[index, elements[1]] > 0: #what about the instance where I have a rule
                                                                                    #consisting of more than two elements??
                df.loc[index, category] +=1
  
    return df[category]

In [135]:
keywords_list=['other keywords', 'classification keywords', 'segmentation keywords']
rules_list = ['other rules', 'classification rules', 'segmentation rules']

index = 0     
categoryA = check_rules(generate_keyword_search_df(reading_keywords(keywords_list[index]), list_of_dic2012), rules_list[index])
categoryB = check_rules(generate_keyword_search_df(reading_keywords(keywords_list[index+1]), list_of_dic2012), rules_list[index+1])
df = pd.merge(categoryA, categoryB, right_index = True, left_index = True)
categoryC = check_rules(generate_keyword_search_df(reading_keywords(keywords_list[index+2]), list_of_dic2012), rules_list[index+2])
df2012 = df.join(categoryC)

In [136]:
category = df2012.idxmax(axis = 1)
category.name = 'category'
category

0      category: classification
1               category: other
2      category: classification
3      category: classification
4      category: classification
                 ...           
247    category: classification
248             category: other
249             category: other
250             category: other
251             category: other
Name: category, Length: 252, dtype: object

In [137]:
df= df2012.join(category)

In [52]:
len(df[df['category'] == 'category: classification'])

112

In [138]:
def threshold_check(df):
    unknown = 'category: unknown'

    for element in df['category: classification']:
        if element < 2:
            df.loc[element] = 0
            row = df[element].index()
            df.loc[row, 'category'] = unknown
            
    return df  

In [139]:
df = threshold_check(df)

KeyError: 1

In [111]:
df[df['category'] == 'category: unknown']

Unnamed: 0,category: other,category: classification,category: segmentation,category


In [122]:
df.head()

Unnamed: 0,category: other,category: classification,category: segmentation,category
0,0,1,0,category: classification
1,10,2,7,category: other
2,0,1,0,category: classification
3,0,1,0,category: classification
4,0,7,0,category: classification
