### Extract Knowledge Units and Learning Outcomes from ACM 2023 Body of Knowledge

https://csed.acm.org/wp-content/uploads/2024/04/3.1-Body-of-Knowledge-1.pdf



In [17]:
import fitz  # PyMuPDF
import re
import pandas as pd

In [2]:
def get_doc(pdf_path):
    '''Open the pdf at pdf_path with PyMuPDF (fitz) and return the document.'''
    return fitz.open(pdf_path)

In [3]:
def get_next_line(doc) :
    '''Generator over the text of doc: visits the pages in order and
    yields every line of each page, one line at a time.'''
    for pdf_page in doc:
        # "text" asks the page for its plain-text form; split it on newlines
        yield from pdf_page.get_text("text").split('\n')


In [4]:
def getKAs(doc):
    '''Get the knowledge area (KA) titles from the pdf doc.

    A title is taken to be the line immediately before each line that
    reads 'Preamble'; when that line is blank, the line before it is
    used instead. Returns the whitespace-stripped titles as a list, in
    document order. A 'Preamble' that has no such preceding line (it is
    at the very start of the text) is skipped instead of raising.
    '''
    prev1 = None    # the line just before the current one
    prev2 = None    # the line two back from the current one
    kas = []
    for line in get_next_line(doc):
        if line.strip() == 'Preamble':
            # fall back to two lines back when the previous line is blank
            if prev1 is not None and prev1.strip() == '':
                title = prev2
            else:
                title = prev1
            # title is None only when 'Preamble' comes before any title line
            if title is not None:
                kas.append(title.strip())
        prev2 = prev1
        prev1 = line
    return kas

In [5]:
def print_ku(doc, qry):
    '''Print the knowledge units for qry, e.g.,
       Software Development Fundamentals (SDF).

       Scans doc for a line equal to qry, then for the next
       'Knowledge Units' line, and prints the lines that follow until a
       'Professional Dispositions' line is reached. Blank lines and
       lines consisting only of digits are not printed. Returns None.
    '''
    # 0 = still looking for qry, 1 = qry seen, 2 = inside Knowledge Units
    printing = 0
    for line in get_next_line(doc):
        text = line.strip()
        if text == qry:
            printing = 1
        elif printing == 1 and text == 'Knowledge Units':
            printing = 2
        elif printing == 2 and text == 'Professional Dispositions':
            return
        elif printing == 2:
            # skip blank lines and digit-only lines (presumably page numbers)
            if text != '' and not text.isdigit():
                if re.match(r'[ivx]+\.', text):
                    # roman-numeral marker: no newline, so the next printed
                    # line continues on the same output line
                    print(line, end='')
                else:
                    print(line)
                

In [None]:
# Location of the Body of Knowledge pdf on the local machine
pdf_file = "/Users/dancikg/Downloads/bok.pdf"  # Replace with your PDF file path
doc = get_doc(pdf_file)
# bare expression so the notebook displays the opened document
doc

In [None]:
# Collect the knowledge area titles found just before each 'Preamble' line
kaList = getKAs(doc)
# bare expression so the notebook displays the list
kaList

### Save the output below to a file, e.g., test.csv

In [None]:
# Print the knowledge units of every knowledge area, one area after another
for k in kaList :
    print_ku(doc, k)

### Now process and format the file

In [9]:
def remove_blanks(lines) :
     '''Return a new list with the entries of lines that are not the empty string.'''
     return [line for line in lines if line != '']

def process_lines(lines):
    '''Clean up raw text lines.

    Strips surrounding whitespace from every line, joins a lone item
    marker ('a.' through 'h.') with the line that follows it, rewrites
    the spellings 'KA-Core:', 'KA core:' and 'KA Core' as 'KA Core:',
    and drops empty lines. Returns a new list; the list passed in is
    not modified.
    '''
    lines = [line.strip() for line in lines]
    n = len(lines)
    for i in range(n):
        # f. items are on multiple lines; fix this
        # (a marker on the very last line has no following line to join)
        if i + 1 < n and re.match(r'^[a-h]\.$', lines[i]):
            lines[i] = lines[i] + ' ' + lines[i + 1]
            lines[i + 1] = ''
        # inconsistent KA Core notation
        if lines[i] in ('KA-Core:', 'KA core:', 'KA Core'):
            lines[i] = 'KA Core:'
    return remove_blanks(lines)


In [10]:
def is_heading(x) :
    ''' returns (type, x) for heading, or None

    type is
      'core' when x starts with 'CS Core:', 'KA Core:' or 'Non-core:'
      'lo'   when x starts with 'Illustrative Learning Outcomes:'
      'ka'   when x starts with 2-3 capital letters, a dash, one or two
             dash-separated words made of letters, then ': '
             (e.g. 'AL-Foundational: ')
    '''
    if re.match(r'^(CS Core:|KA Core:|Non-core:)', x):
        return 'core', x
    if re.match(r'^Illustrative Learning Outcomes:', x):
        return 'lo', x
    # [A-Za-z] rather than [A-z]: the latter also admits [ \ ] ^ _ and `
    if re.match(r'^[A-Z]{2,3}-[A-Za-z]+(-[A-Za-z]+)?: ', x):
        return 'ka', x
    return None

In [11]:
def extractItemNumber(s) :
    ''' gets item in format x.

    When s starts with 1 to 4 digits/lowercase letters followed by '. ',
    returns those characters (without the dot and space); otherwise
    returns None.
    '''
    m = re.match(r'^([0-9a-z]{1,4})\. ', s)
    return m.group(1) if m else None

def getNextItemNumber(lines, i) :
    '''Returns index of next item number

    Looks at the lines after position i and returns the index of the
    first one that is a heading (is_heading) or starts with an item
    number (extractItemNumber). When there is no such line, returns
    len(lines), so that lines[i:result] covers every remaining line
    (returning the last index would leave the final line out of that
    slice). For i <= 1 no search is done and i itself is returned.
    '''
    # NOTE(review): positions 0 and 1 are never searched from; this guard is
    # kept from the original code -- confirm it is intentional.
    if i <= 1:
        return i
    for j in range(i + 1, len(lines)):
        if is_heading(lines[j]) or extractItemNumber(lines[j]):
            return j
    # nothing found: one past the end (or i itself if i is already beyond it)
    return max(i, len(lines))

In [12]:
# for debugging purposes
def getLineNumbers(lines, qry) :
    '''Return the positions in lines of all entries that contain qry.'''
    hits = []
    for pos, text in enumerate(lines):
        if qry in text:
            hits.append(pos)
    return hits

In [13]:
## load the saved text, one entry per line of the file, then clean it up
with open('test.csv') as f :
    lines = process_lines(list(f))

In [14]:
# Merge every item with the continuation lines that follow it
i = 0
while i < len(lines) :
    next_i = getNextItemNumber(lines, i)
    if next_i <= i + 1 :
        # nothing sits between this line and the next item
        i += 1
        continue
    # fold lines i .. next_i-1 into line i, then blank out the absorbed ones
    lines[i] = ' '.join(lines[i:next_i])
    lines[i + 1:next_i] = [''] * (next_i - i - 1)
    i = next_i

lines = remove_blanks(lines)

In [15]:
# create heading lists
def assignHeading(i, h, h_col, h_type, line) :
    '''Fill in entry i of the heading column h_col (modified in place).

    i      -- index of the current line
    h      -- result of is_heading for the line: (type, text) or None
    h_col  -- list holding the heading in effect for each line
    h_type -- heading type this column tracks, e.g. 'ka', 'core', 'lo'
    line   -- text of the current line

    When the line is a heading of type h_type it becomes entry i;
    otherwise the previous entry is carried forward. The first entry
    has nothing to carry forward and is set to ''.
    '''
    if h and h[0] == h_type :
        h_col[i] = line
    elif i > 0 :
        h_col[i] = h_col[i - 1]
    else :
        # h_col[i - 1] would be h_col[-1] here, i.e. wrap around to the last entry
        h_col[i] = ''

# one column per heading type, each as long as lines and blank to start with
n = len(lines)
ka_col, core_col, lo_col = [''] * n, [''] * n, [''] * n

for i, l in enumerate(lines) :
    h = is_heading(l)
    for h_col, h_type in ((ka_col, 'ka'), (core_col, 'core'), (lo_col, 'lo')) :
        assignHeading(i, h, h_col, h_type, l)

    h_kind = h[0] if h else None
    if h_kind == 'lo' :
        # a learning-outcomes heading blanks the core column for this row
        core_col[i] = ''
    elif h_kind == 'ka' :
        # a new ka heading blanks the learning-outcomes column for this row
        lo_col[i] = ''
    


In [None]:
# create data frame: one row per line, columns 0-3 hold
# (ka heading, lo heading, core heading, line text)
df = pd.DataFrame(list(zip(ka_col, lo_col, core_col, lines)))

# main heading list takes the part before the first dash, e.g., AL from AL-Foundational
main_ka = [x.partition('-')[0] for x in df[0]]


In [None]:
# create dictionary mapping the text inside the parentheses of each KA to the full KA entry
ka_lookup = {}
for k in kaList :
    ka_lookup[k.split(' (')[1].strip(')')] = k
ka_lookup

In [20]:
# pair each prefix with its description (the text before the parenthesis),
# e.g. (AL, Algorithmic Foundations), to add to data frame
ka_labels = [(abbr, ka_lookup[abbr].partition('(')[0].strip()) for abbr in main_ka]


In [21]:
# put the (KA, KA_DESC) pairs in front of the heading/topic columns
ka_labels_df = pd.DataFrame(ka_labels)
final_df = pd.concat((ka_labels_df, df), axis = 'columns')
final_df.columns = ['KA', 'KA_DESC', 'KA_SUB', 'LO', 'CORE', 'TOPIC' ]
#final_df.to_csv('ACM_processed2.csv', sep = '\t')

In [22]:
# some topics contain headings, remove these
# (keep a row only when is_heading finds no heading in its TOPIC)
keep = [is_heading(topic) is None for topic in final_df['TOPIC']]
final_df = final_df[keep]

# save file (tab-separated, despite the .csv name)
final_df.to_csv('ACM_processed.csv', sep = '\t')
        

In [None]:
# Error checking -- are there any lines left that are not list items?

# x ends up holding the first topic only
# NOTE(review): the rest of this cell does not read this x (not_li has its
# own parameter named x) -- looks like leftover debugging; confirm
x = final_df['TOPIC']
x = list(x)[0]

def not_li(x) :
    '''Return True when x does not start like a list item.

    A list item starts with either one or more digits or 1 to 4
    lowercase letters, followed by a literal dot and a space
    (e.g. '12. ' or 'iv. ').
    '''
    # the dot is escaped so that it only matches '.', not any character
    m1 = re.match(r'^[0-9]+\. ', x)
    m2 = re.match(r'^[a-z]{1,4}\. ', x)
    return m1 is None and m2 is None

# mark the rows whose TOPIC is not a list item
index = final_df['TOPIC'].apply(not_li)
# bare expression so the notebook displays those rows
final_df[index]
