In [None]:
from neo4j import GraphDatabase
from CurriculumDB.Modelsn4j import *
import docx
import os
from sklearn.metrics import pairwise_distances
from sklearn.cluster import  AgglomerativeClustering

In [None]:
# Connection settings for the Neo4j instance. Each value can be overridden with
# an environment variable so credentials do not have to be edited into (or
# committed with) the notebook; the fallbacks are the original local settings.
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "curriculum"),
    os.environ.get("NEO4J_PASSWORD", "mycurriculum"),
)

driver = GraphDatabase.driver(URI, auth=AUTH)
print(driver)
# Fail early here if the database cannot be reached with these settings.
print(driver.verify_connectivity())


In [None]:
#Set this depending on location
# This name is handed to CurriculumFactory as its second argument in the next cell.
factoryname='curriculumdb'

In [None]:

# Factory bound to the open driver; the cells below use it to fetch or create
# 'Programme' elements and to list 'Module' elements.
factory = CurriculumFactory(driver, factoryname)

In [None]:
# Degree routes keyed by route code. Each value is a two-item list: a
# 'BMS'/'BIO' grouping tag followed by the route title. Only the title
# (index 1) is read when the Programme elements are created below.
routes = {
    'BIMS': ["BMS", "Biomedical Sciences"],
    'NEUR': ["BMS", "Neuroscience"],
    'PHAR': ["BMS", "Pharmacology"],
    'PHSC': ["BMS", "Physiological Sciences"],
    'BIOLOGSCI': ["BIO", "Biological Sciences"],
    'BIOC': ["BIO", "Biochemistry"],
    'BSBI': ["BIO", "Biological Sciences (Bioinformatics)"],
    'BSPS': ["BIO", "Biological Sciences (Plant Sciences)"],
    'MBIO': ["BIO", "Microbiology"],
    'MOLG': ["BIO", "Molecular Genetics"],
    'MOLB': ["BIO", "Molecular Biology"],
    'BCDD': ["BIO", "Biological Chemistry and Drug Discovery"],
}

Get the required fields from the Programme object

In [None]:
# Inspect the class attribute; its value is shown as the cell output.
Programme.requiredParams


In [None]:
# Fetch (or create) one Programme element per route code and keep the returned
# objects in a dictionary keyed by that code.
routeobj = {}
for p, route_info in routes.items():
    # route_info[1] is the route title; it becomes the Programme name.
    prog = factory.get_or_create_Element('Programme', code=p, name=route_info[1])
    routeobj[p] = prog
    

Test extraction of data from one programme file.

In [None]:
# Open a single programme specification for the trial extraction below.
# NOTE(review): this is an absolute path on one machine and is not the same
# location as the `progdir` assigned later in the notebook.
doc = docx.Document("c:/Users/marti/Documents/LifeSciteaching/Curriculum/Programmes/UG/BSc Hons Biomedical Sciences Programme Specification 2324.docx")



In [None]:
# Index every table row in the document by the first whitespace-separated token
# of its first cell (e.g. the section number '2.10'), so the cells of a section
# can be looked up by key. A later row with the same key replaces an earlier one.
modmap = []
structure = {}
for t in doc.tables:
    for r in range(len(t.rows)):
        cells = t.row_cells(r)
        tokens = cells[0].text.split() if cells else []
        if not tokens:
            # A blank first cell gives nothing to key the row on, and indexing
            # the empty token list would raise IndexError, so skip the row.
            continue
        structure[tokens[0]] = cells

# The first cell of row '2.10' contains nested tables of modules. They are
# labelled alternately 'Core' and 'Elective' in document order.
tc = 0
for m in structure['2.10'][0].tables:
    coretype = ['Core', 'Elective'][tc % 2]
    tc += 1
    # Row 0 of each nested table is skipped (presumably a header row).
    for r in range(1, len(m.rows)):
        entry = [coretype] + [c.text for c in m.row_cells(r)]
        print(entry)
        modmap.append(entry)

Build a dictionary of modules from the database for mapping to programmes

In [None]:
# Fetch every 'Module' element through the factory and index the results by
# their 'code' parameter, for lookup when mapping modules to programmes.
mods = factory.get_all_elements('Module')
modules ={m.params['code']:m for m in mods}

In [None]:
# Display the code -> Module lookup built in the previous cell.
modules


List all the components extracted from the document tables.

In [None]:
# For every indexed row key, show the text of the last cell in that row.
{key: cells[-1].text for key, cells in structure.items()}

In [None]:
# Dictionary to hold the programmes.
# It starts empty; the loading loops further down store one Programme element
# per programme/route code.
programmes={}

In [None]:
# List the files in the programme specification directory.
# NOTE(review): in the cells visible here `progdir` is first assigned in a
# later cell, so on a fresh kernel this raises NameError unless that cell has
# already been run.
os.listdir(progdir)

Test mapping on one programme

In [None]:
# Map the modules extracted from the test document onto the 'BIMS' programme.
# `programmes` is still empty at this point, so fall back to the Programme
# element already fetched into `routeobj` instead of raising KeyError.
bims = programmes.setdefault('BIMS', routeobj['BIMS'])
for m in modmap:
    # m is [coretype, cell text, ...]; the first cell text is used as the module
    # code once any '*' markers around it are stripped.
    code = m[1].strip('*')
    module = modules.get(code)
    if module:
        # Codes with no matching Module in the database are silently skipped.
        bims.map_module(module, m[0] == 'Elective', year='23/24')

In [None]:
# Directory holding the programme specification documents (machine-specific).
#progdir='c:/Users/marti/Documents/LifeSciteaching/Curriculum/Programmes/UG/'
progdir = 'c:/Users/dmamartin/OneDrive - University of Dundee/Teaching/CurriculumDB/Programmes/UG/'

# Programme spec documents inside `progdir`, keyed by the code under which each
# programme is loaded.
# NOTE(review): the key 'PHYS' here differs from the 'PHSC' route code used in
# `routes`; confirm which code is intended before loading that document.
docs = {
    'BIO': 'BSc Hons Biologsci Programme Specification QASv3.docx',
    'BIMS': 'BSc Hons Biomedical Sciences Programme Specification 2324.docx',
    'NEUR': 'BSc Hons Neuroscience Programme Specification QASv8.docx',
    'PHAR': 'BSc Hons Pharmacology Programme Specification QASv6.docx',
    'PHYS': 'BSc Hons Physiological Sciences Programme Specification QASv6.docx',
    'BIOBIMSNUS': 'BSc (Hons) Joint Degree Biological and Biomedical Sciences QASv5.docx',
    'BIOHAN': 'BSc (Hons) Joint Degree Molecular Biosciences QASv2.docx',
}

In [None]:
#Change the list to load and process specific programmes
for route in ['BIOHAN']:
    spec_path = os.path.join(progdir, docs[route])
    print(spec_path)
    doc = docx.Document(spec_path)
    modmap = []

    # Index every table row by the first token of its first cell (section
    # numbers such as '1.1', '2.10', or a label such as 'Applicability').
    structure = {}
    for t in doc.tables:
        for r in range(len(t.rows)):
            cells = t.row_cells(r)
            tokens = cells[0].text.split() if cells else []
            if not tokens:
                # Blank first cell: no key available (and [0] would raise
                # IndexError), so skip the row.
                continue
            structure[tokens[0]] = cells

    # Each `structure` value is a list of cells, so the text has to be read
    # from a cell before it can be split. The last cell is used, mirroring how
    # the programme name is read from row '1.1'; the year is taken to be the
    # final word of that text. NOTE(review): confirm against the documents.
    sy = structure['Applicability'][-1].text.split()[-1]

    # Nested tables in the first cell of row '2.10' alternate Core / Elective.
    tc = 0
    for m in structure['2.10'][0].tables:
        coretype = ['Core', 'Elective'][tc % 2]
        tc += 1
        # Row 0 of each nested table is skipped (presumably a header row).
        for r in range(1, len(m.rows)):
            entry = [coretype] + [c.text for c in m.row_cells(r)]
            print(entry)
            modmap.append(entry)

    programmes[route] = factory.get_or_create_Element('Programme', code=route, name=structure['1.1'][-1].text)
    for m in modmap:
        # The first cell text, with '*' markers stripped, is the module code;
        # codes with no matching Module in `modules` are skipped.
        module = modules.get(m[1].strip('*'))
        if module:
            programmes[route].map_module(module, m[0] == 'Elective', year=sy)

In [None]:
# Processing biologicals
# Open the document registered under docs['BIO']; it replaces whatever document
# `doc` was bound to before.
doc = docx.Document(os.path.join(progdir,docs['BIO']))

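Biological degrees other than Biological Sciences have their programme specification in tables appended to the Biological Sciences document. The dictionary below maps each degree route to the index of its first table and to the route's name. Each degree has a table of core modules followed by a table of optional modules.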

In [None]:
# For each biological route: (index into doc.tables of the first of its two
# module tables, route title substituted into the programme name).
tablecount = {
    'BSBI': (8, 'Biological Sciences (Bioinformatics)'),
    'BSPS': (10, 'Biological Sciences (Plant Sciences)'),
    'BIOC': (12, 'Biochemistry'),
    'BCDD': (14, 'Biological Chemistry and Drug Discovery'),
    'MBIO': (16, 'Microbiology'),
    'MOLB': (18, 'Molecular Biology'),
    'MOLG': (20, 'Molecular Genetics'),
}

# list tables: index, number of rows and the text of the top-left cell, so the
# indices above can be checked against the document.
count = 0
for count, t in enumerate(doc.tables, start=1):
    # `count` runs one ahead so it ends as the total number of tables.
    print(count - 1, len(t.rows), t.row_cells(0)[0].text)

Now link all modules for the biological programmes

In [None]:
tablelist = list(doc.tables)

# The section index and the year come from the same document for every route,
# so build them once instead of once per route.
structure = {}
for t in tablelist:
    for r in range(len(t.rows)):
        cells = t.row_cells(r)
        tokens = cells[0].text.split() if cells else []
        if not tokens:
            # Blank first cell: no key available (and [0] would raise
            # IndexError), so skip the row.
            continue
        structure[tokens[0]] = cells

# Each `structure` value is a list of cells, so the text has to be read from a
# cell before it can be split. The last cell is used, mirroring how the name is
# read from row '1.1'. NOTE(review): confirm against the document.
sy = structure['Applicability'][-1].text.split()[-1]

for rt in tablecount:
    first_table, route_title = tablecount[rt]
    print(rt, first_table)
    modmap = []
    tc = 0
    try:
        # Two consecutive tables per route: core modules, then electives.
        for m in tablelist[first_table:first_table + 2]:
            coretype = ['Core', 'Elective'][tc % 2]
            tc += 1
            # Row 0 of each table is skipped (presumably a header row).
            for r in range(1, len(m.rows)):
                entry = [coretype] + [c.text for c in m.row_cells(r)]
                print(entry)
                modmap.append(entry)
        # The shared specification names the Biological Sciences programme;
        # substitute this route's own title into that name.
        programmes[rt] = factory.get_or_create_Element(
            'Programme',
            code=rt,
            name=structure['1.1'][-1].text.replace('Biological Sciences', route_title),
        )
        for m in modmap:
            # Codes with no matching Module in `modules` are skipped.
            module = modules.get(m[1].strip('*'))
            if module:
                programmes[rt].map_module(module, m[0] == 'Elective', year=sy)
    except Exception as e:
        # Best effort: report the failing route (not the inner row index) and
        # carry on with the remaining routes.
        print(rt, tablecount.get(rt, 'Error'), e)

In [None]:
#Add new modules
# Point `progdir` at the "New Biomed" programme directory (machine-specific).
#progdir='c:/Users/marti/Documents/LifeSciteaching/Curriculum/Programmes/UG/'
progdir = 'c:/Users/dmamartin/OneDrive - University of Dundee/Teaching/CurriculumDB/New Biomed/Programmes/'

#List the programme spec documents
docs = {
    'BIO': 'BSc Hons Biologsci Programme Specification QASv3.docx',
    'BIMS': 'BSc Hons Biomedical Sciences Programme Specification 2324.docx',
    'NEUR': 'BSc Hons Neuroscience Programme Specification QASv8.docx',
    'PHAR': 'BSc Hons Pharmacology Programme Specification QASv6.docx',
    'PHYS': 'BSc Hons Physiological Sciences Programme Specification QASv6.docx',
    'BIOBIMSNUS': 'BSc (Hons) Joint Degree Biological and Biomedical Sciences QASv5.docx',
    'BIOHAN': 'BSc (Hons) Joint Degree Molecular Biosciences QASv2.docx',
}
    



# Extracting ILOs

Extract the relevant section from the programme table (section 2.3) and then split so they are one ILO per line.

In [None]:
# Collect the text of section 2.3 from every programme document, split into
# one entry per line, keyed by the document's code in `docs`.
ilos = {}
for d in docs:
    structure = {}
    doc = docx.Document(os.path.join(progdir, docs[d]))
    for t in doc.tables:
        for r in range(len(t.rows)):
            cells = t.row_cells(r)
            tokens = cells[0].text.split() if cells else []
            if not tokens:
                # Blank first cell: no key available (and [0] would raise
                # IndexError), so skip the row.
                continue
            structure[tokens[0]] = cells
    # The second cell of row '2.3' holds the ILO text. Empty lines are kept
    # here and filtered out when the list is flattened later.
    ilos[d] = structure['2.3'][1].text.split('\n')

Reshape the extracted ILOs into (programme, ILO) tuples and define a simple word-bag comparison function.

In [None]:
 # Flatten to one (programme code, ILO text) tuple per non-empty ILO line.
 iloslong = [(prog, text) for prog, texts in ilos.items() for text in texts if text]

def ilocompare(a, b):
    '''Return a word-bag distance between two (Module, ILO) tuples.

    Only the ILO text (index 1 of each tuple) is compared. Each text is split
    on whitespace and reduced to the set of words longer than 4 characters;
    the result is 1 - |intersection| / |union| of the two sets, so 0 means the
    same word set and 1 means no shared words.
    This does no stemming or thesaurus lookup.

    When neither text has a word longer than 4 characters there is nothing to
    compare: identical texts are given distance 0 and different texts the
    maximum distance 1. Without the zero for identical texts an ILO would have
    a non-zero distance to itself, which breaks the zero-diagonal property a
    distance matrix built from this function must have.
    '''
    seta = {word for word in a[1].split() if len(word) > 4}
    setb = {word for word in b[1].split() if len(word) > 4}
    union = seta | setb
    if not union:
        return 0.0 if a[1] == b[1] else 1.0
    return 1 - (len(seta & setb) / len(union))

Create a full distance matrix for all ILOs in the list 

In [None]:
# Square distance matrix as a list of rows:
# dm[row][col] == ilocompare(iloslong[col], iloslong[row]).
dm = [
    [ilocompare(col_ilo, row_ilo) for col_ilo in iloslong]
    for row_ilo in iloslong
]

Import necessary libraries to cluster the ILOs

In [None]:
import random
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, ward, fcluster,cut_tree
from scipy.spatial.distance import pdist, squareform
%matplotlib inline

In [None]:
# Collapse the square distance matrix into the condensed (upper-triangle)
# vector form expected by the hierarchical clustering routines.
pdm = squareform(dm)
# Build the hierarchy with Ward linkage.
iloh = linkage(pdm, method='ward')

In [None]:
#Calculate the dendrogram
dendrogram(iloh)
#Display the dendrogram
plt.show()
# Picked a cutoff by eye from the tree and list cluster number for each ILO.
# The height of 1.0 is that hand-picked cutoff; the result is indexed per ILO
# (clusters[p]) in the next cell.
clusters=cut_tree(iloh, height=1.0)

In [None]:
#Extend ILO list to give a unique number and the cluster number.
for p in range(len(iloslong)):
    # clusters[p] is a one-element array (a single cut height was requested).
    # Take the element explicitly and store a plain int, so both branches
    # record the same type and no array-to-scalar conversion is relied on.
    cluster_id = int(clusters[p][0])
    if len(iloslong[p]) == 2:
        # First run: (programme, ILO) -> [ref, programme, ILO, cluster]
        iloslong[p] = [p, *iloslong[p], cluster_id]
    else:
        # Re-run after re-clustering: only refresh the cluster number.
        iloslong[p][3] = cluster_id
    


In [None]:
#Group cluster terms together.
# Keys are cluster numbers (index 3 of each entry); values are the lists of
# entries assigned to that cluster, in their original order.
groupedterms = {}
for x in iloslong:
    groupedterms.setdefault(x[3], []).append(x)
    

In [None]:
# Display the entries grouped under cluster number 3.
groupedterms[3]

In [None]:
#Output ILOS with cluster info to a tab separated file
# The `with` block closes the file even if a row fails to format or write.
with open('programmeilos.txt', 'w') as ofh:
    print('Ref', 'Programme', 'ILO', 'Cluster', file=ofh, sep='\t')
    for i in iloslong:
        # Remove U+2010 (hyphen) characters from the ILO text; this also
        # updates the entry held in `iloslong`.
        i[2] = i[2].replace('\u2010', '')
        # Tabs inside a field would break the column layout, so each field is
        # written with its tabs replaced by spaces.
        print('\t'.join(f'{x}'.replace('\t', ' ') for x in i), file=ofh)