In [2]:
import itertools
import re

# https://github.com/rmarkello/snfpy to fuse similarity networks
# https://github.com/maxconway/SNFtool has more visualization options
# sklearn to do general network stuff
import numpy as np
import scipy as sp
from ankisync2 import Apkg
from bs4 import BeautifulSoup
from tqdm import tqdm

In [52]:
class Anki(object):
    """Parse the notes of an Anki ``.apkg`` export and score pairwise similarity.

    After :meth:`process`, ``self.res`` holds one dict per note with the keys
    ``id``, ``data`` (cleaned text), ``images`` (unique ``<img src>`` values)
    and ``tags`` (lower-cased tag prefixes, see :meth:`telescope_tags`).
    """

    # Note fields that are scanned for embedded <img> tags.
    IMAGE_FIELDS = (
        'Text', 'Extra', 'Image', 'Lecture Notes',
        'Missed Questions', 'Pathoma', 'Boards and Beyond',
        'First Aid', 'Sketchy', 'Pixorize', 'Physeo',
        'Additional Resources',
    )

    def __init__(self, file):
        """Remember the path of the ``.apkg`` file; nothing is read yet."""
        self.file = file
        self.res = []

    def process(self):
        """Run the loading pipeline (currently only :meth:`read_file`)."""
        self.read_file()
        # self.compute_similarity()

    def read_file(self):
        """Read every card of ``self.file`` and rebuild ``self.res``.

        Notes that have neither a ``Text`` nor a ``Header`` field are skipped.
        """
        # Start from scratch so re-running the cell does not duplicate notes.
        self.res = []

        with Apkg(self.file) as apkg:
            for card in tqdm(apkg, f'Analyzing `{self.file}`: ',
                             total=sum(1 for _ in apkg),
                             position=0, leave=True):

                note = card['note']
                content = dict(zip(note['model']['flds'], note['flds']))

                # Extracting images
                images = []
                for field in self.IMAGE_FIELDS:
                    images += Anki.get_images(content.get(field))

                # Regularizing the 'Text' and 'Extra' fields
                if 'Text' in content:
                    text = Anki.clean_html_tags(Anki.remove_cloze(content['Text']))
                else:
                    header = content.get('Header')
                    if header is None:
                        # A few of the in-house psych cards used a weird format
                        # (no 'Text' and no 'Header'); those are skipped.
                        continue
                    text = Anki.clean_html_tags(header)

                if 'Extra' in content:
                    extra = Anki.clean_html_tags(content['Extra'])
                else:
                    extra = ''

                out = {}
                out['id'] = note['id']
                out['data'] = f'{text} \n {extra}'
                out['images'] = list(set(images))
                out['tags'] = Anki.telescope_tags(note['tags'])

                self.res.append(out)

    def note_list_similarity(self, list_type, THRESHOLD=0.15):
        """Jaccard similarity of one list-valued key over all ordered note pairs.

        Args:
            list_type: ``'images'`` or ``'tags'`` (case-insensitive).
            THRESHOLD: only pairs whose similarity is strictly greater than
                this value are kept.

        Returns:
            A list of ``[id_a, id_b, similarity]`` entries. Pairs are ordered,
            so a kept pair appears once in each direction.
        """
        key = list_type.lower()
        assert key in ['images', 'tags'], \
            f"list_type must be 'images' or 'tags', got {list_type!r}"

        nCards = len(self.res)
        # Number of ordered pairs of distinct notes; exact integer arithmetic,
        # and simply 0 when fewer than two notes are loaded.
        perms = nCards * (nCards - 1)
        res = []

        pairs = itertools.permutations(self.res, 2)
        for first, second in tqdm(pairs, total=perms, position=0, leave=True):
            sim = Anki.jaccard_similarity(first[key], second[key])
            if sim > THRESHOLD:
                res.append([first['id'], second['id'], sim])

        # Guard the percentage against an empty pair set.
        reduction = 100 * (1 - len(res) / perms) if perms else 0.0
        print(f'At theshold {THRESHOLD}, stored {len(res)} of {perms} combos '
              f'of {list_type} ({reduction:.2f}% reduction)')

        return res

    def note_image_similarity(self, THRESHOLD=0.15):
        """Store image-based pair similarities in ``note_image_similarity_vals``."""
        res = self.note_list_similarity('images', THRESHOLD)
        self.note_image_similarity_vals = res

    def note_tag_similarity(self, THRESHOLD=0.15):
        """Store tag-based pair similarities in ``note_tag_similarity_vals``."""
        res = self.note_list_similarity('tags', THRESHOLD)
        self.note_tag_similarity_vals = res

    def note_text_similarity(self):
        """Placeholder; not implemented yet."""
        pass

    def note_overall_similarity(self):
        """Placeholder; not implemented yet."""
        pass

    @staticmethod
    def clean_html_tags(markup):
        """Return the plain text of ``markup`` with ``<br>`` turned into newlines.

        Empty or ``None`` markup yields an empty string.
        """
        if not markup:
            return ''
        soup = BeautifulSoup(markup, 'html.parser')
        for br in soup.find_all('br'):
            br.replace_with('\n')
        return soup.get_text()

    @staticmethod
    def get_images(markup):
        """Return the ``src`` of every ``<img>`` in ``markup`` (``[]`` if empty).

        ``<img>`` tags without a ``src`` attribute are ignored.
        """
        out = []
        if markup:
            soup = BeautifulSoup(markup, 'html.parser')
            for image in soup.find_all('img'):
                src = image.get('src')
                if src:
                    out.append(src)
        return out

    @staticmethod
    def remove_cloze(markup):
        """Replace each ``{{cN::answer}}`` / ``{{cN::answer::hint}}`` by ``answer``."""
        # txt = '<a href="blah"> Hello {{c1::world}} once {{c2::again::hint}} lol </a>'
        return re.sub('{{.*?::(.*?)(::.*?){0,}}}', '\\1', markup)

    @staticmethod
    def telescope_tags(taglist):
        """Expand hierarchical ``a::b::c`` tags into their lower-cased prefixes.

        Returns the unique prefixes of every tag in unspecified order.
        """
        # NOTE(review): range(1, len(parts)) stops before the full tag, so
        # 'a::b::c' yields 'a' and 'a::b' only, and a tag without '::' yields
        # nothing. Confirm whether dropping the last level is intended.
        out = set()
        for tag in taglist:
            parts = tag.lower().split('::')
            for depth in range(1, len(parts)):
                out.add('::'.join(parts[:depth]))
        return list(out)

    @staticmethod
    def jaccard_similarity(list1, list2):
        """Return ``|A & B| / |A | B|`` for the two collections, 0 if both are empty."""
        s1 = set(list1)
        s2 = set(list2)
        union = s1 | s2
        if not union:
            return 0.0
        return len(s1 & s2) / len(union)

# https://stackoverflow.com/questions/36345324/how-calculate-list-python-into-matrix-similarity
# TODO: Calculate similarity based on text
# TODO: Calculate network fusion

# TODO: collapse runs of whitespace in the extracted note text?
# TODO: compute similarity between tags/images instead of between notes (easier to compute: simply count how many notes any two tags have in common)

In [None]:
# Parse the exported deck into x.res (one dict per note).
x = Anki("Selected Notes.apkg")
x.process()
# Score all ordered note pairs by Jaccard similarity of their image lists and
# tag lists; results land in x.note_image_similarity_vals and
# x.note_tag_similarity_vals. The pair count grows quadratically with the
# number of notes, so these two calls dominate the runtime.
x.note_image_similarity()
x.note_tag_similarity()

Analyzing `Selected Notes.apkg`: 100%|██████████| 17052/17052 [00:25<00:00, 673.61it/s] 
 53%|█████▎    | 145960766/275311056 [05:00<03:51, 559318.55it/s]

In [22]:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.squareform.html#scipy.spatial.distance.squareform
# https://medium.com/neo4j/building-a-similarity-graph-with-neo4js-approximate-nearest-neighbors-algorithm-1398583b280b

import math
import itertools

# Generate pairwise distances: Jaccard similarity of the tag lists of every
# ordered pair of notes in x.res, keeping only pairs above THRESH.
nCards = len(x.res)

res = []
THRESH = 0.15

pairs = itertools.permutations(x.res, 2)
# Ordered pairs of distinct cards = n * (n - 1). Exact integer arithmetic
# avoids building two huge factorials and also works for fewer than two cards.
perms = nCards * (nCards - 1)
for pair in tqdm(pairs, total = perms, position=0, leave=True):
    sim = Anki.jaccard_similarity(pair[0]['tags'], pair[1]['tags'])
    if sim > THRESH:
        res.append([pair[0]['id'], pair[1]['id'], sim])

# Guard the percentage against an empty pair set.
reduction = 100*(1-len(res)/perms) if perms else 0.0
print(f'At theshold {THRESH}, stored {len(res)} of {perms} combos ({reduction:.2f}% reduction)')
    

100%|██████████| 997002/997002 [00:02<00:00, 492033.01it/s]

At theshold 0.15, stored 105666 of 997002 combos (89.40% reduction)





In [20]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [48]:
# Spot-check one parsed note record (keys: id, data, images, tags);
# presumably index 100 is just an arbitrary sample.
x.res[100]

{'id': 1461884807843,
 'data': 'Hypovolemia or volume contraction is another potent stimulus for ADH secretion\n \n sensed by baroreceptors; information transmitted through the vagus nerve to the hypothalamus',
 'images': ['Screen Shot 2019-11-17 at 6.45.09 PM.png',
  'zOverall_1566160514431.jpg',
  'tmpObYyEp.png',
  'paste-436192583614686.jpg',
  'Hypovolemia - ADH secretion_1566160514431.jpg',
  'tmpgYfLu_.png'],
 'tags': ['zz::cardio',
  '#ak_step1_v9::#b&b::19_renal',
  '#ak_step1_v9::#b&b::08_endocrinology',
  '#ak_step1_v9::#firstaid',
  '#ak_step1_v9::#b&b::08_endocrinology::05_other',
  '#ak_step1_v9::#sketchypath::04_renal::06_volume,_electrolyte_&_acid/base_disorders',
  '#ak_step1_v9::#firstaid::08_endocrine::03_physiology::03_antidiuretic_hormone',
  '#ak_step1_v9::endocrine',
  '#ak_step1_v9::#sketchypath',
  '#ak_step1_v9::#firstaid::08_endocrine::03_physiology',
  'zz::renal',
  '#ak_step1_v9::#b&b',
  'zz::neuro',
  '#ak_step1_v9',
  '#ak_step1_v9::#uworld',
  'zz',
  