In [12]:
%matplotlib inline

In [None]:
from ankisync2 import Apkg
from tqdm import tqdm
import re
from bs4 import BeautifulSoup
import scipy as sp
import numpy as np
import os
import itertools
import math

: 

In [17]:
class Anki(object):
    THRESHOLD = 0.15  # Only record similarity values above threshold
    PARTIAL = None  # Only look at first n entries
    
    def __init__(self, file):
        self.file = os.path.abspath(file)
        self.db = []
        self.note_similarity = {'images': [], 'tags': [], 'text': [], 'overall': []}
        
    def process(self):
        self.read_file()
        self.compute_note_similarity()
        
    def read_file(self):
        with Apkg(self.file) as apkg:
            for card in tqdm(apkg, f'Reading `{self.file}`: ', 
                             total=sum(1 for _ in apkg), 
                             position=0, leave=True):

                note = card['note']
                content = dict(zip(note['model']['flds'], note['flds']))

                # Extracting images
                images = []
                for field in ['Text', 'Extra', 'Image', 'Lecture Notes', 
                              'Missed Questions', 'Pathoma', 'Boards and Beyond', 
                              'First Aid', 'Sketchy', 'Pixorize', 'Physeo', 
                              'Additional Resources']:
                    imgs = Anki.get_images(content.get(field))
                    if imgs:
                        images += imgs

                # Regularizing the 'Text' and 'Extra' fields
                if 'Text' in content.keys():
                    text = Anki.clean_html_tags(Anki.remove_cloze(content['Text']))
                else:
                    try:
                        text = clean_html_tags(content['Header'])
                    except:
                        # A few of the in-house psych cards used a weird format
                        # Honestly just gonna skip
                        continue
                        
                if 'Extra' in content.keys():
                    extra = Anki.clean_html_tags(content['Extra'])
                else:
                    extra = ''

                out = {}
                out['id'] = note['id']
                out['data'] = f'{text} \n {extra}'
                out['images'] = list(set(images))
                out['tags'] = Anki.telescope_tags(note['tags'])

                self.db.append(out)
        self.nCards = len(self.db)
                
    def compute_note_similarity(self):
        self.note_image_similarity()
        self.note_tag_similarity()
        self.note_text_similarity()
        self.note_overall_similarity()

    def note_image_similarity(self):
        res = self.note_list_similarity('images')
        return res
    
    def note_tag_similarity(self):
        res = self.note_list_similarity('tags')
        return res
    
    def note_text_similarity(self):
        pass # TODO
    
    def note_overall_similarity(self):
        pass # TODO
    
    def note_list_similarity(self, list_type):
        assert list_type.lower() in ['images', 'tags']
        res = []

        if self.PARTIAL:
            pairs = itertools.permutations(self.db[0:self.PARTIAL], 2)
            perms = int(math.factorial(self.PARTIAL)/math.factorial(self.PARTIAL-2))
        else:
            pairs = itertools.permutations(self.db, 2)
            perms = int(math.factorial(self.nCards)/math.factorial(self.nCards-2))
        
        for pair in tqdm(pairs, f'Calculating similarity of {list_type}: ',
                         total = perms, position=0, leave=True):
            sim = Anki.jaccard_similarity(pair[0][list_type], pair[1][list_type])
            if sim > self.THRESHOLD:
                res.append([pair[0]['id'], pair[1]['id'], sim])

        print(f'At theshold {self.THRESHOLD}, stored {len(res)} of {perms} combos '
              f'of {list_type} ({100*(1-len(res)/perms):.2f}% reduction)')
        
        self.note_similarity[list_type] = res
        return res
    
    def set_threshold(self, threshold=0.15):
        self.THRESHOLD = threshold

    @staticmethod
    def clean_html_tags(markup):
        soup = BeautifulSoup(markup, 'html.parser')
        for br in soup.find_all('br'):
            br.replace_with('\n')
        return soup.get_text()

    @staticmethod
    def get_images(markup):
        out = []
        if markup:
            soup = BeautifulSoup(markup, 'html.parser')
            images = soup.findAll('img')
            for image in images:
                out.append(image['src'])  
        return out

    @staticmethod
    def remove_cloze(markup):
        # txt = '<a href="blah"> Hello {{c1::world}} once {{c2::again::hint}} lol </a>'
        return re.sub('{{.*?::(.*?)(::.*?){0,}}}', '\\1', markup)
    
    @staticmethod
    def telescope_tags(taglist):
        out = []
        for tag in taglist:
            splt = tag.lower().split('::')
            for i in range(1, len(splt)):
                out.append('::'.join(splt[0:i]))
        return list(set(out))

    @staticmethod
    def jaccard_similarity(list1, list2):
        s1 = set(list1)
        s2 = set(list2)
        try:
            jaccard = float(len(s1.intersection(s2)) / len(s1.union(s2)))
        except ZeroDivisionError:
            jaccard = 0
        return jaccard

# TODO: Calculate similarity based on text
# TODO: Calculate network fusion

# TODO: combine all the whitespace?
# TODO: similarity of tags/images rather than notes 
# (easier to compute: simply count how many notes any 2 tags have in common)

In [None]:
x = Anki("Selected Notes.apkg")
x.set_threshold(0.15)
x.read_file()
x.note_image_similarity()
x.note_tag_similarity()

Analyzing `Selected Notes.apkg`: 100%|██████████| 17052/17052 [00:26<00:00, 643.48it/s] 
100%|██████████| 275311056/275311056 [08:02<00:00, 570045.02it/s]
  0%|          | 24195/275311056 [00:00<18:57, 241949.65it/s]

At theshold 0.15, stored 459152 of 275311056 combos of images (99.83% reduction)


 50%|████▉     | 137324559/275311056 [11:13<4598:31:47,  8.34it/s] 

In [18]:
x = Anki("Selected Notes.apkg")
x.PARTIAL = 2000
x.read_file()
x.note_list_similarity('tags')

Reading `/mnt/c/Users/edrid.EDRIDGE-DSOUZA-/Documents/GitHub/anki-network/Selected Notes.apkg`: 100%|██████████| 17052/17052 [00:22<00:00, 752.50it/s] 
Calculating similarity of tags: 100%|██████████| 3998000/3998000 [00:17<00:00, 228415.93it/s]

At theshold 0.15, stored 3418700 of 3998000 combos of tags (14.49% reduction)





[[1368291917470, 1368292036212, 0.75],
 [1368291917470, 1368292090487, 0.8],
 [1368291917470, 1368292152411, 0.8],
 [1368291917470, 1368292167791, 1.0],
 [1368291917470, 1368292206900, 0.8],
 [1368291917470, 1368292228660, 0.631578947368421],
 [1368291917470, 1368292311289, 0.7058823529411765],
 [1368291917470, 1368292340959, 0.8],
 [1368291917470, 1368292373934, 0.8],
 [1368291917470, 1368292406341, 0.75],
 [1368291917470, 1368292432363, 0.6],
 [1368291917470, 1368292455207, 0.6666666666666666],
 [1368291917470, 1368292503910, 1.0],
 [1368291917470, 1368292582362, 0.5],
 [1368291917470, 1368292605513, 0.631578947368421],
 [1368291917470, 1368292627109, 0.8],
 [1368291917470, 1368292646852, 1.0],
 [1368291917470, 1368292731518, 0.75],
 [1368291917470, 1368292754232, 1.0],
 [1368291917470, 1368292780041, 0.75],
 [1368291917470, 1368292796303, 0.8],
 [1368291917470, 1368292822134, 1.0],
 [1368291917470, 1402176473561, 0.16666666666666666],
 [1368291917470, 1402177808321, 0.18181818181818

In [21]:
x.db[101]

{'id': 1461884819979,
 'data': 'What receptors sense a decrease in plasma volume, providing stimuli to increase ADH secretion?\nBaroreceptors \n ',
 'images': ['tmpObYyEp.png', 'tmpgYfLu_.png'],
 'tags': ['#ak_step1_v9::#costanzo::endocrine',
  'zz::renal',
  '#ak_step1_v9::#firstaid',
  '#ak_step1_v9::#firstaid::08_endocrine',
  '#ak_step1_v9::#b&b::08_endocrinology',
  '#ak_step1_v9::#b&b::19_renal::03_electrolytes',
  '#ak_step1_v9',
  '#ak_step1_v9::#firstaid::08_endocrine::03_physiology',
  '#ak_step1_v9::#firstaid::08_endocrine::03_physiology::03_antidiuretic_hormone',
  '#ak_step1_v9::endocrine',
  '#ak_step1_v9::#costanzo',
  'zz',
  'zz::cardio',
  'zz::neuro',
  '#ak_step1_v9::#b&b::19_renal',
  '#ak_step1_v9::#b&b::08_endocrinology::05_other',
  '#ak_step1_v9::#b&b']}

In [None]:
# Doc2Vec for similarity scores:
# https://medium.com/red-buffer/doc2vec-computing-similarity-between-the-documents-47daf6c828cd
# https://stackoverflow.com/questions/53503049/measure-similarity-between-two-documents-using-doc2vec
# https://github.com/jhlau/doc2vec#pre-trained-doc2vec-models

# https://github.com/rmarkello/snfpy to fuse similarity networks
# https://github.com/maxconway/SNFtool has more visualization options
# sklearn to do general network stuff
# https://towardsdatascience.com/visualising-similarity-clusters-with-interactive-graphs-20a4b2a18534

In [23]:
from gensim.models import doc2vec
from scipy import spatial

d2v_model = doc2vec.Doc2Vec.load(model_file)

fisrt_text = '..'
second_text = '..'

vec1 = d2v_model.infer_vector(fisrt_text.split())
vec2 = d2v_model.infer_vector(second_text.split())

cos_distance = spatial.distance.cosine(vec1, vec2)
# cos_distance indicates how much the two texts differ from each other:
# higher values mean more distant (i.e. different) texts

NameError: name 'model_file' is not defined