In [None]:
import pandas as pd
import json
import genanki as ga
from random import randrange

In [None]:
# corpus name: used both as the directory and as the file-name prefix
corpus = "kan_wikipedia_2021_300K"


# general function to read the corpus tables properly
def read_csv(f, names):
    """Read the tab-separated corpus file ``{corpus}/{corpus}-{f}``.

    f     -- file-name suffix, e.g. "words.txt"
    names -- column names handed to pandas, in file order

    Returns the parsed pandas DataFrame.
    """
    path = f"{corpus}/{corpus}-{f}"
    # quoting = 3 prevents quotes in the file from messing up data
    return pd.read_csv(path, sep = "\t", names = names, quoting = 3)

In [None]:
# load the corpus word table (columns: wid, word, freq)
words = read_csv("words.txt", names = ["wid", "word", "freq"])
# skip ids up to 100 (noted as being all symbols) and keep the next 10000 rows
words = words.loc[words["wid"] > 100].head(10000)

In [None]:
# load the corpus sentences and index them by sentence id
sents = read_csv("sentences.txt", ["sid", "sent"]).set_index("sid")

In [None]:
# inverted index: every row ties a word id (wid) to a sentence id (sid)
# in which that word appears
sent_inv = read_csv(f = "inv_w.txt", names = ["wid", "sid", "spos"])

In [None]:
# match each word with one sentence in which it appears: only the first
# inverted-index row per wid is kept
words = pd.merge(words, sent_inv, on = "wid").drop_duplicates("wid")
# get the text of the matched sentences. Several words can point at the same
# sid, so each sid is looked up only once; repeating sentence rows on the
# right-hand side would turn this into a many-to-many merge whose extra rows
# would only be thrown away again. A sid missing from `sents` still raises
# KeyError, and the final drop_duplicates is kept as a guard in case `sents`
# itself carries a repeated sid.
words = pd.merge(words,
                 sents.loc[words.sid.drop_duplicates()],
                 on = "sid").drop_duplicates("wid")

In [None]:
# dictionary data; the json file was generated with yq.
alar = pd.read_json("alar.json").drop(columns = ["id", "head", "origin", "info"])
# collapse each entry's list of definitions into a single html string,
# numbering them from 1 and separating them with line breaks
alar["defs"] = alar["defs"].apply(
    lambda senses: "<br>".join(
        f"{num} ({sense['type']}). {sense['entry']}"
        for num, sense in enumerate(senses, start = 1)))
# rows sharing the same entry are folded into one: the first phone value is
# kept and the definition strings are joined with a "--" divider
alar = (alar.groupby("entry")
            .agg({"phone": "first", "defs": "<br>--<br>".join})
            .reset_index())

In [None]:
# attach the dictionary data: inner join of the word text against the entry
words = words.merge(alar, how = "inner", left_on = "word", right_on = "entry")
# discard the join key and the id columns, then crop to the first 2500 rows
words = words.drop(columns = ["entry", "wid", "sid"]).iloc[:2500]

In [None]:
# handy way to produce a random ID for genanki (a value in [2**30, 2**31))
randrange(2 ** 30, 2 ** 31)

In [None]:
# html template for the answer side of the anki card
backstr = """
{{Expression}}

<hr id=answer>

{{Definition}}<br>
{{Sentence}}<br>
{{Phonetic}}<br>
"""
# note model: six fields and a single card template whose front shows the
# expression and whose back renders `backstr`
model = ga.Model(
    2144829245,
    "Sentence pitch model",
    fields = [{"name": label} for label in ("Expression",
                                            "Frequency",
                                            "Sentence Position",
                                            "Sentence",
                                            "Phonetic",
                                            "Definition")],
    templates = [{"name": "kannada vocab",
                  "qfmt": "{{Expression}}",
                  "afmt": backstr}])

In [None]:
# For each sentence, wrap the word of interest in bold tags
def transform(data):
    """Highlight the note's word inside its example sentence.

    data -- list of strings in which ``data[0]`` is the word and ``data[3]``
            is the sentence; the other items are not touched.

    The first occurrence of the word in the sentence is wrapped in
    ``<b class = 'word'>...</b>``. The list is modified in place and also
    returned. When the word does not occur literally in the sentence, the
    sentence is left as it is.
    """
    word = data[0]
    sent = data[3]
    # `find` reports a miss as -1; `index` would raise ValueError here and
    # take the whole deck build down over a single unmatched row.
    idx = sent.find(word)
    if idx != -1:
        data[3] = f"{sent[:idx]}<b class = 'word'>{word}</b>{sent[idx + len(word):]}"
    return data

In [None]:
deck = ga.Deck(2138326564, 'Kannada Wikipedia 2.5k vocabulary')
# Walk the rows by position. Looking rows up with `words.loc[i]` for
# i in range(len(words)) is a label lookup, which only works while the index
# happens to be exactly 0..n-1 and builds a Series for every row.
for row in words.itertuples(index = False, name = None):
    # every field is handed to genanki as a string, with the sentence highlighted
    data = transform([str(value) for value in row])
    note = ga.Note(model = model, fields = data)
    deck.add_note(note)

In [None]:
# wrap the finished deck in a package
pkg = ga.Package(deck)
# write it out as an .apkg file (path is relative to the working directory)
pkg.write_to_file("output.apkg")