In [219]:
import pandas as pd
import json
import genanki as ga
from random import randrange

In [172]:
# Name of the corpus; used both as the directory name and as the file-name prefix.
corpus = "kan_wikipedia_2021_300K"


def read_csv(f, names):
    """Load one tab-separated, header-less file of the corpus into a DataFrame.

    Parameters
    ----------
    f : str
        File-name suffix; the file read is ``{corpus}/{corpus}-{f}``.
    names : list of str
        Column names to assign, in file order.

    Returns
    -------
    pandas.DataFrame
    """
    path = f"{corpus}/{corpus}-{f}"
    return pd.read_csv(
        path,
        sep="\t",
        names=names,
        quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as literal text
    )

In [173]:
# Load the word list, keep rows whose word id exceeds 100, then take the first 10000 of those.
# NOTE(review): presumably the low ids are punctuation / very common tokens — confirm against the corpus.
words = (
    read_csv("words.txt", ["wid", "word", "freq"])
    .loc[lambda frame: frame["wid"] > 100]
    .head(10000)
)

In [174]:
# Sentence table, indexed by sentence id so sentences can be looked up with .loc.
sents = read_csv("sentences.txt", names=["sid", "sent"]).set_index("sid")

In [175]:
# Table linking word ids to sentence ids (plus a third "spos" column).
# NOTE(review): "spos" is presumably the word's position inside the sentence — confirm.
sent_inv = read_csv(
    "inv_w.txt",
    names=["wid", "sid", "spos"],
)

In [177]:
# Attach a sentence id to every word, keeping only the first matching row per word.
words = words.merge(sent_inv, on="wid").drop_duplicates(subset="wid")

# Look up the sentence text for each chosen sentence id. The lookup yields one
# row per word, so a sentence id shared by several words appears several times
# and the merge can emit repeated rows; the second drop_duplicates removes them.
sents_for_words = sents.loc[words["sid"]]
words = words.merge(sents_for_words, on="sid").drop_duplicates(subset="wid")

In [181]:
def _format_defs(defs):
    """Render one entry's definitions as numbered lines joined by ``<br>``."""
    return "<br>".join(
        f"{number} ({item['type']}). {item['entry']}"
        for number, item in enumerate(defs, start=1)
    )


# Load the dictionary, discard unused columns, turn each definition list into
# HTML text, then collapse rows sharing the same headword: the first "phone"
# value is kept and the definition texts are joined with a "--" separator line.
alar = (
    pd.read_json("alar.json")
    .drop(columns=["id", "head", "origin", "info"])
    .assign(defs=lambda frame: frame["defs"].apply(_format_defs))
    .groupby("entry")
    .agg({"phone": "first", "defs": "<br>--<br>".join})
    .reset_index()
)

In [183]:
# Keep only words that have a dictionary entry, drop the join/id columns that
# are not exported, and cap the result at the first 2500 rows.
# NOTE(review): the remaining column order is presumably meant to line up with
# the note model's field order — verify.
words = (
    words
    .merge(alar, left_on="word", right_on="entry", how="inner")
    .drop(columns=["entry", "wid", "sid"])
    .iloc[:2500]
)

In [220]:
# Scratch cell: draws a random integer n with 1 << 30 <= n < 1 << 31.
# The value changes on every run and is not stored anywhere.
# NOTE(review): presumably this is how the hard-coded ids passed to ga.Model
# and ga.Deck were generated — confirm.
randrange(1<<30,1<<31)

1519689782

In [188]:
# Answer-side template: the word, a divider, then definition, sentence and
# phonetic form, each followed by a line break.
backstr = """
{{Expression}}

<hr id=answer>

{{Definition}}<br>
{{Sentence}}<br>
{{Phonetic}}<br>
"""

# Note model with six fields and a single card template. Notes are created
# from a plain list of values, so the field order here is the order in which
# those values are interpreted.
model = ga.Model(
    2144829245,
    "Sentence pitch model",
    fields=[
        {"name": field_name}
        for field_name in (
            "Expression",
            "Frequency",
            "Sentence Position",
            "Sentence",
            "Phonetic",
            "Definition",
        )
    ],
    templates=[
        {
            "name": "kannada vocab",
            "qfmt": "{{Expression}}",
            "afmt": backstr,
        },
    ],
)

In [213]:
def transform(data):
    """Highlight the word inside its example sentence.

    The first occurrence of ``data[0]`` within ``data[3]`` is wrapped in
    ``<b class = 'word'>…</b>``. If the word is empty or does not occur in the
    sentence, the sentence is left as it is instead of raising.

    Parameters
    ----------
    data : list of str
        Field values for one note; index 0 holds the word and index 3 the
        sentence. The list is modified in place.

    Returns
    -------
    list of str
        The same list object that was passed in.
    """
    word = data[0]
    sent = data[3]
    # str.find returns -1 where str.index would raise ValueError; a single
    # unmatched word must not abort the whole deck build. An empty word is
    # skipped too, since it would only insert an empty <b> tag.
    idx = sent.find(word) if word else -1
    if idx < 0:
        return data
    data[3] = f"{sent[:idx]}<b class = 'word'>{word}</b>{sent[idx + len(word):]}"
    return data

In [214]:
deck = ga.Deck(2138326564, 'Kannada Wikipedia 2.5k vocabulary')
# Walk the rows positionally. Looking rows up by label with words.loc[i] only
# works when the index happens to be exactly 0..n-1 and does one lookup per
# row; itertuples yields each row's values in column order regardless of the
# index.
for row in words.itertuples(index=False, name=None):
    # Every field value is passed to the note as a string.
    data = transform([str(value) for value in row])
    note = ga.Note(model=model, fields=data)
    deck.add_note(note)

In [215]:
# Wrap the deck in a package and write it to 'output.apkg'. The path is
# relative, so the file is created in the current working directory.
pkg = ga.Package(deck)
pkg.write_to_file('output.apkg')