References:
- https://github.com/polm/fugashi#installing-a-dictionary
- https://github.com/kerrickstaley/genanki
- https://github.com/PokiDokika/jisho-py

Subtitle Sites:
https://kitsunekko.net

In [78]:
import fugashi
from fugashi import Tagger
import os
from collections import Counter
import random

# fugashi tokenizer used below to split subtitle lines into words.
tagger = Tagger()

# Working folders: subtitle inputs, generated Anki decks, and word lists to skip.
sub_dir = 'Subtitles'
deck_dir = 'Anki_Decks'
ignore_dir = 'Ignore_Lists'
for dir_name in (sub_dir, deck_dir, ignore_dir):
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(dir_name, exist_ok=True)

# Only the subtitle file with this name is tokenized.
use_filename = 'Naruto_Shippuuden_391.srt'
# When True, each token is stored as "surface (KANA)" if a kana reading exists.
INCLUDE_KANA = True

In [79]:
# Collect every line of every file in ignore_dir; these words are filtered out later.
ignore = []
for entry in os.listdir(ignore_dir):
    ignore_path = os.path.join(ignore_dir, entry)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints); open() would fail on them.
    if not os.path.isfile(ignore_path):
        continue
    # NOTE(review): platform default encoding is kept so the file written by the
    # export cell (which also uses the default) is read back the same way.
    with open(ignore_path, 'r') as file:
        ignore.extend(line.replace('\n', '') for line in file)

In [82]:
# Subtitle files are recognised by extension (compared case-insensitively).
sub_extensions = ['txt', 'srt']
sub_files = [
    f for f in os.listdir(sub_dir)
    # splitext() yields '' for names without an extension, so a file literally
    # named "srt" is no longer mistaken for a subtitle file.
    if os.path.splitext(f)[1].lstrip('.').lower() in sub_extensions
]
print(sub_files)

if not sub_files:
    # Fail with an actionable message instead of a bare IndexError below.
    raise FileNotFoundError(
        f'No subtitle files with extensions {sub_extensions} found in {sub_dir!r}'
    )

sub_file = sub_files[0]
print(sub_file)

['Naruto_Shippuuden_394.srt', 'Naruto_Shippuuden_393.srt', 'Naruto_Shippuuden_392.srt', 'Naruto_Shippuuden_387.srt', 'Naruto_Shippuuden_397.srt', 'Naruto_Shippuuden_391.srt', 'Naruto_Shippuuden_396.srt', 'Naruto_Shippuuden_400.srt', 'Naruto_Shippuuden_399.srt', 'Naruto_Shippuuden_383.srt', 'Naruto_Shippuuden_395.srt', 'Naruto_Shippuuden_386.srt', 'Naruto_Shippuuden_398.srt']
Naruto_Shippuuden_394.srt


In [83]:
# Tokenize the selected subtitle file and collect every token, optionally
# suffixed with its kana reading as "surface (KANA)".
all_words = []

for sub_file in sub_files:
    # Everything except the file named by use_filename is skipped.
    if sub_file != use_filename:
        continue
    with open(os.path.join(sub_dir, sub_file), 'r') as subtitle:
        for raw_line in subtitle:
            for token in tagger(raw_line):
                if INCLUDE_KANA and token.feature.kana:
                    all_words.append(token.surface + ' (' + token.feature.kana + ')')
                else:
                    all_words.append(token.surface)

print(len(all_words))

5438


In [84]:
# Drop ignored words and purely numeric tokens. Both checks look at the surface
# form (the part before any " (KANA)" suffix), so numbers are still removed
# when a kana reading has been appended.
ignored_words = set(ignore)  # set gives O(1) membership tests
filtered = []
for w in all_words:
    parts = w.split()
    if not parts:
        # Whitespace-only token: nothing to study, and parts[0] would raise.
        continue
    surface = parts[0]
    if surface in ignored_words or surface.isdigit():
        continue
    filtered.append(w)
print(len(filtered))

1030


In [85]:
# Tally how many times each remaining token string occurs.
word_counts = Counter(filtered)

In [86]:
# Notebook display: (token, count) pairs ordered from most to least frequent.
word_counts.most_common()

[('俺 (オレ)', 29),
 ('お前 (オマエ)', 20),
 ('こと (コト)', 17),
 ('ない (ナイ)', 13),
 ('する (スル)', 13),
 ('い (イ)', 11),
 ('って (ッテ)', 10),
 ('から (カラ)', 10),
 ('この (コノ)', 9),
 ('そう (ソウ)', 9),
 ('火影 (ホカゲ)', 8),
 ('同じ (オナジ)', 8),
 ('ねえ (ネエ)', 8),
 ('ﾘﾝ', 8),
 ('ﾏﾀﾞﾗ', 8),
 ('いる (イル)', 7),
 ('だっ (ダッ)', 7),
 ('世界 (セカイ)', 7),
 ('力 (チカラ)', 7),
 ('誰 (ダレ)', 6),
 ('それ (ソレ)', 6),
 ('ﾔﾂ', 6),
 ('術 (ジュツ)', 6),
 ('これ (コレ)', 6),
 ('いい (イイ)', 6),
 ('まで (マデ)', 6),
 ('いう (イウ)', 5),
 ('ﾅﾙﾄ', 5),
 ('本当 (ホントウ)', 5),
 ('うち (ウチ)', 5),
 ('なっ (ナッ)', 5),
 ('もう (モウ)', 5),
 ('道 (ミチ)', 5),
 ('なら (ナラ)', 5),
 ('何 (ナン)', 5),
 ('よう (ヨウ)', 5),
 ('ば (バ)', 5),
 ('れ (レ)', 5),
 ('かつて (カツテ)', 5),
 ('今 (イマ)', 5),
 ('こちら (コチラ)', 5),
 ('もの (モノ)', 5),
 ('言っ (イッ)', 4),
 ('たかっ (タカッ)', 4),
 ('こそ (コソ)', 4),
 ('いや (イヤ)', 4),
 ('わかっ (ワカッ)', 4),
 ('仲間 (ナカマ)', 4),
 ('人 (ヒト)', 4),
 ('あなた (アナタ)', 4),
 ('ね (ネ)', 4),
 ('なかっ (ナカッ)', 4),
 ('尾 (ビ)', 4),
 ('できる (デキル)', 4),
 ('動け (ウゴケ)', 4),
 ('輪廻 (リンネ)', 4),
 ('ｾﾞﾂ', 4),
 ('ﾊｧ', 4),
 ('自分 (ジブン)', 3),
 ('ｶｶｼ'

In [148]:
# Scratch check: any non-empty string is truthy, so the text 'False' evaluates to True.
bool('False')

True

# Create Anki Deck

In [140]:
import genanki
import jisho
from urllib.parse import quote  

# Maximum number of cards added to the deck.
WORD_COUNT = 10
# Line budget used by parse_answer when rendering a card's answer.
LINE_COUNT = 10
# Name the deck after the subtitle file that was actually tokenized. `sub_file`
# is a leftover loop variable that ends up pointing at the last listed file,
# not the one selected by use_filename.
deck_name = os.path.splitext(use_filename)[0]

In [141]:
import zlib

# genanki recommends generating the model ID once and hard-coding it, so that
# repeated exports reuse the same note type instead of creating a new one on
# every import. Change this value if the fields/templates below are changed.
model_id = 1478302619

my_model = genanki.Model(
  model_id,
  'Simple Model',
  fields=[
    {'name': 'Question'},
    {'name': 'Answer'},
  ],
  templates=[
    {
      'name': 'Card 1',
      'qfmt': '{{Question}}',
      'afmt': '{{FrontSide}}<hr id="answer">{{Answer}}',
    },
  ])

# Derive a stable deck ID from the deck name (same [2**30, 2**31) range as
# before) so re-exporting the same subtitle file targets the same deck.
deck_id = (1 << 30) + zlib.crc32(deck_name.encode('utf-8')) % (1 << 30)
deck = genanki.Deck(
  deck_id,
  deck_name)

In [142]:
def parse_answer(answers, line_limit=None):
    """Render dictionary search results as the HTML answer side of a card.

    Each entry of ``answers`` is expected to expose ``.ja`` (a sequence of
    objects with ``.word`` and ``.reading``) and ``.en`` (a sequence of objects
    with ``.meaning``), as accessed below.
    NOTE(review): the exact jisho result schema is not visible here - confirm.

    Output per entry: a numbered header ``"N. word (reading)<br>"`` followed by
    a ``<ul>`` with one ``<li>`` per (English, Japanese) pair. Entries with no
    Japanese forms are skipped. Rendering stops once ``line_limit - 1`` lines
    (headers plus list items) have been emitted.

    Args:
        answers: Sequence of search-result entries.
        line_limit: Line budget; defaults to the module-level ``LINE_COUNT``.

    Returns:
        The HTML string ('' when ``answers`` is empty).
    """
    if line_limit is None:
        line_limit = LINE_COUNT
    max_lines = line_limit - 1

    html_parts = []
    line_cnt = 0
    for number, definition in enumerate(answers, start=1):
        senses_en = definition.en
        forms_jp = definition.ja
        if not forms_jp:
            # No headword to show; indexing forms_jp[0] would raise IndexError.
            continue

        html_parts.append(f'{number}. {forms_jp[0].word} ({forms_jp[0].reading})<br>')
        line_cnt += 1

        html_parts.append('<ul>')
        # zip() stops at the shorter of the two sequences.
        for en, jp in zip(senses_en, forms_jp):
            # NOTE(review): if `reading` is a plain string this join spaces out
            # its characters; kept as-is to preserve the existing card format.
            jp_str = ' '.join(jp.reading)
            en_str = ' '.join(en.meaning)
            html_parts.append(f'<li>({jp_str}) {en_str}</li>')
            line_cnt += 1

            if line_cnt >= max_lines:
                break
        html_parts.append('</ul>')

        if line_cnt >= max_lines:
            break
    return ''.join(html_parts)

In [143]:
# Walk the words from most to least frequent, look each one up, and add a card
# for every word that has a definition, until WORD_COUNT cards were added.
add_cnt = 0
words_added = []
for word, _ in word_counts.most_common():
    word_no_kana = word.split()[0] # Ignore kana. Get just 俺 instead of 俺 (オレ)

    try:
        answers = jisho.search(quote(word_no_kana))
    except Exception:
        # Lookup failures are treated as "no definition"; unlike a bare
        # `except:`, Ctrl-C (KeyboardInterrupt) still interrupts the loop.
        answers = None

    # A non-list or empty result means there is nothing to put on the card.
    if not isinstance(answers, list) or not answers:
        print(f'Could not find definition for {word}. Skipping...')
        continue

    card = genanki.Note(model=my_model, fields=[word, parse_answer(answers)])
    deck.add_note(card)
    # Record the word only once its card is really in the deck, so skipped
    # words do not end up in the exported ignore list.
    words_added.append(word_no_kana)

    add_cnt += 1

    if add_cnt >= WORD_COUNT:
        break

In [145]:
# Export the deck as <deck_dir>/<deck_name>.apkg.
apkg_path = f'{os.path.join(deck_dir, deck_name)}.apkg'
package = genanki.Package(deck)
package.write_to_file(apkg_path)

In [147]:
# Persist the exported words so later runs can skip them.
ignore_file = 'previous_export_words.txt'
# Read and write the same path inside ignore_dir. Checking the bare file name
# looked in the working directory, so earlier exports were never merged and
# the list was overwritten on every run.
ignore_path = os.path.join(ignore_dir, ignore_file)

if os.path.isfile(ignore_path):
    with open(ignore_path, 'r') as file:
        # Strip the newline (it is re-added on write) and drop blank lines.
        words_added += [line.rstrip('\n') for line in file if line.strip()]

# De-duplicate and sort for a stable file.
words_added = sorted(set(words_added))

with open(ignore_path, 'w') as file:
    file.writelines(w + '\n' for w in words_added)

['俺 (オレ)',
 'お前 (オマエ)',
 'こと (コト)',
 'ない (ナイ)',
 'する (スル)',
 'い (イ)',
 'って (ッテ)',
 'から (カラ)',
 'この (コノ)',
 'そう (ソウ)']