## Code to generate the 3000 most commonly used kanji

Visit [danyelkoca.com/en/blog/kanji](https://www.danyelkoca.com/en/blog/kanji) for a blog post covering this code.


In [1]:
# Import needed libraries
import pandas as pd
import json

In [2]:
# Load the VDRJ workbook, reading only the sheet that holds the word list
vdrj_workbook = "data/VDRJ_Ver1_1_Research_Top60894.xlsx"
vdrj_sheet = "重要度順語彙リスト60894語"
orig_df = pd.read_excel(vdrj_workbook, sheet_name=vdrj_sheet)

In [3]:
def kanji_detector(word):
    """Tell whether `word` is a Japanese word that contains kanji.

    Returns True only when at least one character is a kanji AND every
    character is either a kanji or a kana. Any other character (Latin
    letters, digits, punctuation, ...) makes the result False, as does an
    empty string.

    Character ranges used:
      * kanji: U+4E00-U+9FAF, U+3400-U+4DBF, U+F900-U+FAFF
      * kana:  U+3040-U+30FF (hiragana and katakana)
    """

    def is_kanji(ch):
        return (
            "\u4e00" <= ch <= "\u9faf"
            or "\u3400" <= ch <= "\u4dbf"
            or "\uf900" <= ch <= "\ufaff"
        )

    def is_kana(ch):
        return "\u3040" <= ch <= "\u30ff"

    found_kanji = False
    all_japanese = True
    for ch in word:
        if is_kanji(ch):
            found_kanji = True
        elif not is_kana(ch):
            # Neither kanji nor kana: the word is disqualified
            all_japanese = False

    return found_kanji and all_japanese

In [4]:
# Get the relevant column
df = (
    orig_df[["見出し語彙素\nLexeme"]]
    .astype(str)
    .rename(columns={"見出し語彙素\nLexeme": "word"})
)

# Keep words that are at least 2 characters long and pass kanji_detector.
# The detector result is used directly as a boolean mask: assigning it as a
# helper column on the already-filtered frame triggers pandas'
# SettingWithCopyWarning and the column was dropped again right away.
df = df[df["word"].str.len() > 1]
df = df[df["word"].apply(kanji_detector)].reset_index(drop=True)

In [5]:
# Loop through the data to calculate the prevalence score of each kanji
kanjis = {}
# Words already recorded per kanji. A set keeps the duplicate check O(1)
# instead of rebuilding a list of every recorded word for each character.
seen_words = {}
for index, word in enumerate(df["word"].to_list()):
    for char in word:
        # NOTE(review): kanji_detector also treats U+F900-U+FAFF as kanji, but
        # characters in that range are not counted here - confirm this is intended.
        if "\u4e00" <= char <= "\u9faf" or "\u3400" <= char <= "\u4dbf":
            if char not in kanjis:
                kanjis[char] = {"count": 0, "score": 0.0, "words": []}
                seen_words[char] = set()
            entry = kanjis[char]
            # Every occurrence counts, including repeats within one word
            entry["count"] += 1
            # Each occurrence adds 1 / (1-based position of the word in the list),
            # so words nearer the top of the list contribute more to the score
            entry["score"] += 1 / (index + 1)
            if word not in seen_words[char]:
                seen_words[char].add(word)
                ## Importance is the word's position in the list, lower the better
                entry["words"].append({"word": word, "importance": index})

# Sort kanjis by prevalence score, highest first
kanjis_list = sorted(
    [{"kanji": i, **kanjis[i]} for i in kanjis], key=lambda x: x["score"], reverse=True
)

## Keep kanjis only if they are used in more than 3 distinct words
kanjis_list = [i for i in kanjis_list if len(i["words"]) > 3]

In [6]:
# Kanji that score highly but are ambiguous and not really in common use
unwanted_kanjis = ["其", "御", "為", "此", "鱻", "詞", "掛", "遣", "又"]

# Drop the entries for those kanji
kanjis_list = [entry for entry in kanjis_list if entry["kanji"] not in unwanted_kanjis]

# From every remaining kanji, also drop the example words that contain "鱻"
for entry in kanjis_list:
    kept_words = []
    for item in entry["words"]:
        if "鱻" in item["word"]:
            continue
        kept_words.append(item)
    entry["words"] = kept_words

In [7]:
# Preview the first 100 entries as space-separated "position.kanji" pairs
for rank, entry in enumerate(kanjis_list[:100]):
    print("{}.{}".format(rank, entry["kanji"]), end=" ")

0.居 1.有 2.言 3.十 4.無 5.出 6.来 7.成 8.分 9.良 10.見 11.百 12.行 13.名 14.大 15.何 16.一 17.代 18.二 19.入 20.人 21.方 22.処 23.下 24.自 25.私 26.三 27.上 28.千 29.時 30.五 31.子 32.日 33.取 34.四 35.生 36.間 37.物 38.聞 39.手 40.知 41.持 42.合 43.事 44.同 45.立 46.八 47.六 48.本 49.気 50.当 51.学 52.付 53.会 54.七 55.切 56.使 57.書 58.今 59.作 60.置 61.多 62.教 63.年 64.然 65.直 66.貴 67.話 68.九 69.地 70.食 71.中 72.部 73.前 74.少 75.長 76.先 77.味 78.小 79.意 80.国 81.問 82.引 83.電 84.月 85.明 86.未 87.高 88.仕 89.理 90.違 91.後 92.開 93.家 94.着 95.結 96.外 97.込 98.通 99.定 

In [8]:
# Save the top 3000 kanjis
# ensure_ascii=False writes the kanji as raw characters, so the file encoding
# is pinned to UTF-8 instead of being left to the platform default, which
# cannot encode kanji on some locales (e.g. cp1252 on Windows).
with open("danyel_koca_most_common_3000_kanjis.json", "w", encoding="utf-8") as f:
    json.dump(kanjis_list[:3000], f, ensure_ascii=False)