 # 📊 EDA for JPDB + Anki Vocabulary Project

 This notebook explores, cleans, and prepares:

 - `jpdb_novel_data.csv` (light novel dataset)

 - `anki_core.xlsx` and `anki_mining.xlsx` (Anki vocabulary decks)

 for further analysis and database population.

In [1]:
import pandas as pd
import time, random, datetime, math, json, sqlite3, os
from collections import defaultdict
from jpdb_functions import *
from sqlalchemy import create_engine, Column, Integer, String, MetaData, Boolean, Float, select, insert, update, delete, and_, or_, func, Table, Text, ForeignKey, text, delete, DateTime
from sklearn.decomposition import NMF, PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import numpy as np

# Read the `SID` environment variable; os.getenv returns None when it is unset.
# NOTE(review): presumably a session id consumed by the jpdb_functions helpers —
# confirm, and consider failing early if it is missing.
SID = os.getenv("SID")


 ## Load JPDB Novel Data

In [2]:
# Load the JPDB novel export and trim stray whitespace from the header names
# so later column lookups do not depend on how the CSV was written.
df = pd.read_csv("jpdb_novel_data.csv").rename(columns=str.strip)

# Structural overview, in order: sample rows, shape, column names,
# summary statistics, and per-column dtypes.
for overview in (df.head(), df.shape, df.columns, df.describe(), df.dtypes):
    print(overview)


   Unnamed: 0  novel_id                                              Title  \
0           0      5706          Koguma no Kuuku Monogatari: Haru to Natsu   
1           1      5765         Koguma no Kuuku Monogatari: Birthday Party   
2           2      5768  Koguma no Kuuku Monogatari: Hajimete no Umi to...   
3           3      5705            Koguma no Kuuku Monogatari: Aki to Fuyu   
4           4      7701                                   Beruna no Shippo   

   Length (in words)  Unique words  Unique words (used once)  \
0               7055          1380                       669   
1               7072          1318                       671   
2               6400          1313                       666   
3               6799          1417                       695   
4              28809          3476                      1552   

  Unique words (used once %)  Unique kanji  Unique kanji (used once)  \
0                        48%           195                        48   
1 

 ## Drop Irrelevant Columns

In [3]:
# Remove the columns this notebook treats as irrelevant: the two unnamed
# columns produced by the CSV export and 'Known unique words'.
#
# The drop is written as a reassignment with errors="ignore" so the cell can be
# re-run safely: an in-place drop raises KeyError on the second execution
# because the columns are already gone.
columns_to_drop = ['Unnamed: 0', 'Unnamed: 15', 'Known unique words']
df = df.drop(columns=columns_to_drop, errors="ignore")
print(df.head())


   novel_id                                              Title  \
0      5706          Koguma no Kuuku Monogatari: Haru to Natsu   
1      5765         Koguma no Kuuku Monogatari: Birthday Party   
2      5768  Koguma no Kuuku Monogatari: Hajimete no Umi to...   
3      5705            Koguma no Kuuku Monogatari: Aki to Fuyu   
4      7701                                   Beruna no Shippo   

   Length (in words)  Unique words  Unique words (used once)  \
0               7055          1380                       669   
1               7072          1318                       671   
2               6400          1313                       666   
3               6799          1417                       695   
4              28809          3476                      1552   

  Unique words (used once %)  Unique kanji  Unique kanji (used once)  \
0                        48%           195                        48   
1                        50%           207                        46   
2 

 ## Check for Duplicates / NA

In [4]:
# Number of distinct values per column: a count equal to the row count means
# the column has no repeated values.
distinct_counts = df.nunique()
# True for every column that contains at least one missing value.
has_missing = df.isna().any()

print(distinct_counts)
print(has_missing)


novel_id                             1475
Title                                1475
Length (in words)                    1465
Unique words                         1425
Unique words (used once)             1303
Unique words (used once %)             43
Unique kanji                          991
Unique kanji (used once)              247
Unique kanji readings                1213
Average difficulty                     92
Peak difficulty (90th percentile)      93
Average sentence length               222
Characters                           1473
Volumes                                30
Blacklisted unique words              580
dtype: int64
novel_id                             False
Title                                False
Length (in words)                    False
Unique words                         False
Unique words (used once)             False
Unique words (used once %)           False
Unique kanji                         False
Unique kanji (used once)             False
Unique kanji 

 ## Load Anki Core Deck

In [5]:
# Read the exported core deck, then show its last rows and its column names
# to see which fields need cleaning.
anki_core_df = pd.read_excel("anki_core.xlsx", engine='openpyxl')
for preview in (anki_core_df.tail(), anki_core_df.columns):
    print(preview)


     Sort Field    Card        Due                Deck Percent Correct  \
1967         本当  Card 1 2028-03-11  Core2.3k Version 3            100%   
1968        愛する  Card 1 2028-04-01  Core2.3k Version 3            100%   
1969         我慢  Card 1 2028-06-12  Core2.3k Version 3            100%   
1970        おまけ  Card 1 2028-06-16  Core2.3k Version 3            100%   
1971        難しい  Card 1 2028-08-22  Core2.3k Version 3            100%   

      Again Count Reading  Reviews  WordReadingHiragana  
1967          NaN    ほんとう        6                  NaN  
1968          NaN    あいする        8                  NaN  
1969          NaN     がまん        9                  NaN  
1970          1.0     おまけ        9                  NaN  
1971          NaN   むずかしい        8                  NaN  
Index(['Sort Field', 'Card', 'Due', 'Deck', 'Percent Correct', 'Again Count',
       'Reading', 'Reviews', 'WordReadingHiragana'],
      dtype='object')


 ## Clean Anki Core Deck

In [6]:
# Drop the 'WordReadingHiragana' column from the core deck and rename
# 'Sort Field' to 'Vocab' so it lines up with the mining deck.
#
# Written as a chained reassignment with errors="ignore" so the cell is
# idempotent: an in-place drop raises KeyError when the cell is run twice.
# rename() already skips labels that are no longer present.
anki_core_df = (
    anki_core_df
    .drop(columns=["WordReadingHiragana"], errors="ignore")
    .rename(columns={"Sort Field": "Vocab"})
)
print(anki_core_df.tail())


     Vocab    Card        Due                Deck Percent Correct  \
1967    本当  Card 1 2028-03-11  Core2.3k Version 3            100%   
1968   愛する  Card 1 2028-04-01  Core2.3k Version 3            100%   
1969    我慢  Card 1 2028-06-12  Core2.3k Version 3            100%   
1970   おまけ  Card 1 2028-06-16  Core2.3k Version 3            100%   
1971   難しい  Card 1 2028-08-22  Core2.3k Version 3            100%   

      Again Count Reading  Reviews  
1967          NaN    ほんとう        6  
1968          NaN    あいする        8  
1969          NaN     がまん        9  
1970          1.0     おまけ        9  
1971          NaN   むずかしい        8  


 ## Load Anki Mining Deck

In [7]:
# Read the mining deck and align its schema with the core deck:
#   * the export's own 'Reading' column is discarded,
#   * 'WordReadingHiragana' takes its place under the name 'Reading',
#   * 'Sort Field' becomes 'Vocab'.
anki_mining_df = (
    pd.read_excel("anki_mining.xlsx", engine='openpyxl')
    .drop(columns=["Reading"])
    .rename(columns={"WordReadingHiragana": "Reading", "Sort Field": "Vocab"})
)
print(anki_mining_df.tail())


           Vocab         Card                  Due    Deck Percent Correct  \
2541         未完成  Mining Card  2027-02-09 00:00:00  Mining            100%   
2542          恩人  Mining Card  2027-03-04 00:00:00  Mining            100%   
2543          内緒  Mining Card  2027-03-13 00:00:00  Mining            100%   
2544          権利  Mining Card  2027-03-20 00:00:00  Mining            100%   
2545  偽者 example  Mining Card  2027-07-15 00:00:00  Mining            100%   

      Again Count  Reviews Reading  
2541          NaN        6   みかんせい  
2542          NaN        6    おんじん  
2543          NaN        9    ないしょ  
2544          1.0       10     けんり  
2545         22.0       33    にせもの  


 ## Concatenate Anki Decks

In [8]:
# Stack both decks into one frame with a fresh 0..n-1 index.
deck_frames = (anki_core_df, anki_mining_df)
anki_df = pd.concat(deck_frames, ignore_index=True)
print(anki_df.head())
print(len(anki_df))

# Trim surrounding whitespace from the two text keys used for de-duplication.
for text_col in ('Vocab', 'Reading'):
    anki_df[text_col] = anki_df[text_col].str.strip()


  Vocab    Card                  Due                Deck Percent Correct  \
0   起こる  Card 1  2025-06-18 00:00:00  Core2.3k Version 3            100%   
1    重要  Card 1  2025-06-18 00:00:00  Core2.3k Version 3             93%   
2    至る  Card 1  2025-06-18 00:00:00  Core2.3k Version 3            100%   
3    対応  Card 1  2025-06-18 00:00:00  Core2.3k Version 3             88%   
4   何より  Card 1  2025-06-18 00:00:00  Core2.3k Version 3            100%   

   Again Count Reading  Reviews  
0          1.0     おこる       12  
1          2.0   じゅうよう       15  
2          1.0     いたる       12  
3          5.0    たいおう       32  
4          NaN    なにより        9  
4518


 ## Check & Drop Duplicate Vocab Entries

In [9]:
# Flag every repeat of a (Vocab, Reading) pair. With pandas' default
# keep='first', the first occurrence is not flagged and therefore survives.
anki_duplicates = anki_df.duplicated(subset=['Vocab', "Reading"])
print(anki_df[anki_duplicates])
print(f"Number of duplicates: {len(anki_df[anki_duplicates])}")

print(f"Initial length before dropping duplicates: {len(anki_df)}")
# .copy() makes the result an independent frame, so adding or overwriting
# columns on it later does not raise SettingWithCopyWarning.
anki_df = anki_df[~anki_duplicates].copy()
# This message is printed after the drop, so it reports the length *after*.
print(f"Final length after dropping duplicates: {len(anki_df)}")


     Vocab         Card                  Due                Deck  \
853    うまい       Card 1  2025-12-07 00:00:00  Core2.3k Version 3   
1296   起こす       Card 1  2026-05-30 00:00:00  Core2.3k Version 3   
1733   起きる       Card 1  2027-02-10 00:00:00  Core2.3k Version 3   
1768    よく       Card 1  2027-03-08 00:00:00  Core2.3k Version 3   
1779    やる       Card 1  2027-03-17 00:00:00  Core2.3k Version 3   
1878    もう       Card 1  2027-08-06 00:00:00  Core2.3k Version 3   
1937    ただ       Card 1  2027-12-19 00:00:00  Core2.3k Version 3   
2276     顎  Mining Card          New #⁨2498⁩              Mining   
2377    空間  Mining Card          New #⁨2604⁩              Mining   
2397    失う  Mining Card          New #⁨2624⁩              Mining   
2566    抜く  Mining Card  2025-06-20 00:00:00              Mining   
2649    注目  Mining Card  2025-06-21 00:00:00              Mining   
2683    理屈  Mining Card  2025-06-22 00:00:00              Mining   
2825    最終  Mining Card  2025-06-25 00:00:00    

 ## Check Data Types

In [10]:
# Show the dtype of each column to see which fields still need conversion.
column_dtypes = anki_df.dtypes
print(column_dtypes)


Vocab               object
Card                object
Due                 object
Deck                object
Percent Correct     object
Again Count        float64
Reading             object
Reviews              int64
dtype: object


 ## Filter and Clean Columns

In [11]:
# Unreviewed cards carry a 'Due' value containing the text 'New' instead of a
# date; keep only cards that have been reviewed.
is_unreviewed = anki_df['Due'].astype(str).str.contains('New')
# .copy() detaches the filtered rows from the original frame so the column
# assignment below does not trigger SettingWithCopyWarning.
anki_df = anki_df.loc[~is_unreviewed].copy()

# 'Blacklisted' is True for every row that comes from the core deck.
# NOTE(review): presumably mirrors JPDB's "blacklisted" word status — confirm.
anki_df['Blacklisted'] = (anki_df['Deck'] == 'Core2.3k Version 3')

# Reassignment + errors='ignore' keeps the cell re-runnable; an in-place drop
# raises KeyError once 'Card' has already been removed.
anki_df = anki_df.drop(columns=['Card'], errors='ignore')


 ## Handle NaNs

In [12]:
# List the columns that contain at least one missing value.
nan_cols = [i for i in anki_df.columns if anki_df[i].isna().any()]
print(nan_cols)

# Only 'Again Count' is reported as having NaN, and there a missing value
# means the card was never answered "Again", i.e. zero.
# The fill is restricted to that column: a frame-wide fillna(0) would also
# write the integer 0 into text or date columns should they ever contain NaN.
anki_df = anki_df.fillna({'Again Count': 0})
anki_df['Again Count'] = anki_df['Again Count'].astype(int)


['Again Count']


 ## Clean Datatypes

In [13]:
# 'Percent Correct' arrives as text such as "93%": strip the percent sign and
# store the number as a float.
percent_text = anki_df['Percent Correct'].astype(str).str.strip('%')
anki_df['Percent Correct'] = percent_text.astype(float)

# Parse 'Due' into datetimes; values that cannot be parsed become NaT.
anki_df['Due'] = pd.to_datetime(anki_df['Due'], errors='coerce')


 ## Enrich with vid, sid, frequency rank

In [14]:
# Call `enrich_vocab` once per row (axis=1) and rebuild the frame from the
# rows it returns. `enrich_vocab` is not defined in this notebook; it comes
# from the star import of `jpdb_functions`.
# Judging by the section heading and the columns present after the CSV reload,
# it adds `vid`, `sid` and `frequency rank` — TODO confirm against jpdb_functions.
# NOTE(review): if the helper performs a lookup per row this cell may be slow;
# verify before re-running the whole notebook.
anki_df = anki_df.apply(enrich_vocab, axis=1)


 ## Save Cleaned Anki DF

In [15]:
# Write the cleaned frame next to the notebook (current working directory).
# The index is written as well (pandas default), which is why the reload step
# sees an extra 'Unnamed: 0' column.
cleaned_csv_path = os.path.join(os.getcwd(), 'anki_df.csv')
anki_df.to_csv(cleaned_csv_path)


 ## Reload to Confirm

In [16]:
# Read the saved file back to confirm the round trip, showing the frame before
# and after removing the index column that to_csv wrote as 'Unnamed: 0'.
reloaded = pd.read_csv("anki_df.csv")
print(reloaded.head())
anki_df = reloaded.drop('Unnamed: 0', axis=1)
print(anki_df.head())


   Unnamed: 0 Vocab         Due                Deck  Percent Correct  \
0           0   起こる  2025-06-18  Core2.3k Version 3            100.0   
1           1    重要  2025-06-18  Core2.3k Version 3             93.0   
2           2    至る  2025-06-18  Core2.3k Version 3            100.0   
3           3    対応  2025-06-18  Core2.3k Version 3             88.0   
4           4   何より  2025-06-18  Core2.3k Version 3            100.0   

   Again Count Reading  Reviews  Blacklisted      vid         sid  \
0            1     おこる       12         True  1223680  2146305150   
1            2   じゅうよう       15         True  1336820  3870032381   
2            1     いたる       12         True  1311870  1536129181   
3            5    たいおう       32         True  1409840  3337990295   
4            0    なにより        9         True  1188530  3589014081   

   frequency rank  
0           800.0  
1          1200.0  
2          1300.0  
3          1300.0  
4          1100.0  
  Vocab         Due             

 ## Handle NaNs in Frequency Rank

In [17]:
# Missing-value count per column.
print(anki_df.isnull().sum())

# Show every row that has at least one missing value. max_rows=None lifts the
# display truncation for this block only, so the listing stays complete even if
# the number of affected rows changes (it was hard-coded to 71 before).
rows_with_missing = anki_df[anki_df.isna().any(axis=1)]
with pd.option_context("display.max_rows", None):
    print(rows_with_missing)

# Exclude rows without a frequency rank: they are a small share of the data
# (71 out of 4000+ rows when this was written).
# .copy() keeps the column assignment below from raising SettingWithCopyWarning.
anki_df = anki_df.dropna(subset=['frequency rank']).copy()
print(anki_df.isnull().sum())

# The CSV round trip turns 'Due' back into text, so parse it again;
# unparseable values become NaT.
anki_df['Due'] = pd.to_datetime(anki_df['Due'], errors='coerce')
print(anki_df)


Vocab               0
Due                 0
Deck                0
Percent Correct     0
Again Count         0
Reading             0
Reviews             0
Blacklisted         0
vid                 0
sid                 0
frequency rank     71
dtype: int64
         Vocab         Due                Deck  Percent Correct  Again Count  \
190        生き方  2025-07-14  Core2.3k Version 3            100.0            1   
516         因る  2025-09-08  Core2.3k Version 3             89.0            4   
594       目茶苦茶  2025-09-25  Core2.3k Version 3            100.0            1   
1572        見方  2026-10-22  Core2.3k Version 3            100.0            0   
1728        何て  2027-02-07  Core2.3k Version 3            100.0            0   
1969        清聴  2025-06-18              Mining             87.0            7   
1990        冥福  2025-06-18              Mining             87.0            3   
1996    天秤に掛ける  2025-06-18              Mining            100.0            1   
1999       虫干し  2025-06-1

 ## Drop Duplicate vids

In [None]:
# Keep only the first row for each `vid`; later rows with the same `vid`
# are discarded.
repeated_vid = anki_df.duplicated(subset=['vid'])
anki_df = anki_df[~repeated_vid]