In [7]:
import json
import pandas as pd
import os
import re
from lapp.dbms import init_db, inserts, modify, delete, find_by_attr
from lapp.tables import Unit, Vocabulary, GrammarRule


In [8]:
language_name = "zh"

## Add Units

In [9]:
# Initialize the database connection and create db file if it doesn't exist
engine, session = init_db(language_name)

if not os.path.exists(f'../data/{language_name}'):
    raise FileNotFoundError(f"Dataset for {language_name} not found.")

# Initialize the database
with open(f'../data/{language_name}/units.json', 'r', encoding="utf8") as f:
    units_array = json.load(f)
    
units = []
for idx, unit_data in enumerate(units_array):
    units.append(
        Unit(
            unit_id = f"{language_name.upper()}_{idx}",
            title = unit_data['title'],
            description = unit_data['description'],
            level = unit_data['level']
        )
    )
inserts(session, units)
session.close()

2025-07-07 10:44:10,184 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 10:44:10,184 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-07 10:44:10,184 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,185 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-07 10:44:10,186 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,186 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-07 10:44:10,186 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,187 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-07 10:44:10,187 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,188 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-07 10:44:10,188 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,188 INFO sqlalchemy.engine.Engine COMMIT
2025-07-07 10:44:10,195 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2

## Add Vocabulary

In [10]:
# Initialize the database connection and create db file if it doesn't exist
engine, session = init_db(language_name)

directory_path = f'../data/{language_name}/vocabulary'

if not os.path.exists(directory_path):
    raise FileNotFoundError(f"No vocabulary found for {language_name}.")

vocs = []

# List all elements (files and directories) in the specified directory and get their full paths
elements_paths = [os.path.join(directory_path, element) for element in os.listdir(directory_path)]
for voc_file in elements_paths:
    if os.path.isfile(voc_file) and voc_file.endswith('.csv'):
        unit_id = re.sub("[^0-9]", "", os.path.basename(voc_file))
        df = pd.read_csv(voc_file)
        for idx, row in df.iterrows():
            vocs.append( 
                Vocabulary(
                    learn_id=f"{language_name.upper()}_{unit_id}_V{idx}",
                    word = row['word'],
                    translation = row["translation"],
                    phonetic = row["pinyin"],
                    example_sentence = row.get("example_sentence", ""),
                    type = row["type"],
                    parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
                )
            )
inserts(session, vocs)
session.close()

2025-07-07 10:44:10,275 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 10:44:10,275 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-07 10:44:10,276 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,276 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-07 10:44:10,276 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,277 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-07 10:44:10,277 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,278 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-07 10:44:10,278 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,279 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-07 10:44:10,279 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:10,280 INFO sqlalchemy.engine.Engine COMMIT
2025-07-07 10:44:10,292 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2

  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()


2025-07-07 10:44:10,479 INFO sqlalchemy.engine.Engine [cached since 0.1866s ago] ('ZH_12', 1, 0)
2025-07-07 10:44:10,480 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10:44:10,480 INFO sqlalchemy.engine.Engine [cached since 0.188s ago] ('ZH_12', 1, 0)
2025-07-07 10:44:10,482 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10:44:10,482 INFO sqlalchemy.engine.Engine [cached since 0.1892s ago] ('ZH_12', 1, 0)
2025-07-07 10:44:10,484 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 

  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()


2025-07-07 10:44:10,684 INFO sqlalchemy.engine.Engine [cached since 0.3905s ago] ('ZH_2', 1, 0)
2025-07-07 10:44:10,686 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10:44:10,686 INFO sqlalchemy.engine.Engine [cached since 0.3932s ago] ('ZH_2', 1, 0)
2025-07-07 10:44:10,688 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10:44:10,688 INFO sqlalchemy.engine.Engine [cached since 0.3953s ago] ('ZH_2', 1, 0)
2025-07-07 10:44:10,689 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10

  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()


2025-07-07 10:44:10,902 INFO sqlalchemy.engine.Engine [cached since 0.609s ago] ('ZH_6', 1, 0)
2025-07-07 10:44:10,904 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10:44:10,904 INFO sqlalchemy.engine.Engine [cached since 0.6108s ago] ('ZH_6', 1, 0)
2025-07-07 10:44:10,905 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10:44:10,906 INFO sqlalchemy.engine.Engine [cached since 0.6125s ago] ('ZH_6', 1, 0)
2025-07-07 10:44:10,907 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10:

  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
  session.commit()


2025-07-07 10:44:11,136 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
2025-07-07 10:44:11,137 INFO sqlalchemy.engine.Engine [cached since 0.05279s ago] ('ZH_1',)
2025-07-07 10:44:11,137 INFO sqlalchemy.engine.Engine INSERT INTO vocabulary (learn_id, word, translation, phonetic, example_sentence, type, score, last_seen, unit_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
2025-07-07 10:44:11,138 INFO sqlalchemy.engine.Engine [cached since 0.06029s ago] ('ZH_1_V1', '你', 'Tu', 'Nǐ', '', 'Pronoms personnels', 0, '2025-07-07', 'ZH_1')
2025-07-07 10:44:11,139 INFO sqlalchemy.engine.Engine COMMIT
2025-07-07 10:44:11,140 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 10:44:11,142 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHER

## Add Grammar Rules

In [12]:
# Initialize the database connection and create db file if it doesn't exist
engine, session = init_db(language_name)

directory_path = f'../data/{language_name}/grammar'

if not os.path.exists(directory_path):
    raise FileNotFoundError(f"No grammar files found for {language_name}.")

grammars = []

# List all elements (files and directories) in the specified directory and get their full paths
elements_paths = [os.path.join(directory_path, element) for element in os.listdir(directory_path)]
for grammar_file in elements_paths:
    if os.path.isfile(grammar_file) and grammar_file.endswith('.json'):
        unit_id = re.sub("[^0-9]", "", os.path.basename(grammar_file))
        with open(grammar_file, 'r', encoding="utf8") as f:
            grammar_data = json.load(f)
        for idx, row in enumerate(grammar_data):
            grammars.append( 
                GrammarRule(
                    learn_id=f"{language_name.upper()}_{unit_id}_G{idx + 1}",
                    title=row['title'],
                    explanation=row["content"],
                    parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()
                )
            )
inserts(session, grammars)
session.close()

2025-07-07 10:44:32,760 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 10:44:32,760 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-07 10:44:32,762 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:32,763 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-07 10:44:32,763 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:32,764 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-07 10:44:32,764 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:32,764 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-07 10:44:32,765 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:32,765 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-07 10:44:32,766 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 10:44:32,766 INFO sqlalchemy.engine.Engine COMMIT
2025-07-07 10:44:32,768 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2

  parent=session.query(Unit).filter(Unit.unit_id == f"{language_name.upper()}_{unit_id}").first()


2025-07-07 10:44:32,956 INFO sqlalchemy.engine.Engine [cached since 0.1881s ago] ('ZH_7', 1, 0)
2025-07-07 10:44:32,957 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10:44:32,957 INFO sqlalchemy.engine.Engine [cached since 0.1894s ago] ('ZH_7', 1, 0)
2025-07-07 10:44:32,958 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10:44:32,959 INFO sqlalchemy.engine.Engine [cached since 0.1904s ago] ('ZH_7', 1, 0)
2025-07-07 10:44:32,959 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 10

  session.commit()


2025-07-07 10:44:33,186 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 10:44:33,187 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level 
FROM unit 
WHERE unit.unit_id = ?
2025-07-07 10:44:33,187 INFO sqlalchemy.engine.Engine [cached since 0.1808s ago] ('ZH_2',)
2025-07-07 10:44:33,188 INFO sqlalchemy.engine.Engine INSERT INTO grammar_rule (learn_id, title, explanation, score, last_seen, unit_id) VALUES (?, ?, ?, ?, ?, ?)
2025-07-07 10:44:33,189 INFO sqlalchemy.engine.Engine [cached since 0.1878s ago] ('ZH_2_G2', 'Quelques règles supplémentaires', "Toutefois, les zéros placés à la fin d'un nombre ne se prononcent pas. Le caractère 零 líng est souvent remplacé par un O.\n\nIl faut également savoir ... (577 characters truncated) ... t dire 十二 shí'èr. Ceci est valable seulement de 10 à 19. En effet, pour « 112 » il faut dire :\n\n* 一百一十二 yībǎi yīshí'èr : 1 × 100 + 1 × 10 + 2 = 