In [1]:
import json
import pandas as pd
import os
import re
from lapp.dbms import init_db, inserts, modify, delete, find_by_attr, insert
from lapp.tables import Unit, Vocabulary, GrammarRule, Language, CalligraphyCharacter


In [2]:
language_id = "zh"

## Create a Language

In [3]:
engine, session = init_db()

if not os.path.exists("../db"):
    os.makedirs("../db")
    print(f"Created directory for database: ../db")

language = Language(
    id=language_id.upper(),
    name="Chinois",
    native_name="中文",
    level="A1",
    flag="🇨🇳"
)
insert(session, language)

session.close()

2025-07-10 10:18:02,515 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-10 10:18:02,515 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("language")
2025-07-10 10:18:02,515 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,515 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("language")
2025-07-10 10:18:02,516 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,516 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-10 10:18:02,516 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,516 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("unit")
2025-07-10 10:18:02,516 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,516 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-10 10:18:02,517 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,517 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("calligraphy_character")
2025-07-10 10:18:02,517 INFO sqlalchemy.en

## Add Units

In [4]:
# Initialize the database connection and create db file if it doesn't exist
engine, session = init_db()

# Initialize the database
with open(f'../data/{language_id}/units.json', 'r', encoding="utf8") as f:
    units_array = json.load(f)
    
units = []
for idx, unit_data in enumerate(units_array):
    units.append(
        Unit(
            id = f"{language_id.upper()}_{idx}",
            title = unit_data['title'],
            description = unit_data['description'],
            level = unit_data['level'],
            parent = language,
        )
    )
inserts(session, units)
session.close()

2025-07-10 10:18:02,543 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-10 10:18:02,544 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("language")
2025-07-10 10:18:02,544 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,544 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-10 10:18:02,544 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,545 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-10 10:18:02,545 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,545 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-10 10:18:02,545 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,545 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-10 10:18:02,545 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,545 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-10 10:18:02,546 INFO sqlalchemy.engi

  session.commit()


## Add Vocabulary

In [5]:
# Initialize the database connection and create db file if it doesn't exist
engine, session = init_db()

directory_path = f'../data/{language_id}/vocabulary'

if not os.path.exists(directory_path):
    raise FileNotFoundError(f"No vocabulary found for {language_id}.")

vocs = []

# List all elements (files and directories) in the specified directory and get their full paths
elements_paths = [os.path.join(directory_path, element) for element in os.listdir(directory_path)]
for voc_file in elements_paths:
    if os.path.isfile(voc_file) and voc_file.endswith('.csv'):
        unit_id = re.sub("[^0-9]", "", os.path.basename(voc_file))
        df = pd.read_csv(voc_file)
        for idx, row in df.iterrows():
            vocs.append( 
                Vocabulary(
                    id=f"{language_id.upper()}_{unit_id}_V{idx}",
                    word = row['word'],
                    translation = row["translation"],
                    phonetic = row["pinyin"],
                    example_sentence = row.get("example_sentence", ""),
                    type = row["type"],
                    parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
                )
            )
inserts(session, vocs)
session.close()

2025-07-10 10:18:02,577 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-10 10:18:02,577 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("language")
2025-07-10 10:18:02,577 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,578 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-10 10:18:02,578 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,578 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-10 10:18:02,578 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,579 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-10 10:18:02,579 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,579 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-10 10:18:02,579 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:02,579 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-10 10:18:02,579 INFO sqlalchemy.engi

  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()


2025-07-10 10:18:02,748 INFO sqlalchemy.engine.Engine [cached since 0.1638s ago] ('ZH_6', 1, 0)
2025-07-10 10:18:02,749 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
 LIMIT ? OFFSET ?
2025-07-10 10:18:02,750 INFO sqlalchemy.engine.Engine [cached since 0.1651s ago] ('ZH_6', 1, 0)
2025-07-10 10:18:02,751 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
 LIMIT ? OFFSET ?
2025-07-10 10:18:02,752 INFO sqlalchemy.engine.Engine [cached since 0.1674s ago] ('ZH_6', 1, 0)
2025-07-10 10:18:02,754 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id,

  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()


2025-07-10 10:18:02,786 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
 LIMIT ? OFFSET ?
2025-07-10 10:18:02,787 INFO sqlalchemy.engine.Engine [cached since 0.2026s ago] ('ZH_15', 1, 0)
2025-07-10 10:18:02,789 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
 LIMIT ? OFFSET ?
2025-07-10 10:18:02,789 INFO sqlalchemy.engine.Engine [cached since 0.2047s ago] ('ZH_15', 1, 0)
2025-07-10 10:18:02,790 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, uni

  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()


2025-07-10 10:18:02,972 INFO sqlalchemy.engine.Engine [cached since 0.3872s ago] ('ZH_9', 1, 0)
2025-07-10 10:18:02,972 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
 LIMIT ? OFFSET ?
2025-07-10 10:18:02,972 INFO sqlalchemy.engine.Engine [cached since 0.3879s ago] ('ZH_9', 1, 0)
2025-07-10 10:18:02,973 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
 LIMIT ? OFFSET ?
2025-07-10 10:18:02,973 INFO sqlalchemy.engine.Engine [cached since 0.3885s ago] ('ZH_9', 1, 0)
2025-07-10 10:18:02,974 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id,

  session.commit()


2025-07-10 10:18:03,010 INFO sqlalchemy.engine.Engine COMMIT
2025-07-10 10:18:03,010 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-10 10:18:03,010 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
2025-07-10 10:18:03,011 INFO sqlalchemy.engine.Engine [cached since 0.0239s ago] ('ZH_2',)
2025-07-10 10:18:03,011 INFO sqlalchemy.engine.Engine INSERT INTO vocabulary (id, word, translation, phonetic, example_sentence, type, score, last_seen, unit_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
2025-07-10 10:18:03,011 INFO sqlalchemy.engine.Engine [cached since 0.03021s ago] ('ZH_2_V5', '六', 'Six', 'Liù', '', 'Chiffres et nombres', 0, '2025-07-10', 'ZH_2')
2025-07-10 10:18:03,012 INFO sqlalchemy.engine.Engine COMMIT
2025-07-10 10:18:03,013 INFO sqlalchemy.engine.Engine BE

## Add Grammar Rules

In [6]:
# Initialize the database connection and create db file if it doesn't exist
engine, session = init_db()

directory_path = f'../data/{language_id}/grammar'

if not os.path.exists(directory_path):
    raise FileNotFoundError(f"No grammar files found for {language_id}.")

grammars = []

# List all elements (files and directories) in the specified directory and get their full paths
elements_paths = [os.path.join(directory_path, element) for element in os.listdir(directory_path)]
for grammar_file in elements_paths:
    if os.path.isfile(grammar_file) and grammar_file.endswith('.json'):
        unit_id = re.sub("[^0-9]", "", os.path.basename(grammar_file))
        with open(grammar_file, 'r', encoding="utf8") as f:
            grammar_data = json.load(f)
        for idx, row in enumerate(grammar_data):
            grammars.append( 
                GrammarRule(
                    id=f"{language_id.upper()}_{unit_id}_G{idx + 1}",
                    title=row['title'],
                    explanation=row["content"],
                    parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
                )
            )
inserts(session, grammars)
session.close()

2025-07-10 10:18:04,489 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-10 10:18:04,489 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("language")
2025-07-10 10:18:04,489 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,489 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-10 10:18:04,490 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,490 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-10 10:18:04,490 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,490 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-10 10:18:04,490 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,490 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-10 10:18:04,491 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,491 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-10 10:18:04,491 INFO sqlalchemy.engi

  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  session.commit()


2025-07-10 10:18:04,695 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-10 10:18:04,696 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
2025-07-10 10:18:04,696 INFO sqlalchemy.engine.Engine [cached since 0.1373s ago] ('ZH_13',)
2025-07-10 10:18:04,697 INFO sqlalchemy.engine.Engine INSERT INTO grammar_rule (id, title, explanation, score, last_seen, unit_id) VALUES (?, ?, ?, ?, ?, ?)
2025-07-10 10:18:04,697 INFO sqlalchemy.engine.Engine [cached since 0.1397s ago] ('ZH_13_G3', '"Avant" et "après" avec 以前 yǐqián et 以后 yǐhòu', "以前 yǐqián et 以后 yǐhòu s'emploient également en fin de proposition et permettent de décrire quelque chose qui se passe avant ou après un moment donné. ... (811 characters truncated) ...  plus tard ? »\n* 我以前住在巴黎 wǒ yǐqián zhù zài Bālí : « 

## Add Characters

In [7]:
engine, session = init_db()

directory_path = f'../data/{language_id}/character'

if not os.path.exists(directory_path):
    raise FileNotFoundError(f"No character files found for {language_id}.")

characters = []

# List all elements (files and directories) in the specified directory and get their full paths
elements_paths = [os.path.join(directory_path, element) for element in os.listdir(directory_path)]
for character_file in elements_paths:
    if os.path.isfile(character_file) and character_file.endswith('.csv'):
        unit_id = re.sub("[^0-9]", "", os.path.basename(character_file))
        print(f"Processing: {unit_id}")
        df = pd.read_csv(character_file)
        for idx, row in enumerate(df.itertuples(index=False)):
            characters.append(
                CalligraphyCharacter(
                    id=f"{language_id.upper()}_{unit_id}_C{idx + 1}",
                    character=row.character,
                    components=row.components,
                    phonetic=row.phonetic,
                    meaning=row.meaning,
                    example_word=row.example_word,
                    parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
                )
            )
inserts(session, characters)
session.close()

2025-07-10 10:18:04,741 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-10 10:18:04,743 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("language")
2025-07-10 10:18:04,743 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,744 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-10 10:18:04,745 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,746 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-10 10:18:04,746 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,747 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-10 10:18:04,748 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,749 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-10 10:18:04,750 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-10 10:18:04,751 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-10 10:18:04,754 INFO sqlalchemy.engi

  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()

2025-07-10 10:18:04,968 INFO sqlalchemy.engine.Engine [cached since 0.2057s ago] ('ZH_8', 1, 0)
2025-07-10 10:18:04,968 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
 LIMIT ? OFFSET ?
2025-07-10 10:18:04,968 INFO sqlalchemy.engine.Engine [cached since 0.2065s ago] ('ZH_8', 1, 0)
2025-07-10 10:18:04,969 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
 LIMIT ? OFFSET ?
2025-07-10 10:18:04,969 INFO sqlalchemy.engine.Engine [cached since 0.2074s ago] ('ZH_8', 1, 0)
2025-07-10 10:18:04,970 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id,

  parent=session.query(Unit).filter(Unit.id == f"{language_id.upper()}_{unit_id}").first()
  session.commit()


2025-07-10 10:18:05,176 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-10 10:18:05,177 INFO sqlalchemy.engine.Engine SELECT unit.id AS unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.last_seen AS unit_last_seen, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.id = ?
2025-07-10 10:18:05,177 INFO sqlalchemy.engine.Engine [cached since 0.182s ago] ('ZH_5',)
2025-07-10 10:18:05,177 INFO sqlalchemy.engine.Engine INSERT INTO calligraphy_character (id, character, components, phonetic, meaning, example_word, score, last_seen, unit_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
2025-07-10 10:18:05,177 INFO sqlalchemy.engine.Engine [cached since 0.1839s ago] ('ZH_5_C3', '的', '白 (blanc) + 勺 (cuillère)', 'de', 'Possessif', '我的 wǒ de : Mon', 0, '2025-07-10', 'ZH_5')
2025-07-10 10:18:05,177 INFO sqlalchemy.engine.Engine COMMIT
2025-07-10 10:18:05,178 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-10