In [1]:
import json
import pandas as pd
import os
import re
from lapp.dbms import init_db, inserts, modify, delete, find_by_attr, insert
from lapp.tables import Unit, Vocabulary, GrammarRule, Language


In [2]:
language_id = "zh"

## Create a Language

In [8]:
engine, session = init_db()

if not os.path.exists("../db"):
    os.makedirs("../db")
    print(f"Created directory for database: ../db")

language = Language(
    language_id=language_id,
    name="Chinois",
    native_name="中文",
    level="A1",
    flag="🇨🇳"
)
insert(session, language)

session.close()

2025-07-07 15:28:48,011 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 15:28:48,011 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("language")
2025-07-07 15:28:48,012 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:28:48,013 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-07 15:28:48,013 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:28:48,013 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-07 15:28:48,014 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:28:48,014 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-07 15:28:48,014 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:28:48,014 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-07 15:28:48,015 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:28:48,015 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-07 15:28:48,015 INFO sqlalchemy.engi

Insert failed: (sqlite3.IntegrityError) UNIQUE constraint failed: language.language_id
[SQL: INSERT INTO language (language_id, name, native_name, level, description, score, last_seen, flag, current_unit) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: ('zh', 'Chinois', '中文', 'A1', '', 0, '2025-07-07', '🇨🇳', None)]
(Background on this error at: https://sqlalche.me/e/20/gkpj). Attempting to modify an existing record.


2025-07-07 15:28:48,023 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 15:28:48,025 INFO sqlalchemy.engine.Engine SELECT language.language_id AS language_language_id, language.name AS language_name, language.native_name AS language_native_name, language.level AS language_level, language.description AS language_description, language.score AS language_score, language.last_seen AS language_last_seen, language.flag AS language_flag, language.current_unit AS language_current_unit 
FROM language 
WHERE language.language_id = ?
 LIMIT ? OFFSET ?
2025-07-07 15:28:48,028 INFO sqlalchemy.engine.Engine [generated in 0.00319s] ('zh', 1, 0)
2025-07-07 15:28:48,038 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.language_id AS unit_language_id 
FROM unit 
WHERE ? = unit.language_id
2025-07-07 15:28:48,039 INFO sqlalchemy.engine.Engine [generated in 0.

## Add Units

In [5]:
# Initialize the database connection and create db file if it doesn't exist
engine, session = init_db()

# Initialize the database
with open(f'../data/{language_id}/units.json', 'r', encoding="utf8") as f:
    units_array = json.load(f)
    
units = []
for idx, unit_data in enumerate(units_array):
    units.append(
        Unit(
            unit_id = f"{language_id.upper()}_{idx}",
            title = unit_data['title'],
            description = unit_data['description'],
            level = unit_data['level'],
            language_id = language_id,
        )
    )
inserts(session, units)
session.close()

2025-07-07 15:26:07,031 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 15:26:07,032 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("language")
2025-07-07 15:26:07,032 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:07,032 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-07 15:26:07,033 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:07,034 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-07 15:26:07,034 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:07,035 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-07 15:26:07,035 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:07,036 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-07 15:26:07,036 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:07,036 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-07 15:26:07,036 INFO sqlalchemy.engi

## Add Vocabulary

In [6]:
# Initialize the database connection and create db file if it doesn't exist
engine, session = init_db()

directory_path = f'../data/{language_id}/vocabulary'

if not os.path.exists(directory_path):
    raise FileNotFoundError(f"No vocabulary found for {language_id}.")

vocs = []

# List all elements (files and directories) in the specified directory and get their full paths
elements_paths = [os.path.join(directory_path, element) for element in os.listdir(directory_path)]
for voc_file in elements_paths:
    if os.path.isfile(voc_file) and voc_file.endswith('.csv'):
        unit_id = re.sub("[^0-9]", "", os.path.basename(voc_file))
        df = pd.read_csv(voc_file)
        for idx, row in df.iterrows():
            vocs.append( 
                Vocabulary(
                    learn_id=f"{language_id.upper()}_{unit_id}_V{idx}",
                    word = row['word'],
                    translation = row["translation"],
                    phonetic = row["pinyin"],
                    example_sentence = row.get("example_sentence", ""),
                    type = row["type"],
                    parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
                )
            )
inserts(session, vocs)
session.close()

2025-07-07 15:26:57,639 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 15:26:57,640 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("language")
2025-07-07 15:26:57,640 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:57,641 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-07 15:26:57,641 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:57,642 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-07 15:26:57,642 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:57,643 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-07 15:26:57,643 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:57,644 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-07 15:26:57,644 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:26:57,644 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-07 15:26:57,645 INFO sqlalchemy.engi

  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()


2025-07-07 15:26:57,859 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 15:26:57,859 INFO sqlalchemy.engine.Engine [cached since 0.204s ago] ('ZH_15', 1, 0)
2025-07-07 15:26:57,861 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 15:26:57,861 INFO sqlalchemy.engine.Engine [cached since 0.206s ago] ('ZH_15', 1, 0)
2025-07-07 15:26:57,862 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.la

  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()


2025-07-07 15:26:58,145 INFO sqlalchemy.engine.Engine [cached since 0.4895s ago] ('ZH_9', 1, 0)
2025-07-07 15:26:58,152 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 15:26:58,153 INFO sqlalchemy.engine.Engine [cached since 0.4975s ago] ('ZH_9', 1, 0)
2025-07-07 15:26:58,154 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.unit_id = ?
 LIMIT ? OFFSET ?
2025-07-07 15:26:58,154 INFO sqlalchemy.engine.Engine [cached since 0.4989s ago] ('ZH_9', 1, 0)
2025-07-07 15:26:58,155 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, u

  session.commit()


2025-07-07 15:26:58,383 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 15:26:58,383 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS unit_title, unit.description AS unit_description, unit.level AS unit_level, unit.score AS unit_score, unit.language_id AS unit_language_id 
FROM unit 
WHERE unit.unit_id = ?
2025-07-07 15:26:58,384 INFO sqlalchemy.engine.Engine [cached since 0.1969s ago] ('ZH_1',)
2025-07-07 15:26:58,384 INFO sqlalchemy.engine.Engine INSERT INTO vocabulary (learn_id, word, translation, phonetic, example_sentence, type, score, last_seen, unit_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
2025-07-07 15:26:58,384 INFO sqlalchemy.engine.Engine [cached since 0.2016s ago] ('ZH_1_V23', '不', 'Négation', 'Bù', '', 'Divers', 0, '2025-07-07', 'ZH_1')
2025-07-07 15:26:58,386 INFO sqlalchemy.engine.Engine COMMIT
2025-07-07 15:26:58,388 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 15:26:58,388 INFO sqlalchemy.engine.Engine SELECT unit.un

## Add Grammar Rules

In [7]:
# Initialize the database connection and create db file if it doesn't exist
engine, session = init_db()

directory_path = f'../data/{language_id}/grammar'

if not os.path.exists(directory_path):
    raise FileNotFoundError(f"No grammar files found for {language_id}.")

grammars = []

# List all elements (files and directories) in the specified directory and get their full paths
elements_paths = [os.path.join(directory_path, element) for element in os.listdir(directory_path)]
for grammar_file in elements_paths:
    if os.path.isfile(grammar_file) and grammar_file.endswith('.json'):
        unit_id = re.sub("[^0-9]", "", os.path.basename(grammar_file))
        with open(grammar_file, 'r', encoding="utf8") as f:
            grammar_data = json.load(f)
        for idx, row in enumerate(grammar_data):
            grammars.append( 
                GrammarRule(
                    learn_id=f"{language_id.upper()}_{unit_id}_G{idx + 1}",
                    title=row['title'],
                    explanation=row["content"],
                    parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
                )
            )
inserts(session, grammars)
session.close()

2025-07-07 15:27:27,410 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 15:27:27,411 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("language")
2025-07-07 15:27:27,411 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:27:27,412 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("unit")
2025-07-07 15:27:27,412 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:27:27,412 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("calligraphy_character")
2025-07-07 15:27:27,412 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:27:27,413 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("grammar_rule")
2025-07-07 15:27:27,413 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:27:27,413 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("vocabulary")
2025-07-07 15:27:27,414 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-07-07 15:27:27,414 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("exercises")
2025-07-07 15:27:27,414 INFO sqlalchemy.engi

  parent=session.query(Unit).filter(Unit.unit_id == f"{language_id.upper()}_{unit_id}").first()
  session.commit()


2025-07-07 15:27:27,621 INFO sqlalchemy.engine.Engine [cached since 0.1467s ago] ('ZH_9',)
2025-07-07 15:27:27,626 INFO sqlalchemy.engine.Engine INSERT INTO grammar_rule (learn_id, title, explanation, score, last_seen, unit_id) VALUES (?, ?, ?, ?, ?, ?)
2025-07-07 15:27:27,629 INFO sqlalchemy.engine.Engine [cached since 0.1575s ago] ('ZH_9_G3', '"D\'abord... ensuite..." avec 先 xiān et 然后 ránhòu', "先 xiān « premièrement », et 然后 ránhòu « ensuite » peuvent être utilisés ensemble pour indiquer que deux événements se succèdent :\n\n* 我们先吃饭，然后看电视 wǒ ... (220 characters truncated) ... emièrement » peut également être utilisé seul :\n\n* 他先到了 tā xiān dào le : « Il est arrivé en premier »\n* 我先说 wǒ xiān shuō : « Je parle en premier »", 0, '2025-07-07', 'ZH_9')
2025-07-07 15:27:27,629 INFO sqlalchemy.engine.Engine COMMIT
2025-07-07 15:27:27,631 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-07-07 15:27:27,631 INFO sqlalchemy.engine.Engine SELECT unit.unit_id AS unit_unit_id, unit.title AS 