In [1]:
import os

import lmdb
import zipfile
from tqdm import tqdm
import numpy as np
import argparse

In [None]:
import os
import zipfile
import lmdb
import numpy as np
from tqdm import tqdm
import argparse
import sys
import gc
import shutil


def process_batch(batch, env, start_idx):
    """Write every member of each zip archive in ``batch`` into the database.

    All writes happen inside a single write transaction. Members of an
    archive are stored in sorted member-name order under consecutive decimal
    keys (``b"0"``, ``b"1"``, ...) that continue from ``start_idx``.

    Args:
        batch: Iterable of ``(vid, zip_path)`` pairs.
        env: Object whose ``begin(write=True)`` returns a context manager
            yielding a transaction that offers ``put(key, value)``.
        start_idx: Key index to use for the first member written.

    Returns:
        The next unused key index, i.e. ``start_idx`` plus the number of
        members written.
    """
    next_idx = start_idx
    with env.begin(write=True) as txn:
        for vid, zip_path in batch:
            try:
                with zipfile.ZipFile(zip_path, 'r') as handler:
                    for img_name in sorted(handler.namelist()):
                        content = handler.read(img_name)
                        # The running counter alone is the key. Adding the
                        # video's position in the batch on top of it skipped
                        # keys between videos and wrote past the returned
                        # counter, so the following batch overwrote frames.
                        txn.put(str(next_idx).encode(), content)
                        next_idx += 1
            except Exception as e:
                # Best effort: report the failing video and continue with the
                # rest of the batch. Members written before the failure stay
                # in the transaction and remain counted. This broad catch
                # also covers errors raised by txn.put.
                print(f"Lỗi xử lý {vid}: {e}")
    return next_idx

def main(args):
    """Pack the frames of all listed videos into one LMDB and save an index.

    Reads video ids (one per non-empty line) from ``args.vid_path``, resolves
    each id to a zip archive through ``args.vid_pattern``, writes the archive
    members into a fresh LMDB at ``args.lmdb_path`` and stores the arrays
    ``vids`` and ``intervals`` with ``np.savez`` at ``args.output_path``.

    ``intervals[k]`` is the half-open key range ``[start, end)`` of the
    entries written for ``vids[k]``. A video whose archive could not be read
    completely gets an empty or partial range. Ids whose archive file does
    not exist are skipped.

    Note: an existing directory at ``args.lmdb_path`` is deleted first.

    Args:
        args: Namespace with the attributes ``vid_path``, ``vid_pattern``,
            ``lmdb_path`` and ``output_path``.

    Raises:
        lmdb.Error: If the LMDB environment cannot be opened.
    """
    vid_path = args.vid_path
    vid_pattern = args.vid_pattern
    lmdb_path = args.lmdb_path
    output_path = args.output_path

    # LMDB map size in bytes (10**10); lower it if needed.
    lmdb_mapsize = 1e10

    # Always start from an empty database.
    if os.path.exists(lmdb_path):
        shutil.rmtree(lmdb_path)

    with open(vid_path, "r", encoding="utf-8") as f:
        feeds = sorted(line.strip() for line in f if line.strip())
    print(f"### Tổng số feeds: {len(feeds)}")
    print(f"### Mẫu: {' '.join(feeds[:5])}")

    for target in (lmdb_path, output_path):
        parent = os.path.dirname(target)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    global_idx = 0
    vids = []
    intervals = []
    batch_size = 100  # videos per progress step / garbage-collection pass

    try:
        env = lmdb.open(lmdb_path, map_size=int(lmdb_mapsize), writemap=True, subdir=True)
    except lmdb.Error as e:
        print(f"Lỗi khi mở LMDB: {e}")
        raise

    try:
        for offset in tqdm(range(0, len(feeds), batch_size)):
            for vid in feeds[offset:offset + batch_size]:
                # Two-slot patterns take (last two characters of the id, id);
                # single-slot patterns take just the id.
                try:
                    zip_path = vid_pattern % (vid[-2:], vid)
                except TypeError:
                    zip_path = vid_pattern % vid

                if not os.path.exists(zip_path):
                    continue

                # process_batch only reports the running total, so videos are
                # handed over one at a time to learn each video's exact key
                # range and keep `intervals` aligned with `vids`.
                start = global_idx
                global_idx = process_batch([(vid, zip_path)], env, start)
                vids.append(vid)
                intervals.append([start, global_idx])

            # Free memory between chunks.
            gc.collect()
    finally:
        # Close the environment even when writing fails part-way.
        env.close()

    number_entries = global_idx
    print(f"Tổng số mục: {number_entries}")

    vids = np.array(vids)
    # reshape keeps the (n, 2) layout even when no video was written.
    intervals = np.array(intervals, dtype=np.int64).reshape(-1, 2)
    np.savez(output_path, vids=vids, intervals=intervals)

# Script entry point: builds the argument namespace and runs main().
if __name__ == '__main__':
    if 'ipykernel' in sys.modules:
        # Inside a notebook kernel the paths are fixed here instead of being
        # parsed from the command line.
        # TODO: replace the placeholder with the txt file listing all video ids.
        all_videos_id = "all_video_ids.txt"

        args_dict = {
            'vid_path': f"../data/videos/{all_videos_id}",
            'vid_pattern': "../data/jpg_zips/%s/%s.zip",
            'lmdb_path': "../data/lmdb/train_vsc",
            'output_path': "../data/lmdb/train_vsc/meta"
        }
        args = argparse.Namespace(**args_dict)
    else:
        parser = argparse.ArgumentParser()
        parser.add_argument("--vid_path", type=str, required=True)
        parser.add_argument("--vid_pattern", type=str, required=True)
        parser.add_argument("--lmdb_path", type=str, required=True)
        parser.add_argument("--output_path", type=str, required=True)
        args = parser.parse_args()

    main(args)

### Tổng số feeds: 1306
### Mẫu: Q100400 Q100401 Q100402 Q100403 Q100404


  0%|          | 0/14 [00:00<?, ?it/s]

100%|██████████| 14/14 [00:05<00:00,  2.68it/s]

Tổng số mục: 42591



