In [3]:
import json
import os
import requests
from tqdm.notebook import tqdm

# Location of the per-movie JSON dumps; override with the MOVIES_DIR env var.
MOVIES_DIR = os.environ.get("MOVIES_DIR", "/var/lib/kinsovet/data/movies")
# PocketBase base URL; override with the POCKETBASE_URL env var
# (e.g. http://pocketbase:8090 when running inside docker).
# A trailing slash is stripped so that f"{POCKETBASE_URL}/api/..." stays well-formed.
POCKETBASE_URL = os.environ.get("POCKETBASE_URL", "http://localhost:8090").rstrip("/")

# 1. Collect every id referenced from the "similarMovies" lists of the movie dumps.
similar_ids = set()
for filename in tqdm(os.listdir(MOVIES_DIR)):
    # Only the *.json files in the directory are considered.
    if not filename.endswith(".json"):
        continue
    # Decode as UTF-8 explicitly: open()'s default encoding is locale-dependent,
    # so non-ASCII text in the JSON could fail to decode on some hosts.
    with open(os.path.join(MOVIES_DIR, filename), encoding="utf-8") as f:
        movie = json.load(f)
    # `or []` also covers an explicit null value, which .get()'s default would not.
    for sm in movie.get("similarMovies") or []:
        # Entries without an "id" key are skipped.
        if "id" in sm:
            similar_ids.add(sm["id"])

print(f"Уникальных similar id: {len(similar_ids)}")

  0%|          | 0/98125 [00:00<?, ?it/s]

Уникальных similar id: 13872


In [6]:
# 2. Fetch every kinopoisk_id stored in the PocketBase "movies" collection,
#    walking the paginated records endpoint until the last page.
existing_ids = set()
page = 1
# A single session reuses the HTTP connection across all page requests.
with requests.Session() as session:
    while True:
        resp = session.get(
            f"{POCKETBASE_URL}/api/collections/movies/records",
            params={"perPage": 500, "page": page, "fields": "kinopoisk_id"},
            # (connect, read) timeouts in seconds; without a timeout requests
            # waits indefinitely if the server stops responding.
            timeout=(5, 60),
        )
        # Fail loudly on any HTTP error instead of parsing an error payload.
        resp.raise_for_status()
        data = resp.json()
        for item in data["items"]:
            existing_ids.add(item["kinopoisk_id"])
        # The response carries the total page count; stop after the last page.
        if page >= data["totalPages"]:
            break
        page += 1

print(f"Фильмов в БД: {len(existing_ids)}")

Фильмов в БД: 98125


In [7]:
# 3. Ids referenced as "similar" that have no matching record in the database.
missing_ids = similar_ids.difference(existing_ids)
print(f"Отсутствуют в БД: {len(missing_ids)}")

Отсутствуют в БД: 1


In [8]:
# Display the set of missing ids as the cell output.
missing_ids

{0}

In [9]:
# Find the movies whose "similarMovies" list contains an entry with id == 0;
# each such movie's own "id" is recorded once.
movies_with_zero = []
for filename in tqdm(os.listdir(MOVIES_DIR)):
    # Only the *.json files in the directory are considered.
    if not filename.endswith(".json"):
        continue
    # Decode as UTF-8 explicitly: open()'s default encoding is locale-dependent.
    with open(os.path.join(MOVIES_DIR, filename), encoding="utf-8") as f:
        # Named `movie` rather than `data` so it does not shadow the
        # PocketBase response dict of the same name used elsewhere in the notebook.
        movie = json.load(f)
    # `or []` also covers an explicit null value, which .get()'s default would not.
    similar = movie.get("similarMovies") or []
    # any() stops at the first match, so a movie is appended at most once.
    if any(sm.get("id") == 0 for sm in similar):
        movies_with_zero.append(movie["id"])

print(f"Фильмов с similar id=0: {len(movies_with_zero)}")
print(movies_with_zero[:20])

  0%|          | 0/98125 [00:00<?, ?it/s]

Фильмов с similar id=0: 21
[863009, 428930, 5165951, 1435087, 1254133, 252123, 1108689, 1144179, 804876, 1047269, 804748, 542445, 277535, 485542, 1142153, 4445150, 1301516, 1337642, 462649, 622652]
