In [2]:
import json
import pandas as pd


# Input and output file locations for the feature-export step.
items_path = "mv_items.json"
prices_path = "only_prices.json"
output_path = "mvideo_features.csv"


def _feature_column(prop):
    """Return the CSV column name for one property dict.

    The column is ``prop["name"]``, with `` (<measure>)`` appended when the
    property has a non-empty ``measure``.
    """
    name = prop["name"]
    measure = prop.get("measure")
    return f"{name} ({measure})" if measure else name


def _iter_properties(item):
    """Yield each property dict from every block in ``item["all_properties"]``.

    A missing or empty ``all_properties`` / ``properties`` entry yields
    nothing instead of raising, so such an item still produces a row with
    its id, name, brand and prices.
    """
    for block in item.get("all_properties") or ():
        yield from block.get("properties") or ()


# The items file is parsed as one JSON document per line. Whitespace-only
# lines (for example a trailing blank line) are skipped, because json.loads
# raises JSONDecodeError on them.
with open(items_path, "r", encoding="utf-8") as f:
    items = [json.loads(line) for line in f if line.strip()]

# The prices file is read as a single JSON document containing the price records.
with open(prices_path, "r", encoding="utf-8") as f:
    prices_list = json.load(f)
# NOTE(review): the lookup below only matches when "productId" here and
# "product_id" in the items file have the same type (str vs int) — confirm.
price_map = {p["productId"]: p for p in prices_list}

# Every distinct feature column seen across all items. Not used further in
# this cell; kept as a notebook-level name in case other cells read it.
all_feature_names = {
    _feature_column(prop) for item in items for prop in _iter_properties(item)
}

data_rows = []
for item in items:
    pid = item.get("product_id")
    row = {
        "product_id": pid,
        "name": item.get("name"),
        "brand": item.get("brand"),
    }
    # One column per property; a repeated column name within an item keeps the last value.
    for prop in _iter_properties(item):
        row[_feature_column(prop)] = prop.get("value")

    # Always emit both price keys so the CSV has these columns even when no
    # product has a matching price record (missing values become empty cells).
    price_data = price_map.get(pid) or {}
    row["basePrice"] = price_data.get("basePrice")
    row["salePrice"] = price_data.get("salePrice")

    data_rows.append(row)

# Build the frame once from the list of row dicts; absent keys become NaN.
df = pd.DataFrame(data_rows)
df.to_csv(output_path, index=False, encoding="utf-8")


In [3]:
import pandas as pd

# Load the feature table from the CSV file.
df = pd.read_csv("mvideo_features.csv")

# Header line, then every column name on its own line.
print("üìã –í—Å–µ –∫–æ–ª–æ–Ω–∫–∏:")
for col in df.columns.tolist():
    print(col)

# Header line, then the per-column NaN totals: only columns that have at
# least one missing value are shown, largest count first.
print("\nüìâ –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π (NaN) –ø–æ –∫–∞–∂–¥–æ–π –∫–æ–ª–æ–Ω–∫–µ:")
nan_counts = df.isna().sum()
columns_with_gaps = nan_counts.loc[nan_counts > 0]
print(columns_with_gaps.sort_values(ascending=False))


üìã –í—Å–µ –∫–æ–ª–æ–Ω–∫–∏:
product_id
name
brand
–ì–∞—Ä–∞–Ω—Ç–∏—è
–ì–∞—Ä–∞–Ω—Ç–∏—è –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª—è–µ—Ç—Å—è
–°—Ç—Ä–∞–Ω–∞
–ì–æ–¥ —Ä–µ–ª–∏–∑–∞
–°–µ—Ä–∏—è
–°–æ—Å—Ç–æ—è–Ω–∏–µ
–í–Ω–µ—à–Ω–∏–π –≤–∏–¥
–†–∞–∑—Ä–µ—à–µ–Ω–∏–µ —ç–∫—Ä–∞–Ω–∞
–≠–∫—Ä–∞–Ω
–¢–µ—Ö–Ω–æ–ª–æ–≥–∏—è —ç–∫—Ä–∞–Ω–∞
–¢–∏–ø —ç–∫—Ä–∞–Ω–∞
–ß–∞—Å—Ç–æ—Ç–∞ –æ–±–Ω–æ–≤–ª–µ–Ω–∏—è (–ì—Ü)
–Ø—Ä–∫–æ—Å—Ç—å (–∫–¥/–∫–≤.–º)
–¢–∏–ø –ø—Ä–æ—Ü–µ—Å—Å–æ—Ä–∞
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —è–¥–µ—Ä
–ì—Ä–∞—Ñ–∏—á–µ—Å–∫–∏–π —É—Å–∫–æ—Ä–∏—Ç–µ–ª—å
–û–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω–∞—è —Å–∏—Å—Ç–µ–º–∞
–í—Å—Ç—Ä–æ–µ–Ω–Ω–∞—è –ø–∞–º—è—Ç—å (ROM) (–ì–ë)
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –æ—Å–Ω–æ–≤–Ω—ã—Ö –∫–∞–º–µ—Ä (—à—Ç)
–û—Å–Ω–æ–≤–Ω–∞—è –∫–∞–º–µ—Ä–∞ –ú–ü–∏–∫—Å
–°—ä–µ–º–∫–∞ –≤–∏–¥–µ–æ –≤ –ø–æ—Ä—Ç—Ä–µ—Ç–Ω–æ–º —Ä–µ–∂–∏–º–µ
–¶–∏—Ñ—Ä–æ–≤–æ–π –∑—É–º (x)
–†–∞–∑—Ä–µ—à–µ–Ω–∏–µ –≤–∏–¥–µ–æ—Å—ä–µ–º–∫–∏
–û–ø—Ç–∏—á–µ—Å–∫–∞—è —Å—Ç–∞–±–∏–ª–∏–∑–∞—Ü–∏—è
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ñ—Ä–æ–Ω—Ç–∞–ª—å–Ω—ã—Ö –∫–∞–º–µ—Ä (—à—Ç)
–§—Ä–æ–Ω—Ç–∞–ª—å–Ω–∞—è –∫–∞–º–µ—Ä–∞ –ú–ü–∏–∫—Å
–ü–æ–¥–¥–µ—Ä–∂–∫–∞ SIM –∫–∞—Ä—Ç
–ü–æ–¥–¥–µ—Ä–∂–∫–∞ —Å—

In [None]:
import matplotlib.pyplot as plt

# Per-column NaN totals as a one-column frame, largest count first.
# NOTE(review): `>= 0` keeps every column, since a count cannot be negative —
# confirm whether `> 0` (only columns with gaps) was intended.
keep_mask = nan_counts >= 0
nan_df = nan_counts[keep_mask].sort_values(ascending=False).rename('nan_count').to_frame()

# Figure height grows by 0.3 per bar but never drops below 4.
fig_height = max(4, len(nan_df) * 0.3)
fig, ax = plt.subplots(figsize=(10, fig_height))

# Horizontal bars, one per column of the source table.
nan_df['nan_count'].plot.barh(ax=ax)
ax.set_xlabel('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ NaN')
ax.set_ylabel('–ö–æ–ª–æ–Ω–∫–∞')
ax.set_title('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π –ø–æ –∫–æ–ª–æ–Ω–∫–∞–º')
fig.tight_layout()

# The chart is written to disk and the figure closed without being shown inline.
fig.savefig('nan_counts.png')
plt.close(fig)