In [None]:
from google.colab import files
files.upload()  #  kaggle.json


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"samlys","key":"0db25effa810e54ec78642310ca86ed7"}'}

In [None]:
import os
import shutil


os.makedirs("/root/.kaggle", exist_ok=True)


shutil.move("kaggle.json", "/root/.kaggle/kaggle.json")


os.chmod("/root/.kaggle/kaggle.json", 0o600)


In [None]:
!pip install -q kaggle
!kaggle datasets list -s artworks


ref                                                         title                                                 size  lastUpdated                 downloadCount  voteCount  usabilityRating  
----------------------------------------------------------  ---------------------------------------------  -----------  --------------------------  -------------  ---------  ---------------  
ikarus777/best-artworks-of-all-time                         Best Artworks of All Time                       2460161668  2019-03-02 09:21:51.907000          38927       1029  1.0              
momanyc/museum-collection                                   Museum of Modern Art Collection                    6199715  2017-02-15 14:40:59.537000           7473        158  0.8235294        
metmuseum/the-metropolitan-museum-of-art-open-access        The Metropolitan Museum of Art Open Access        26390765  2017-04-07 07:57:15.063000           1595         47  0.8235294        
jackogozaly/moma-artworks-on-view       

In [None]:
!kaggle datasets download -d ikarus777/best-artworks-of-all-time -p data --unzip


Dataset URL: https://www.kaggle.com/datasets/ikarus777/best-artworks-of-all-time
License(s): CC-BY-NC-SA-4.0
Downloading best-artworks-of-all-time.zip to data
 99% 2.28G/2.29G [00:25<00:00, 188MB/s]
100% 2.29G/2.29G [00:25<00:00, 95.4MB/s]


In [13]:
import os
os.makedirs("data/best-artworks-of-all-time", exist_ok=True)


for filename in os.listdir("data"):
    if filename != "best-artworks-of-all-time":
        os.rename(f"data/{filename}", f"data/best-artworks-of-all-time/{filename}")

os.listdir("data/best-artworks-of-all-time")


['artists.csv', 'resized', 'images']

In [16]:
# eda_analysis.py
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


BASE_DIR    = "data/best-artworks-of-all-time"
CSV_PATH    = os.path.join(BASE_DIR, "artists.csv")
IMAGES_DIR  = os.path.join(BASE_DIR, "images")
PLOTS_DIR   = "plots/eda"
os.makedirs(PLOTS_DIR, exist_ok=True)


df = pd.read_csv(CSV_PATH)


df_sorted = df.sort_values("paintings", ascending=False)
plt.figure(figsize=(10,5))
plt.bar(df_sorted["name"].head(20), df_sorted["paintings"].head(20))
plt.xticks(rotation=90)
plt.title("Топ‑20 художников по заявленному числу картин")
plt.ylabel("Paintings (заявлено в CSV)")
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "top20_paintings_declared.png"))
plt.close()


real_counts = []
for artist in df["name"]:
    artist_dir = os.path.join(IMAGES_DIR, artist)
    if os.path.isdir(artist_dir):
        real_counts.append(len(os.listdir(artist_dir)))
    else:
        real_counts.append(0)
df["real_count"] = real_counts
df["diff"] = df["real_count"] - df["paintings"]

plt.figure(figsize=(6,6))
plt.scatter(df["paintings"], df["real_count"], alpha=0.6)
plt.plot([0, max(df["paintings"])], [0, max(df["paintings"])], 'r--')
plt.title("Заявлено vs Реально (файлов)")
plt.xlabel("Заявлено в CSV")
plt.ylabel("Сколько файлов найдено")
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "declared_vs_real.png"))
plt.close()

nat_counts = df["nationality"].value_counts().head(10)
plt.figure(figsize=(8,5))
nat_counts.plot(kind="bar")
plt.title("Топ‑10 национальностей")
plt.ylabel("Количество художников")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "top10_nationalities.png"))
plt.close()


genre_counts = df["genre"].value_counts().head(10)
plt.figure(figsize=(8,5))
genre_counts.plot(kind="bar", color="green")
plt.title("Топ‑10 жанров")
plt.ylabel("Количество художников")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "top10_genres.png"))
plt.close()


def parse_years(s):
    if pd.isna(s):
        return np.nan, np.nan
    parts = re.split(r"[–—\-]", s)
    try:
        b, d = int(parts[0]), int(parts[1])
    except:
        return np.nan, np.nan
    return b, d

births, deaths = zip(*df["years"].apply(parse_years))
df["birth"] = births
df["death"] = deaths
df["lifespan"] = df["death"] - df["birth"]

# Распределение lifespan
plt.figure(figsize=(8,5))
df["lifespan"].dropna().plot(kind="hist", bins=20)
plt.title("Распределение продолжительности жизни художников")
plt.xlabel("Лет")
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "lifespan_hist.png"))
plt.close()

plt.figure(figsize=(6,6))
plt.scatter(df["lifespan"], df["paintings"], alpha=0.6)
plt.title("Paintings vs Lifespan")
plt.xlabel("Продолжительность жизни (лет)")
plt.ylabel("Число картин (заявлено)")
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "paintings_vs_lifespan.png"))
plt.close()

#  Вывод основных статистик в консоль
print("=== Статистика по lifspan ===")
print(df["lifespan"].describe().round(1))
print("\n=== Средняя разница (реал vs заявлено) ===")
print(df["diff"].describe().round(1))


=== Статистика по lifspan ===
count    50.0
mean     64.8
std      16.7
min      32.0
25%      55.2
50%      65.5
75%      77.5
max      98.0
Name: lifespan, dtype: float64

=== Средняя разница (реал vs заявлено) ===
count     50.0
mean    -168.9
std      157.5
min     -877.0
25%     -191.8
50%     -123.0
75%      -81.0
max      -24.0
Name: diff, dtype: float64
