In [1]:
from google.colab import files
import os
import json
import getpass

use_upload = input('Do you want to upload the Kaggle credentials file? (y/n): ')
if use_upload.lower() == 'y':
    uploaded = files.upload()

    for name in uploaded.keys():
        if 'kaggle' in name.lower():
            os.rename(name, 'KaggleCredential.json')
            break
else:
    username = input('Enter your Kaggle username: ')
    password = getpass.getpass('Enter your Kaggle key: ')

    credentials = {'username': username, 'key': password}
    with open('KaggleCredential.json', 'w') as f:
        json.dump(credentials, f)

with open('KaggleCredential.json', 'r') as f:
    credentials = json.load(f)

username = credentials['username']
password = credentials['key']
print(username)

Do you want to upload the Kaggle credentials file? (y/n): y


Saving kaggle.json to kaggle.json
edoardovergani


In [2]:
credentials = {'username': username, 'key': password}
kaggle_dir = '/root/.kaggle'
if not os.path.exists(kaggle_dir):
    os.makedirs(kaggle_dir)
with open(os.path.join(kaggle_dir, 'kaggle.json'), 'w') as f:
    json.dump(credentials, f)

# Set the file permissions to read/write only for the owner
!chmod 600 /root/.kaggle/kaggle.json

In [3]:
import kaggle
import zipfile

kaggle.api.authenticate()

In [4]:
kaggle.api.dataset_download_file('maparla/prado-museum-pictures','prado.csv')

Dataset URL: https://www.kaggle.com/datasets/maparla/prado-museum-pictures


False

In [5]:
with zipfile.ZipFile('prado.csv.zip', 'r') as zip_ref:
    zip_ref.extractall()  # Extract to the current directory

In [6]:
!pip3 install pyspark
import pyspark
from pyspark.sql import SparkSession

import pyspark.sql.functions as f
from pyspark.sql import Window



In [7]:
import pandas as pd

In [8]:
df = pd.read_csv('prado.csv')

In [9]:
df_selected=df[['work_url', 'work_tags']]
df_selected.to_csv('prado_selected.csv', index=False)

In [10]:
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import explode, split, regexp_replace, col, trim

spark = SparkSession.builder.appName("PradoMuseumPictures").getOrCreate()

rdd = spark.sparkContext.textFile('prado_selected.csv', minPartitions=8)

schema = StructType([
    StructField("id", StringType(), True),
    StructField("work_tags", StringType(), True)
])

header = rdd.first()
data_rdd = rdd.filter(lambda row: row != header).map(lambda line: line.split(","))

row_rdd = data_rdd.map(lambda fields: Row(id=fields[0], work_tags=fields[1]))

df = spark.createDataFrame(row_rdd, schema)

exploded_df = df.withColumn("work_tag", explode(split(col("work_tags"), ';')))

cleaned_df = exploded_df.withColumn('work_tag', regexp_replace('work_tag', r'[+"\"]', ''))

cleaned_df = cleaned_df.filter(trim(col('work_tag')) != '')

exploded_rdd = cleaned_df.rdd

grouped_rdd = exploded_rdd.map(lambda row: (row.work_tag, row.id)).groupByKey().mapValues(list)

filtered_rdd = grouped_rdd.filter(lambda x: len(x[1]) > 1)

filtered_data = filtered_rdd.collect()
for work_tag, ids in filtered_data:
    print(f"Work Tag: {work_tag}, IDs: {ids}")



Work Tag: 1764, IDs: ['https://www.museodelprado.es/coleccion/obra-de-arte/cabeza-de-gigante/3c99ed98-f274-4e36-bfe2-ccd57db3ea27', 'https://www.museodelprado.es/coleccion/obra-de-arte/joven-con-turbante/896a9001-996a-4639-a26b-30da23b755a5', 'https://www.museodelprado.es/coleccion/obra-de-arte/cabeza-de-una-ninfa-y-esbozo-de-una-esfinge/20a55acd-c069-4645-8a6b-ec95529bd9e9', 'https://www.museodelprado.es/coleccion/obra-de-arte/cesto-con-telas/cb4e2899-902f-498e-83ee-41a08c4656d0', 'https://www.museodelprado.es/coleccion/obra-de-arte/santa-catalina-de-siena/a8409683-ec41-496b-82dd-e446549cc991', 'https://www.museodelprado.es/coleccion/obra-de-arte/la-virgen-anunciada/ac77a9bd-6606-42b3-a221-d9fb201c9121', 'https://www.museodelprado.es/coleccion/obra-de-arte/vaso-refrescador/86593dda-4568-45c1-998a-e1239eb69a42', 'https://www.museodelprado.es/coleccion/obra-de-arte/el-tocador-de-venus/adb7b3d1-f336-46e3-84df-347fd98e0361', 'https://www.museodelprado.es/coleccion/obra-de-arte/madame-de-p

In [11]:
import itertools
from itertools import combinations

def combination(row):
    pairs = list(combinations(row[1], 2))
    return pairs + [(b, a) for a, b in pairs]

linkage_rdd = filtered_rdd.flatMap(combination)

In [12]:
total_nodes = linkage_rdd.flatMap(lambda x: [x[0], x[1]]).distinct().count()

In [13]:
id2degree = linkage_rdd.map(lambda x: (x[0], 1)).reduceByKey(lambda x, y: x + y).collectAsMap()

In [14]:
p2diz = {node: 1 / total_nodes for node in id2degree.keys()}

In [18]:
P = linkage_rdd.map(lambda x: (x[0], x[1], 1/id2degree[x[0]]))

PT = P.map(lambda x: (x[1], x[0], x[2]))

epsilon = 1e-6  # Tolerance level
alpha = 0.85  # Damping factor
max_iterations = 10  # to prevent infinite loops

iteration_count = 0

while True:
    iteration_count += 1
    new_p = PT.map(lambda x: (x[0], alpha * x[2] * p2diz.get(x[1], 0)))\
              .reduceByKey(lambda x, y: x + y)\
              .collect()

    converged = True
    for idx, prb in new_p:
        if abs(p2diz[idx] - prb) > epsilon:
            converged = False
        p2diz[idx] = prb + (1 - alpha) / total_nodes

    if converged:
        print(f"Converged after {iteration_count} iterations.")
        break

    if iteration_count >= max_iterations:
        print(f"Reached maximum iterations ({max_iterations}) without convergence.")
        break

Reached maximum iterations (10) without convergence.


In [19]:
p2diz_sort = sorted(p2diz.items(), key=lambda x: x[1], reverse=True)

print('Highest-ranked pictures based on PageRank:')
for kv in range(min(20, len(p2diz_sort))):
    print(f'With PageRank: {p2diz_sort[kv][1]}, Picture ID: {p2diz_sort[kv][0]}')

Highest-ranked pictures based on PageRank:
With PageRank: 0.00014992517978630073, Picture ID: https://www.museodelprado.es/coleccion/obra-de-arte/sagrada-familia-con-san-juanito-y-santa-catalina/ad7a7cd5-5fad-41cd-90c3-47f8dd8996fe
With PageRank: 0.00014647587259316984, Picture ID: https://www.museodelprado.es/coleccion/obra-de-arte/apolo-servido-por-las-ninfas/222697ef-7345-446a-b0e1-033b5d92f502
With PageRank: 0.00014459240145153445, Picture ID: https://www.museodelprado.es/coleccion/obra-de-arte/virgen-con-el-nio-sentado-en-su-regazo-enmarcada/243bb020-03f5-4654-bd0f-80b4ecb82c44
With PageRank: 0.00014459240145153445, Picture ID: https://www.museodelprado.es/coleccion/obra-de-arte/el-sueo-de-san-jose-o-la-muerte-de-san-francisco/bef45e11-7e3e-42a9-a891-968f98a60411
With PageRank: 0.00014459240145153442, Picture ID: https://www.museodelprado.es/coleccion/obra-de-arte/anotacion-sobre-la-boda-del-artista-referencia-a/d67b5f82-8233-4809-8137-08eb3129451f
With PageRank: 0.000143678769950