<a href="https://colab.research.google.com/github/dariashcherbakovaaa/Algorithms-for-massive-data/blob/main/Algorithms_for_massive_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment settings & libraries

In [1]:
import os
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName('PageRank').getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark
import pyspark
type(spark)

sc = spark.sparkContext
spark

from google.colab import userdata
os.environ['KAGGLE_USERNAME'] = userdata.get("KAGGLE_USERNAME")
os.environ['KAGGLE_KEY'] = userdata.get("KAGGLE_KEY")

In [None]:
# LinkedIn dataset (Finding similar items, Market-basket analysis)
# !kaggle datasets download -d asaniczka/1-3m-linkedin-jobs-and-skills-2024
# !unzip 1-3m-linkedin-jobs-and-skills-2024.zip -d job_skills

# *The task is to implement a ranking system based on the PageRank index using the «Prado Museum Pictures» dataset, published on Kaggle under the MIT license. The entities to be ranked are pictures, linked together if they share at least a common tag in the work_tag attribute of the  prado.csv file, although you can experiment with different strategies in order to define links between pictures.*

In network science the page rank is used to calculate centrality

In [None]:
# Link analysis (PageRank)
!kaggle datasets download -d maparla/prado-museum-pictures
!unzip prado-museum-pictures.zip - d prada

In [None]:
!kaggle datasets download -d maparla/prado-museum-pictures
!unzip prado-museum-pictures.zip -d prada

In [29]:
import pandas as pd

df = pd.read_csv('/content/prada/prado.csv', sep = ',')

df[['work_url', 'work_tags']].sample(9)

Unnamed: 0,work_url,work_tags
10320,https://www.museodelprado.es/coleccion/obra-de...,Serie de dibujos preparatorios para las pechin...
1090,https://www.museodelprado.es/coleccion/obra-de...,Colección general de trages que en la actualid...
9385,https://www.museodelprado.es/coleccion/obra-de...,Óleo;Lienzo;Retrato de personaje aristocrático...
13171,https://www.museodelprado.es/coleccion/obra-de...,Buñol y alrededores;Gelatina / Colodión;Placa ...
4759,https://www.museodelprado.es/coleccion/obra-de...,Óleo;Lienzo;Arquitectura religiosa;Caballo (Eq...
4610,https://www.museodelprado.es/coleccion/obra-de...,Serie de la Jerusalén liberada;Óleo;Lienzo;His...
5940,https://www.museodelprado.es/coleccion/obra-de...,Cuaderno de Burdeos II o Cuaderno H;Lápiz;Pape...
9114,https://www.museodelprado.es/coleccion/obra-de...,Óleo;Lienzo;Retrato de personaje aristocrático...
12883,https://www.museodelprado.es/coleccion/obra-de...,Clarión;Sanguina;Papel verjurado;Retrato;1701;...


In [54]:
prada_museum = spark.read.format('csv') \
    .option('header', 'true') \
    .option('sep', ',') \
    .option('inferSchema', 'true') \
    .option('path', '/content/prada/prado.csv') \
    .load()

In [55]:
prada_museum.printSchema()

root
 |-- work_url: string (nullable = true)
 |-- work_image_url: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_bio: string (nullable = true)
 |-- author_url: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- work_title: string (nullable = true)
 |-- work_subtitle: string (nullable = true)
 |-- work_exposed: string (nullable = true)
 |-- work_description: string (nullable = true)
 |-- work_tags: string (nullable = true)
 |-- technical_sheet_numero_de_catalogo: string (nullable = true)
 |-- technical_sheet_autor: string (nullable = true)
 |-- technical_sheet_titulo: string (nullable = true)
 |-- technical_sheet_fecha: string (nullable = true)
 |-- technical_sheet_tecnica: string (nullable = true)
 |-- technical_sheet_soporte: string (nullable = true)
 |-- technical_sheet_dimension: string (nullable = true)
 |-- technical_sheet_serie: string (nullable = true)
 |-- technical_sheet_procedencia: string (nullable = true)
 |-- bibliography: st

In [49]:
prada_museum.show(7)

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+------------+--------------------+-----------------+----------------------------------+---------------------+----------------------+---------------------+-----------------------+-----------------------+-------------------------+---------------------+---------------------------+------------+---------+-----------+---------+-----------------------+--------------------------------+-----------------------+--------------------+----------------------+-----------------------------------+
|            work_url|      work_image_url|              author|          author_bio|          author_url|  author_id|          work_title|       work_subtitle|work_exposed|    work_description|        work_tags|technical_sheet_numero_de_catalogo|technical_sheet_autor|technical_sheet_titulo|technical_sheet_fecha|technical_sheet_tecnica|technical_sheet_s

In [52]:
from pyspark.sql.functions import col, from_csv# Применяем функцию from_csv для столбца work_image_url
schema = "work_image_url STRING"
prada_museum = prada_museum.withColumn("work_image_url_parsed", from_csv(col("work_image_url"), schema))

# Извлекаем значение work_id из структуры
prada_museum = prada_museum.withColumn("work_id", prada_museum["work_image_url_parsed.work_image_url"].substr(-1, 100))

# Создаем столбец image_path
prada_museum = prada_museum.withColumn("image_path", "images/" + col("work_id"))

# Выводим первые 5 строк для проверки
prada_museum.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+------------+--------------------+-----------------+----------------------------------+---------------------+----------------------+---------------------+-----------------------+-----------------------+-------------------------+---------------------+---------------------------+------------+---------+-----------+---------+-----------------------+--------------------------------+-----------------------+--------------------+----------------------+-----------------------------------+-------+----------+---------------------+
|            work_url|      work_image_url|              author|          author_bio|          author_url|  author_id|          work_title|       work_subtitle|work_exposed|    work_description|        work_tags|technical_sheet_numero_de_catalogo|technical_sheet_autor|technical_sheet_titulo|technical_sheet_fecha|

In [40]:
from pyspark.sql.functions import concat, lit
prada_museum = prada_museum.withColumn("image_path", concat(lit("images/"), col("work_id")))

In [22]:
prada = prada_museum.select(prada_museum.columns[:11]) select('col1', 'col3')
prada = prada.drop(['author_bio', 'author_url', 'work_subtitle', 'work_exposed', 'work_description'])
prada.show(7)

TypeError: col should be a string or a Column

## Graph creation and visualisation

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

g = nx.DiGraph()

for p in pages:
    g.add_node(p)

for (a, b) in links:
    g.add_edge(pages[a], pages[b])

Build the transition matrix (the base to perform PageRank)