In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [5]:
spark = SparkSession.builder.appName("books-and-authors").getOrCreate()

In [8]:
books_df = spark.read.csv("data/book_dir/books.csv", header=True, inferSchema=True)
authors_df = spark.read.csv("data/book_dir/authors.csv", header=True, inferSchema=True)

In [13]:
books_df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: integer (nullable = true)
 |-- genre: string (nullable = true)
 |-- price: double (nullable = true)
 |-- publish_date: date (nullable = true)



In [14]:
books_authors_df = books_df.join(authors_df, on = "author_id", how="inner")

Объединение таблиц

In [15]:
books_authors_df.show()

+---------+-------+-------+-----------+-----+------------+---------+----------+---------+
|author_id|book_id|  title|      genre|price|publish_date|     name|birth_date|  country|
+---------+-------+-------+-----------+-----+------------+---------+----------+---------+
|        2|      1| Book_1|    Mystery|73.57|  1980-12-31| Author_2|1965-12-31|   Canada|
|        1|      2| Book_2|Non-Fiction| 41.1|  1982-12-31| Author_1|1960-12-31|    India|
|       10|      3| Book_3|    Fiction|10.63|  1984-12-31|Author_10|2005-12-31|    India|
|        9|      4| Book_4|Non-Fiction|46.31|  1986-12-31| Author_9|2000-12-31|Australia|
|        7|      5| Book_5|    Science|31.13|  1988-12-31| Author_7|1990-12-31|      USA|
|        4|      6| Book_6|Non-Fiction| 83.7|  1990-12-31| Author_4|1975-12-31|       UK|
|        6|      7| Book_7|Non-Fiction|40.36|  1992-12-31| Author_6|1985-12-31|      USA|
|        2|      8| Book_8|Non-Fiction|84.48|  1994-12-31| Author_2|1965-12-31|   Canada|
|        7

Топ-5 авторов по выручке

In [19]:
best_authors_df = books_authors_df.groupBy("author_id", "name").agg(sum(col("price")).alias("total_revenue")).orderBy(col("total_revenue").desc()).limit(5)
best_authors_df.show()

+---------+--------+-------------+
|author_id|    name|total_revenue|
+---------+--------+-------------+
|        2|Author_2|       231.97|
|        7|Author_7|       132.66|
|        1|Author_1|       111.86|
|        8|Author_8|       107.16|
|        5|Author_5|        88.83|
+---------+--------+-------------+



Количество книг в каждом жанре

In [26]:
genre_df = books_authors_df.groupBy("genre").count().orderBy(col("count").desc())
genre_df.show()

+-----------+-----+
|      genre|count|
+-----------+-----+
|Non-Fiction|    9|
|    Science|    3|
|    Fiction|    3|
|    Fantasy|    3|
|    Mystery|    2|
+-----------+-----+



Средняя цена книг по каждому автору

In [31]:
avg_price_df = books_authors_df.groupBy(col("author_id"), col("name")).agg(avg("price").alias("average_price")).orderBy(col("average_price").desc())
avg_price_df.show()

+---------+---------+-----------------+
|author_id|     name|    average_price|
+---------+---------+-----------------+
|        5| Author_5|            88.83|
|        4| Author_4|             83.7|
|        2| Author_2|          57.9925|
|        9| Author_9|            46.31|
|        7| Author_7|            44.22|
|        6| Author_6|           43.965|
|        1| Author_1|37.28666666666667|
|        8| Author_8|            35.72|
|       10|Author_10|           21.165|
+---------+---------+-----------------+



In [None]:
Книги, опубликованные после 2000 года, отсортированные по цене

In [39]:
modern_books_df = books_authors_df.select("*").where(year("publish_date") > 2000).orderBy(col("price").desc())
modern_books_df.show()

+---------+-------+-------+-----------+-----+------------+--------+----------+---------+
|author_id|book_id|  title|      genre|price|publish_date|    name|birth_date|  country|
+---------+-------+-------+-----------+-----+------------+--------+----------+---------+
|        7|     20|Book_20|    Mystery|91.48|  2018-12-31|Author_7|1990-12-31|      USA|
|        5|     19|Book_19|    Science|88.83|  2016-12-31|Author_5|1980-12-31|      USA|
|        8|     15|Book_15|    Fantasy| 60.0|  2008-12-31|Author_8|1995-12-31|Australia|
|        6|     17|Book_17|    Fantasy|47.57|  2012-12-31|Author_6|1985-12-31|      USA|
|        1|     18|Book_18|Non-Fiction|43.92|  2014-12-31|Author_1|1960-12-31|    India|
|        2|     16|Book_16|    Fiction|36.22|  2010-12-31|Author_2|1965-12-31|   Canada|
|        8|     12|Book_12|Non-Fiction|31.02|  2002-12-31|Author_8|1995-12-31|Australia|
|        1|     14|Book_14|    Fiction|26.84|  2006-12-31|Author_1|1960-12-31|    India|
|        8|     13|Bo