In [4]:
from pyspark.sql import *

In [5]:
from pyspark.sql.functions import *

In [88]:
from pyspark.sql.window import *

In [None]:
spark = SparkSession.builder.appName("analysis").getOrCreate()

In [76]:
books_df = spark.read.csv("books.csv", header=True, inferSchema=True)

In [77]:
books_df.count()

11127

In [78]:
books_df.printSchema()

root
 |-- num_pages: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- bookID: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- publication_date: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- ratings_count: integer (nullable = true)
 |-- text_reviews_count: integer (nullable = true)
 |-- title: string (nullable = true)



In [79]:
# check if there are nulls in the dataset
null_counts = books_df.agg(*[sum(col(c).isNull().cast("int")).alias(c) for c in books_df.columns])
null_counts.show()

+---------+-------+--------------+------+----+------+-------------+----------------+---------+-------------+------------------+-----+
|num_pages|authors|average_rating|bookID|isbn|isbn13|language_code|publication_date|publisher|ratings_count|text_reviews_count|title|
+---------+-------+--------------+------+----+------+-------------+----------------+---------+-------------+------------------+-----+
|        0|      0|             0|     0|   0|     0|            0|               0|        0|            0|                 0|    0|
+---------+-------+--------------+------+----+------+-------------+----------------+---------+-------------+------------------+-----+



In [80]:
# identify if dataset has duplicate rows

isbn_total = books_df.groupBy("isbn").agg(count("*").alias("isbn_cnt")).selectExpr("sum(isbn_cnt) as isbn_total")
isbn13_total = books_df.groupBy("isbn13").agg(count("*").alias("isbn13_cnt")).selectExpr("sum(isbn13_cnt) as isbn13_total")
bookid_total = books_df.groupBy("bookID").agg(count("*").alias("bookID_cnt")).selectExpr("sum(bookID_cnt) as bookid_total")

result = isbn_total.crossJoin(isbn13_total).crossJoin(bookid_total)
# Select the desired columns
result.select("isbn_total", "isbn13_total", "bookid_total").show()

+----------+------------+------------+
|isbn_total|isbn13_total|bookid_total|
+----------+------------+------------+
|     11127|       11127|       11127|
+----------+------------+------------+



In [81]:
# How many total books are present in the database?
books_cnt = books_df.agg(countDistinct("bookID").alias("cnt_books"))
books_cnt.show()

+---------+
|cnt_books|
+---------+
|    11127|
+---------+



In [None]:
# How many books did each author release every year? Please note that the authors field may contain multiple authors.

In [87]:
# not accounting for multiple authors within a single columns
author_books_cnt = books_df.groupBy("authors", split("publication_date", "/")[2].alias("published_year"))\
.agg(countDistinct("bookID").alias("books_cnt"))\
.select("authors", "published_year", "books_cnt")

author_books_cnt.show()

+--------------------+--------------+---------+
|             authors|published_year|books_cnt|
+--------------------+--------------+---------+
|     E.L. Konigsburg|          2007|        1|
|James Daley/Barac...|          2006|        1|
|Lonely Planet/Sar...|          2004|        1|
|Anonymous/Alfred ...|          2004|        1|
|Rebecca J. Donatelle|          2006|        1|
|Sherrilyn Kenyon/...|          2006|        1|
|    Patricia Schultz|          2003|        1|
|Dante Alighieri/D...|          1981|        1|
|    Jayne Ann Krentz|          2004|        1|
|        Linda Turner|          1989|        1|
|J.R.R. Tolkien/Ch...|          2000|        2|
|David Foster Wallace|          1998|        1|
|     Terry Lee Rioux|          2005|        1|
|       John Newhouse|          2007|        1|
|          Pat Conroy|          2002|        2|
|    David McCullough|          2004|        1|
|        Shayla Black|          2007|        1|
|    Alden T. Vaughan|          2019|   

In [95]:
# How many books did each author release every year? Please note that the authors field may contain multiple authors.
# account for multiple authors within a single columns
individual_books_df = books_df.select(explode(split(books_df['authors'], '/')).alias("author"), split(books_df['publication_date'], '/')[2].alias("published_year"), books_df['bookId'])
# individual_books_df.show()
individual_author_cnt = individual_books_df.groupBy("author", "published_year")\
.agg(countDistinct("bookID").alias("books_cnt"))\
.select("author", "published_year", "books_cnt")

individual_author_cnt.show()

+--------------------+--------------+---------+
|              author|published_year|books_cnt|
+--------------------+--------------+---------+
|       Alma Flor Ada|          2004|        1|
|         Rachel Ryan|          2005|        1|
|      Delba Winthrop|          2002|        1|
|      Mary M. Flekke|          2006|        1|
|     E.L. Konigsburg|          2007|        1|
|         Larry Niven|          2004|        1|
|       Jincy Willett|          2002|        1|
|Rebecca J. Donatelle|          2006|        1|
|           Dick Hill|          2005|        4|
|       Howard Massey|          2007|        1|
|     Henning Mankell|          2004|        1|
|         Rosie Daley|          1994|        1|
|    Patricia Schultz|          2003|        1|
|        Janice Horne|          1971|        1|
|    Jayne Ann Krentz|          2004|        2|
|        Linda Turner|          1989|        1|
| Christiane Northrup|          2001|        1|
|      Kevin O'Malley|          2005|   

In [126]:
subquery1 = individual_author_cnt.groupBy("author")\
.agg(sum("books_cnt").alias("total_books_cnt"))

query = subquery1.withColumn("total_books_rnk", rank().over(Window.orderBy(subquery1.total_books_cnt.desc())))

result = query.filter(query.total_books_rnk == 1)
result.show()

23/05/26 15:48:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/26 15:48:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/26 15:48:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/26 15:48:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/26 15:48:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/26 15:48:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/26 1

+------------+---------------+---------------+
|      author|total_books_cnt|total_books_rnk|
+------------+---------------+---------------+
|Stephen King|             99|              1|
+------------+---------------+---------------+



23/05/26 15:48:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/26 15:48:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
