In [1]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import numpy as np

In [3]:
# Build the notebook's SparkSession, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("notion")
    .getOrCreate()
)

23/05/16 10:23:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [6]:
# Read the CSV with a header row and let Spark infer the column types.
file_name = "books.csv"
books_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(file_name)
)

In [7]:
# Number of rows in the loaded frame.
books_df.count()

11127

In [8]:
# Show the column names and the types Spark inferred for them.
# NOTE(review): in the recorded output num_pages and average_rating come back
# as string, so cast them explicitly before any numeric use.
books_df.printSchema()

root
 |-- num_pages: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- bookID: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- publication_date: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- ratings_count: integer (nullable = true)
 |-- text_reviews_count: integer (nullable = true)
 |-- title: string (nullable = true)



In [None]:
# deduping
def dedup(df):
    """Return ``df`` with fully identical rows removed.

    Args:
        df: The DataFrame to de-duplicate. Only this argument is inspected;
            no notebook-level variable is read.

    Returns:
        A DataFrame holding each distinct row of ``df`` once. A DataFrame is
        always returned, including when ``df`` had no duplicates to begin with.
    """
    # dropDuplicates() leaves a frame without duplicates unchanged in content,
    # so there is no need to run count() and distinct().count() first -- that
    # pre-check costs two full scans just to decide whether to call it.
    return df.dropDuplicates()
# Bind dedup's result to a new name; books_df itself is not reassigned here.
new_df = dedup(books_df)

In [None]:
# How many total books are present in the database?

In [22]:
# Expose the frame to Spark SQL under the view name `books`.
# NOTE(review): this registers books_df (the raw load), not new_df.
books_df.createOrReplaceTempView("books")

In [29]:
# Count unique books.
# COUNT(DISTINCT bookID) already collapses repeated IDs (and ignores NULLs), so
# a per-bookID RANK() pass filtered to rank 1 cannot change the number -- every
# bookID always has a rank-1 row -- and only adds a window/shuffle stage.
query = spark.sql("""
SELECT COUNT(DISTINCT bookID) AS books_cnt
FROM books
""")
query.show()

+---------+
|books_cnt|
+---------+
|    11127|
+---------+



In [None]:
# How many books did each author release every year? Please note that the authors field may contain multiple authors.

In [32]:
# Peek at distinct raw values of the `authors` column (capped at 100 rows;
# show() prints only the first 20 of them).
distinct_authors_sql = """
SELECT 
DISTINCT authors
FROM books
LIMIT 100
"""
query = spark.sql(distinct_authors_sql)
query.show()

+--------------------+
|             authors|
+--------------------+
|          James Frey|
|Giorgio De Santil...|
|     Eric Klinenberg|
|     Karen Armstrong|
|Andy Hunt/Dave Th...|
|Judi Barrett/Ron ...|
|Louis-Ferdinand C...|
|Albert Einstein/H...|
|Leo Tolstoy/Const...|
|Jonathan Clements...|
|         Ann Rinaldi|
|          Dava Sobel|
|Hildegarde Hoyt S...|
|         Ann Beattie|
|Gardner Dozois/Ge...|
|Grant Morrison/St...|
|Albert Jack/Ann Page|
|Wendy  Mitchell/J...|
|William Shakespea...|
|Satoru Kannagi/Ho...|
+--------------------+
only showing top 20 rows



In [60]:
# One output row per (book, individual author): split the '/'-separated
# `authors` string and explode the resulting array into an `author` column.
author_col = F.explode(F.split(F.col("authors"), "/")).alias("author")
df = books_df.select("authors", "bookID", "publication_date", author_col)
df.show()



+--------------------+------+----------------+--------------------+
|             authors|bookID|publication_date|              author|
+--------------------+------+----------------+--------------------+
|J.K. Rowling/Mary...|     1|       9/16/2006|        J.K. Rowling|
|J.K. Rowling/Mary...|     1|       9/16/2006|       Mary GrandPré|
|J.K. Rowling/Mary...|     2|        9/1/2004|        J.K. Rowling|
|J.K. Rowling/Mary...|     2|        9/1/2004|       Mary GrandPré|
|        J.K. Rowling|     4|       11/1/2003|        J.K. Rowling|
|J.K. Rowling/Mary...|     5|        5/1/2004|        J.K. Rowling|
|J.K. Rowling/Mary...|     5|        5/1/2004|       Mary GrandPré|
|J.K. Rowling/Mary...|     8|       9/13/2004|        J.K. Rowling|
|J.K. Rowling/Mary...|     8|       9/13/2004|       Mary GrandPré|
|W. Frederick Zimm...|     9|       4/26/2005|W. Frederick Zimm...|
|        J.K. Rowling|    10|       9/12/2005|        J.K. Rowling|
|       Douglas Adams|    12|       11/1/2005|  

In [63]:
# Register the exploded frame as the `authors` view queried by the SQL cells below.
df.createOrReplaceTempView("authors")

In [70]:
# Books released per author per publication year.
# publication_date is an 'M/D/YYYY' string, so the third '/'-separated piece is
# the year. GROUP BY already yields one row per (author, pub_year), so no
# SELECT DISTINCT is needed on top of it.
# NOTE(review): the WHERE clause restricts the result to a single author; drop
# it to answer the question for every author.
authors = spark.sql("""
SELECT
author,
SPLIT(publication_date, '/')[2] AS pub_year,
COUNT(DISTINCT bookID) AS books_cnt
FROM authors
WHERE author = 'Stephen King'
GROUP BY 1, 2
""")
authors.show()

+------------+--------+---------+
|      author|pub_year|books_cnt|
+------------+--------+---------+
|Stephen King|    2001|        8|
|Stephen King|    1984|        1|
|Stephen King|    1990|        1|
|Stephen King|    1992|        3|
|Stephen King|    1987|        2|
|Stephen King|    2002|       13|
|Stephen King|    1994|        1|
|Stephen King|    1991|        1|
|Stephen King|    1999|        7|
|Stephen King|    1995|        2|
|Stephen King|    1976|        2|
|Stephen King|    1980|        1|
|Stephen King|    1988|        1|
|Stephen King|    1979|        2|
|Stephen King|    2005|        9|
|Stephen King|    1975|        1|
|Stephen King|    2010|        1|
|Stephen King|    2006|        8|
|Stephen King|    2000|        4|
|Stephen King|    1997|        1|
+------------+--------+---------+
only showing top 20 rows



In [None]:
# Which individual authors released the largest number of books overall? 
# Your response should cover cases where there might be multiple authors who released the same maximum book count.

In [69]:
# Author(s) with the highest number of distinct books.
# DENSE_RANK gives tied authors the same rank, so every author sharing the
# maximum count is returned. GROUP BY already yields one row per author, so no
# SELECT DISTINCT is needed in the subquery.
# The window has no PARTITION BY (a global ranking is the point), which is why
# Spark logs the "No Partition Defined for Window operation" warning.
max_authors = spark.sql("""
SELECT
author
FROM
(
SELECT
author,
DENSE_RANK() OVER (ORDER BY COUNT(DISTINCT bookID) DESC) AS books_cnt_rnk
FROM authors
GROUP BY 1
)
WHERE books_cnt_rnk = 1
""")
max_authors.show()

23/05/16 13:02:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/16 13:02:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/16 13:02:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/16 13:02:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/16 13:02:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/16 13:02:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/16 1

+------------+
|      author|
+------------+
|Stephen King|
+------------+



In [None]:
# Author(s) with the highest number of distinct books, together with that count.
# The subquery must produce books_cnt_rnk itself, because the outer WHERE
# filters on it. DENSE_RANK gives tied authors the same rank, so all authors
# sharing the maximum count are kept.
max_authors_ = spark.sql("""
SELECT
author,
books_cnt
FROM
(
SELECT
author,
COUNT(DISTINCT bookID) AS books_cnt,
DENSE_RANK() OVER (ORDER BY COUNT(DISTINCT bookID) DESC) AS books_cnt_rnk
FROM authors
GROUP BY 1
)
WHERE books_cnt_rnk = 1
""")
# Display the frame built in this cell.
max_authors_.show()