# An analysis of the subscription-based book-reading service's database

## Data overview

In [1]:
# silencing all warnings for the rest of the session (every category derives from Warning)
import warnings
warnings.filterwarnings(action='ignore', category=Warning)

In [2]:
# importing libraries
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

In [4]:
# setting parameters
# Connection settings are read from environment variables so that no secret is
# stored in the notebook. Non-secret values fall back to the course defaults;
# the password has no default and must be supplied via DB_PASSWORD.
db_config = {'user': os.environ.get('DB_USER', 'praktikum_student'),
             'pwd': os.environ.get('DB_PASSWORD'),
             'host': os.environ.get('DB_HOST', 'rc1b-wcoijxj3yxfsf3fs.mdb.yandexcloud.net'),
             'port': int(os.environ.get('DB_PORT', 6432)),
             'db': os.environ.get('DB_NAME', 'data-analyst-final-project-db')}


def build_connection_string(config):
    """Build a PostgreSQL connection URL from a settings dictionary.

    Parameters
    ----------
    config : dict
        Must contain 'user', 'host', 'port' and 'db'. 'pwd' is optional;
        when it is missing or None the password part is left out of the URL.

    Returns
    -------
    str
        URL of the form 'postgresql://user[:password]@host:port/db'.
    """
    # Percent-encode the credentials: characters such as '@', ':' or '/'
    # would otherwise be read as URL delimiters.
    credentials = quote(str(config['user']), safe='')
    password = config.get('pwd')
    if password is not None:
        credentials = '{}:{}'.format(credentials, quote(str(password), safe=''))
    # NOTE(review): with no password in the URL the driver is expected to fall
    # back to libpq's own sources (PGPASSWORD, ~/.pgpass) - confirm for your setup.
    return 'postgresql://{}@{}:{}/{}'.format(credentials,
                                             config['host'],
                                             config['port'],
                                             config['db'])


connection_string = build_connection_string(db_config)

In [5]:
# creating the SQLAlchemy engine; connections are opened with sslmode=require
engine = create_engine(
    connection_string,
    connect_args={'sslmode': 'require'},
)

In [6]:
# loading every table into a DataFrame, then displaying its first rows and a summary of its columns
for table in ['books', 'authors', 'publishers', 'ratings', 'reviews']:
    # table names come from the fixed list above, so formatting them into the SQL text is safe here
    query = ''' SELECT *
                FROM {}
            '''.format(table)
    request = pd.read_sql(query, con=engine)
    print()
    print(f'Table - {table}')
    display(request.head())
    # DataFrame.info() prints its report itself and returns None,
    # so wrapping it in print() only adds a stray "None" line to the output
    request.info()
    


Table - books


Unnamed: 0,book_id,author_id,title,num_pages,publication_date,publisher_id
0,1,546,'Salem's Lot,594,2005-11-01,93
1,2,465,1 000 Places to See Before You Die,992,2003-05-22,336
2,3,407,13 Little Blue Envelopes (Little Blue Envelope...,322,2010-12-21,135
3,4,82,1491: New Revelations of the Americas Before C...,541,2006-10-10,309
4,5,125,1776,386,2006-07-04,268


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
book_id             1000 non-null int64
author_id           1000 non-null int64
title               1000 non-null object
num_pages           1000 non-null int64
publication_date    1000 non-null object
publisher_id        1000 non-null int64
dtypes: int64(4), object(2)
memory usage: 47.0+ KB
None

Table - authors


Unnamed: 0,author_id,author
0,1,A.S. Byatt
1,2,Aesop/Laura Harris/Laura Gibbs
2,3,Agatha Christie
3,4,Alan Brennert
4,5,Alan Moore/David Lloyd


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 636 entries, 0 to 635
Data columns (total 2 columns):
author_id    636 non-null int64
author       636 non-null object
dtypes: int64(1), object(1)
memory usage: 10.0+ KB
None

Table - publishers


Unnamed: 0,publisher_id,publisher
0,1,Ace
1,2,Ace Book
2,3,Ace Books
3,4,Ace Hardcover
4,5,Addison Wesley Publishing Company


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 2 columns):
publisher_id    340 non-null int64
publisher       340 non-null object
dtypes: int64(1), object(1)
memory usage: 5.4+ KB
None

Table - ratings


Unnamed: 0,rating_id,book_id,username,rating
0,1,1,ryanfranco,4
1,2,1,grantpatricia,2
2,3,1,brandtandrea,5
3,4,2,lorichen,3
4,5,2,mariokeller,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6456 entries, 0 to 6455
Data columns (total 4 columns):
rating_id    6456 non-null int64
book_id      6456 non-null int64
username     6456 non-null object
rating       6456 non-null int64
dtypes: int64(3), object(1)
memory usage: 201.8+ KB
None

Table - reviews


Unnamed: 0,review_id,book_id,username,text
0,1,1,brandtandrea,Mention society tell send professor analysis. ...
1,2,1,ryanfranco,Foot glass pretty audience hit themselves. Amo...
2,3,2,lorichen,Listen treat keep worry. Miss husband tax but ...
3,4,3,johnsonamanda,Finally month interesting blue could nature cu...
4,5,3,scotttamara,Nation purpose heavy give wait song will. List...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2793 entries, 0 to 2792
Data columns (total 4 columns):
review_id    2793 non-null int64
book_id      2793 non-null int64
username     2793 non-null object
text         2793 non-null object
dtypes: int64(2), object(2)
memory usage: 87.4+ KB
None


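All five tables were loaded and displayed successfully. None of the columns contain missing values, and the column data types are appropriate (note that `publication_date` is read as `object` rather than as a datetime type).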

## Counting how many books have been published since January 1, 2000

In [8]:
# counting books by publication date; the comparison is strict,
# so a book dated exactly 2000-01-01 is not included in the count
query = ''' SELECT COUNT(book_id) as book_count
                   FROM books
                   WHERE publication_date > '2000-01-01'
        '''
books_2000 = pd.read_sql(sql=query, con=engine)
books_2000

Unnamed: 0,book_count
0,819


In [13]:
# the query returns a single row, so the count is the first value of the book_count column
print(f'Since January 1, 2000, {books_2000["book_count"].iloc[0]} books have been published.')

Since January 1, 2000, 819 books have been published.


## Calculating the average rating and number of reviews for each book

In [15]:
# Reviews and ratings are aggregated per book in separate subqueries and only
# then joined to books. Joining both raw tables to books at once multiplies the
# rows of each book (reviews x ratings) and needs DISTINCT to undo the damage.
# Books without reviews get a count of 0; books without ratings get a NULL average.
# The secondary sort uses the unrounded average so that books showing the same
# rounded rating are still ordered by their real rating.
query = ''' SELECT b.title AS book_title,
                   b.book_id,
                   COALESCE(rv.n_reviews, 0) AS review_count,
                   ROUND(rt.avg_rating, 1) AS ratings_avg
            FROM books b
            LEFT JOIN (SELECT book_id,
                              COUNT(DISTINCT review_id) AS n_reviews
                       FROM reviews
                       GROUP BY book_id) rv ON b.book_id = rv.book_id
            LEFT JOIN (SELECT book_id,
                              AVG(rating) AS avg_rating
                       FROM ratings
                       GROUP BY book_id) rt ON b.book_id = rt.book_id
            ORDER BY review_count DESC,
                     rt.avg_rating DESC
        '''

reviews_cnt = pd.read_sql(query, con=engine)
# showing only the top of the ranking instead of dumping every book
reviews_cnt.head(10)

Unnamed: 0,book_title,book_id,review_count,ratings_avg
0,Twilight (Twilight #1),948,7,3.7
1,Harry Potter and the Prisoner of Azkaban (Harr...,302,6,4.4
2,Harry Potter and the Chamber of Secrets (Harry...,299,6,4.3
3,The Book Thief,656,6,4.3
4,The Glass Castle,734,6,4.2
5,Outlander (Outlander #1),497,6,4.1
6,The Curious Incident of the Dog in the Night-Time,695,6,4.1
7,The Hobbit or There and Back Again,750,6,4.1
8,The Lightning Thief (Percy Jackson and the Oly...,779,6,4.1
9,Water for Elephants,963,6,4.0


In [16]:
# the first row of reviews_cnt is the book with the most reviews
print(f'Most book reviews - {reviews_cnt["review_count"].iloc[0]}, average rating - {reviews_cnt["ratings_avg"].iloc[0]}.')

Most book reviews - 7, average rating - 3.7.


## Identifying the publisher that has released the most books longer than 50 pages (to exclude brochures from the analysis)

In [18]:
# Counting books longer than 50 pages per publisher and keeping the top one.
# The filter on b.num_pages discards publishers without matching books anyway,
# so an INNER JOIN states the intent directly; GROUP BY already yields one row
# per publisher, which makes DISTINCT unnecessary.
# publisher_id is added as a tie-break so LIMIT 1 always returns the same row.
query = ''' SELECT p.publisher_id,
                   p.publisher,
                   COUNT(b.book_id) AS book_count
            FROM publishers p
            INNER JOIN books b ON p.publisher_id = b.publisher_id
            WHERE b.num_pages > 50
            GROUP BY p.publisher_id,
                     p.publisher
            ORDER BY book_count DESC,
                     p.publisher_id
            LIMIT 1
        '''
publishers_50 = pd.read_sql(query, con=engine)
publishers_50

Unnamed: 0,publisher_id,publisher,book_count
0,212,Penguin Books,42


In [20]:
# publishers_50 holds a single row: the top publisher and its book count
print(f'The publisher that has released the most books over 50 pages is {publishers_50["publisher"].iloc[0]}. '
      f'Number of books - {publishers_50["book_count"].iloc[0]}.')

The publisher that has released the most books over 50 pages is Penguin Books. Number of books - 42.


## Identifying the author whose books have the highest average rating (considering only books with 50 or more ratings)

In [21]:
# Average rating per author, taken over all individual ratings of the author's
# books that have 50 or more ratings (not an average of per-book averages).
# Rows are ordered by the unrounded average: ordering by the rounded alias
# leaves authors that round to the same value (e.g. 4.3) in arbitrary order,
# so the first row would not reliably be the best-rated author.
# author_id is a final tie-break to keep the order deterministic.
query = ''' SELECT a.author_id,
                   a.author,
                   ROUND(AVG(rt.rating), 1) AS rating_avg
            FROM authors a
            INNER JOIN books b ON a.author_id = b.author_id
            INNER JOIN ratings rt ON b.book_id = rt.book_id
            WHERE b.book_id IN (SELECT book_id
                                FROM ratings
                                GROUP BY book_id
                                HAVING COUNT(rating) >= 50)
            GROUP BY a.author_id,
                     a.author
            ORDER BY AVG(rt.rating) DESC,
                     a.author_id
        '''
ratings_max = pd.read_sql(query, con=engine)
ratings_max

Unnamed: 0,author_id,author,rating_avg
0,236,J.K. Rowling/Mary GrandPré,4.3
1,402,Markus Zusak/Cao Xuân Việt Khương,4.3
2,376,Louisa May Alcott,4.2
3,240,J.R.R. Tolkien,4.2
4,498,Rick Riordan,4.1
5,621,William Golding,3.9
6,469,Paulo Coelho/Alan R. Clarke/Özdemir İnce,3.8
7,630,William Shakespeare/Paul Werstine/Barbara A. M...,3.8
8,372,Lois Lowry,3.8
9,235,J.D. Salinger,3.8


In [24]:
# the first row of ratings_max is the top-ranked author
print(f'The author whose books have received the highest average rating '
      f'(only take into account books with 50 or more ratings) is {ratings_max["author"].iloc[0]}, '
      f'average rating - {ratings_max["rating_avg"].iloc[0]}.')

The author whose books have received the highest average rating (only take into account books with 50 or more ratings) is J.K. Rowling/Mary GrandPré, average rating - 4.3.


## Calculating the average number of reviews from users who have given more than 50 ratings

In [26]:
# Average number of reviews among users who have given more than 50 ratings.
# The qualifying users are selected first and reviews are LEFT JOINed to them,
# so a user with many ratings but no reviews contributes 0 to the average
# instead of silently dropping out (which would bias the result upward).
query = ''' SELECT ROUND(AVG(uc.review_count), 1) AS review_count_avg
            FROM
                 (SELECT au.username,
                         COUNT(DISTINCT rv.review_id) AS review_count
                  FROM
                       (SELECT username
                        FROM ratings
                        GROUP BY username
                        HAVING COUNT(DISTINCT rating_id) > 50) AS au
                  LEFT JOIN reviews rv ON rv.username = au.username
                  GROUP BY au.username) AS uc
        '''
review_count_avg = pd.read_sql(query, con=engine)
review_count_avg

Unnamed: 0,review_count_avg
0,24.3


In [28]:
# the query returns a single aggregated value
print(f'The average number of reviews from users who have given more than 50 ratings is '
      f'{review_count_avg["review_count_avg"].iloc[0]}.')

The average number of reviews from users who have given more than 50 ratings is 24.3.


## Conclusion

1. Since January 1, 2000, 819 books have been published.
2. The most-reviewed book is Twilight (Twilight #1), with 7 reviews and an average rating of 3.7.
3. The publisher that has released the most books over 50 pages is Penguin Books. Number of books - 42.
4. The author whose books have received the highest average rating (only take into account books with 50 or more ratings) is J.K. Rowling/Mary GrandPré, average rating - 4.3.
5. The average number of reviews from users who have given more than 50 ratings is 24.3.

**Recommendations**

The `reviews_cnt` table lists each book with its number of reviews and its average rating, ordered from the most-reviewed book down, so its top rows are the most popular titles. The `publishers_50` table holds the publisher that has released the most books longer than 50 pages. The `ratings_max` table ranks authors by the average rating of their books; only books with 50 or more ratings were taken into account there, which makes these averages more reliable. In addition, it was calculated that users who have given more than 50 ratings wrote 24.3 reviews on average. Consider these tables when drafting startup proposals.