In [1]:
#load library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
# Paths to the CSV input files (relative to the notebook's working directory).
# Nothing is read here; the files are loaded later by load_data / pd.read_csv.
rating_path = 'data/ratings.csv'
book_path = 'data/books.csv'
book_tags_path = 'data/book_tags.csv'
tags_path = 'data/tags.csv'

In [3]:
def load_data(book_path, rating_path):
    """
    Load the book and rating CSV files and return cleaned DataFrames.

    Steps applied:
    - drop the book columns that are not used downstream
    - fill missing ``original_publication_year`` with 0 and store it as int
    - fill missing ``original_title`` with the book's ``title``
    - drop duplicate books (same ``book_id`` and ``goodreads_book_id``) and
      duplicate ratings (same ``user_id`` and ``book_id``), keeping the first

    Missing-value counts and shapes are printed before/after each cleaning step.

    Parameters
    ----------
    book_path : str
        Path to the books CSV file.
    rating_path : str
        Path to the ratings CSV file.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(book_copy, rating_data)``: the cleaned books and ratings.
    """
    # Read both CSV files into DataFrames.
    rating_data = pd.read_csv(rating_path, delimiter=',')
    book_data = pd.read_csv(book_path, delimiter=',')

    # Columns of the books file that are not needed downstream.
    unused_columns = [
        'best_book_id', 'work_id', 'books_count', 'isbn',
        'isbn13', 'title', 'language_code', 'average_rating',
        'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
        'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
        'small_image_url',
    ]
    # drop() returns a new frame, so book_data (and its 'title') stays intact.
    book_copy = book_data.drop(columns=unused_columns)

    # Fill null values in the book data.
    print("Missing values before fillna: ", book_copy.isnull().sum())
    # Assign the column directly: `df.loc[:, col] = values` writes into the
    # existing float column on recent pandas, which silently keeps float64.
    book_copy['original_publication_year'] = (
        book_copy['original_publication_year'].fillna(0).astype(int)
    )
    # 'title' was dropped from book_copy, so take the fallback from book_data;
    # fillna aligns the two on the shared row index.
    book_copy['original_title'] = book_copy['original_title'].fillna(book_data['title'])
    print("Missing values after fillna: ", book_copy.isnull().sum())

    # Drop duplicated rows.
    print("Books shape before drop dup: ", book_copy.shape)
    print("Ratings shape before drop dup: ", rating_data.shape)
    book_copy = book_copy.drop_duplicates(subset=['book_id', 'goodreads_book_id'])
    rating_data = rating_data.drop_duplicates(subset=['user_id', 'book_id'])
    print("Books shape after drop dup: ", book_copy.shape)
    print("Ratings shape after drop dup: ", rating_data.shape)

    return book_copy, rating_data

In [4]:
# Load and clean the book and rating tables from the paths defined above.
book_copy, rating_data = load_data(book_path, rating_path)

Missing values before fillna:  book_id                        0
goodreads_book_id              0
authors                        0
original_publication_year     21
original_title               585
image_url                      0
dtype: int64
Missing values after fillna:  book_id                      0
goodreads_book_id            0
authors                      0
original_publication_year    0
original_title               0
image_url                    0
dtype: int64
Books shape before drop dup:  (10000, 6)
Ratings shape before drop dup:  (5976479, 3)
Books shape after drop dup:  (10000, 6)
Ratings shape after drop dup:  (5976479, 3)


In [10]:
# Separate out the 40 genres

# Raw tag tables; they are combined later when genres are extracted.
tags = pd.read_csv(tags_path, delimiter=',')
book_tags = pd.read_csv(book_tags_path, delimiter=',')

# Genre names, lower-cased up front so they can be compared against
# lower-cased tag names.
genres = [
    name.lower()
    for name in (
        "Art", "Biography", "Business", "Chick Lit", "Children", "Christian", "Classics",
        "Comics", "Contemporary", "Cookbooks", "Crime", "Ebooks", "Fantasy", "Fiction",
        "Gay and Lesbian", "Graphic Novels", "Historical Fiction", "History", "Horror",
        "Humor and Comedy", "Manga", "Memoir", "Music", "Mystery", "Nonfiction", "Paranormal",
        "Philosophy", "Poetry", "Psychology", "Religion", "Romance", "Science", "Science Fiction",
        "Self Help", "Suspense", "Spirituality", "Sports", "Thriller", "Travel", "Young Adult",
    )
]

def create_genre_list(tag, genre_names=None):
    """
    Function for building list of extracted genres from a single tag name.

    Matching is a plain substring search on ``str(tag)``; the tag is expected
    to be lower-cased already. Rules, in priority order:

    - a tag containing 'nonfiction' yields ``['nonfiction']`` only
    - a tag containing 'sci-fi', 'scifi' or 'science fiction' yields
      ``['science fiction']`` only
    - any other tag containing 'non' yields ``[]`` (so that e.g.
      'non-fiction' is not counted as 'fiction')
    - otherwise every genre name occurring in the tag is returned, once each,
      in the order of ``genre_names``

    Parameters
    ----------
    tag : object
        Tag name; converted with ``str()`` before matching.
    genre_names : iterable of str, optional
        Genre names to look for. Defaults to the notebook-level ``genres``.

    Returns
    -------
    list of str
        Matched genre names, without duplicates.
    """
    if genre_names is None:
        genre_names = genres  # notebook-level list of lower-cased genre names
    string_tag = str(tag)

    # The special cases do not depend on the individual genre, so they are
    # decided once up front. Testing them inside the per-genre loop appended
    # the same label once for every genre in the list.
    if 'nonfiction' in string_tag:
        return ['nonfiction']
    if any(alias in string_tag for alias in ('sci-fi', 'scifi', 'science fiction')):
        return ['science fiction']
    # NOTE(review): this also discards tags such as 'non-fiction' entirely
    # (they match neither 'nonfiction' nor any genre) - confirm this is wanted.
    if 'non' in string_tag:
        return []

    # NOTE(review): multi-word genres are matched with a space; if tag names
    # use another separator (e.g. 'young-adult') they will not match - confirm
    # against the tag data.
    return [genre for genre in genre_names if genre in string_tag]


def unique_array(list_):
    """Return the distinct elements of ``list_`` as a list (order not preserved)."""
    return list(set(list_))


def extract_genres(book_tags, tags, genres, books=None):
    """
    Function to extract genres from tag names and attach them to books.

    Parameters
    ----------
    book_tags : pandas.DataFrame
        Needs the columns ``goodreads_book_id`` and ``tag_id``.
    tags : pandas.DataFrame
        Needs the columns ``tag_id`` and ``tag_name``. Side effect: the columns
        ``tag_name_lower`` and ``genre_list`` are added to this frame in place.
    genres : iterable of str
        Lower-cased genre names to search for in the tag names.
    books : pandas.DataFrame, optional
        Needs the columns ``book_id`` and ``goodreads_book_id``. Defaults to
        the notebook-level ``book_copy``.

    Returns
    -------
    pandas.DataFrame
        Columns ``book_id`` and ``genres``; ``genres`` holds a list of distinct
        genre names, or NaN for a book none of whose tags matched a genre.
    """
    if books is None:
        books = book_copy  # notebook-level frame produced by load_data

    # Derive one genre list per tag from its lower-cased name.
    tags['tag_name_lower'] = tags['tag_name'].str.lower()
    tags['genre_list'] = tags['tag_name_lower'].map(
        lambda name: create_genre_list(name, genres)
    )

    # Keep only the tags that matched at least one genre.
    has_genre = tags['genre_list'].str.len() != 0
    tags_filtered = tags.loc[has_genre, ['tag_id', 'genre_list']]

    # Inner join: book/tag pairs whose tag carries no genre are dropped.
    booktags_to_genre = book_tags[['goodreads_book_id', 'tag_id']].merge(
        tags_filtered, how="inner", on="tag_id"
    )

    # Collect the distinct genres of every book across all of its tags.
    gr_book_genres = (
        booktags_to_genre
        .groupby('goodreads_book_id')['genre_list']
        .agg(lambda lists: unique_array([genre for sub in lists for genre in sub]))
        .rename('genres')
        .reset_index()
    )

    # Left join keeps every book; books without a matching tag get NaN genres.
    books_with_genres = books.merge(gr_book_genres, how="left", on="goodreads_book_id")
    return books_with_genres[["book_id", "genres"]]


In [11]:
# Build the per-book genre lists from the tag tables and display the result.
books_with_genres = extract_genres(book_tags, tags, genres)
books_with_genres

Unnamed: 0,book_id,genres
0,1,"[romance, fiction, thriller, fantasy, science ..."
1,2,"[paranormal, mystery, fiction, fantasy, scienc..."
2,3,"[paranormal, horror, romance, fiction, fantasy..."
3,4,"[mystery, fiction, history, contemporary, crim..."
4,5,"[romance, fiction, ebooks, classics]"
...,...,...
9995,9996,"[paranormal, mystery, romance, fiction, fantas..."
9996,9997,"[memoir, nonfiction, history, art, science, bi..."
9997,9998,"[fiction, history, contemporary, ebooks, class..."
9998,9999,"[psychology, memoir, nonfiction, history, cont..."


In [12]:
# Inspect the column dtypes of the book/genre table.
books_with_genres.dtypes


book_id     int64
genres     object
dtype: object

In [13]:
# Spot check: keep only the books whose genre list contains 'nonfiction'.
mask = books_with_genres.genres.apply(lambda x: 'nonfiction' in x)
df1 = books_with_genres[mask]
df1

Unnamed: 0,book_id,genres
14,15,"[memoir, nonfiction, children, fiction, histor..."
39,40,"[religion, memoir, nonfiction, travel, romance..."
49,50,"[poetry, nonfiction, fiction, fantasy, childre..."
80,81,"[psychology, memoir, nonfiction, fiction, cont..."
81,82,"[memoir, nonfiction, travel, fiction, philosop..."
...,...,...
9987,9988,"[religion, nonfiction, philosophy, history, ar..."
9992,9993,"[nonfiction, travel, fiction, fantasy, history..."
9996,9997,"[memoir, nonfiction, history, art, science, bi..."
9998,9999,"[psychology, memoir, nonfiction, history, cont..."


In [14]:
# Spot check: keep only the books whose genre list contains 'science fiction'.
# Note that `mask` and `df1` from the previous cell are overwritten here.
mask = books_with_genres.genres.apply(lambda x: 'science fiction' in x)
df1 = books_with_genres[mask]
df1

Unnamed: 0,book_id,genres
0,1,"[romance, fiction, thriller, fantasy, science ..."
1,2,"[paranormal, mystery, fiction, fantasy, scienc..."
2,3,"[paranormal, horror, romance, fiction, fantasy..."
6,7,"[fiction, fantasy, science fiction, art, scien..."
8,9,"[religion, mystery, romance, fiction, thriller..."
...,...,...
9984,9985,"[mystery, fiction, thriller, fantasy, science ..."
9985,9986,"[mystery, christian, travel, romance, thriller..."
9991,9992,"[fiction, fantasy, science fiction, art, scien..."
9993,9994,"[mystery, romance, fiction, fantasy, science f..."


In [19]:
# Number of ratings per book, sorted ascending by the rating count.
rating_data.groupby('book_id').count().sort_values('rating')

Unnamed: 0_level_0,user_id,rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
7803,8,8
9345,11,11
9486,24,24
1935,33,33
9315,36,36
...,...,...
5,16604,16604
3,16931,16931
4,19088,19088
2,21850,21850


In [20]:
# How many users gave each rating value to the book with book_id 1.
rating_data[rating_data.book_id == 1].groupby(['book_id', 'rating']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id
book_id,rating,Unnamed: 2_level_1
1,1,231
1,2,636
1,3,3013
1,4,7569
1,5,11357
