In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Folder the input tables are read from and folder the feature tables are
# written to, both relative to the project root.
PRIM_FOLDER = "data/2_primary/"
FEAT_FOLDER = "data/3_feature/"

# Move the working directory to the project root, but only when the input
# folder is not already reachable from the current directory. An unconditional
# os.chdir("../../") climbs two more levels every time this cell is re-run,
# which silently breaks every relative path used below.
if not os.path.isdir(PRIM_FOLDER):
    os.chdir("../../")

os.makedirs(FEAT_FOLDER, exist_ok=True)

In [None]:
# Read the books and ratings tables from the primary data folder.
books_path = f"{PRIM_FOLDER}books.csv"
ratings_path = f"{PRIM_FOLDER}ratings.csv"

books = pd.read_csv(books_path)
ratings = pd.read_csv(ratings_path)

# Prepare book tag features
----

In [None]:
# Turn the comma-separated tag string of every book into a list of tags.
# Only string values are split: a value that is already a list (the cell was
# re-run) or a missing value is passed through unchanged, so the cell is
# idempotent and does not raise AttributeError on NaN.
books["tags"] = books["tags"].apply(
    lambda raw: raw.split(",") if isinstance(raw, str) else raw
)
books

#### Check frequency of tags

In [None]:
# Long format: one row per (isbn, tag) pair, then count the rows per tag.
isbn_tag_pairs = books[["isbn", "tags"]].explode("tags")
tag_cnt = isbn_tag_pairs.value_counts("tags")

tag_cnt

In [None]:
# Tag count divided by the number of books, plotted against the tag's
# position in `tag_cnt`.
tag_position = np.arange(len(tag_cnt))
tag_share = tag_cnt / len(books)
plt.scatter(x=tag_position, y=tag_share)

#### Select subset of frequent tags

In [None]:
# Labels of the first 50 entries of the tag counts.
top_tags = tag_cnt.head(50).index

# One-hot encode the exploded tags, then add up the indicator rows that share
# a book index, giving one row of tag counts per book.
tag_dummies = pd.get_dummies(books.tags.explode())
tags_all = tag_dummies.groupby(level=0).sum()

# Keep only the columns of the selected tags.
tags = tags_all.loc[:, top_tags]
tags.describe()

#### Check how many tags we have for each book

In [None]:
# Row-wise sum of the tag indicators: number of selected tags per book.
tags_per_book = tags.sum(axis="columns")
tags_per_book.hist()
tags_per_book.describe()

In [None]:
# number of books left without any of the selected tags
tags_per_book.eq(0).sum()

#### Merge tags back to the book table

In [None]:
# Guarantee at least one active indicator per book: books that carry none of
# the selected tags get the catch-all `other` flag.
# This will be helpful later when we prepare user features.
tags = tags.assign(other=tags_per_book.eq(0).astype(int))

# Put the indicators next to the book columns (aligned on the row index),
# drop the columns that are not needed in the feature table and remove the
# rows that became identical as a result.
book_tags = (
    pd.concat([books.drop(columns="price"), tags], axis=1)
    .drop(columns=["book_title", "book_author", "tags"])
    .drop_duplicates()
)

book_tags

In [None]:
# Write the tag feature table to the feature folder, without the row index.
book_tags_path = f"{FEAT_FOLDER}book_tags.csv"
book_tags.to_csv(book_tags_path, index=False)

# Prepare frequent book feature
----

In [None]:
# Count the ratings per ISBN (value_counts orders by descending count by
# default) and keep the first 5 rows.
ratings_per_isbn = ratings.value_counts("isbn").reset_index(name="freq")
freq_isbn = ratings_per_isbn.head(5)
freq_isbn

In [None]:
# What share of all ratings belongs to the books kept in `freq_isbn`?
# (The number of books is whatever `freq_isbn` holds; it is not hard-coded
# here so the comment cannot drift from the selection.)
freq_isbn["freq"].sum() / len(ratings)

In [None]:
# Boolean flag per row of `books`: True when the ISBN is one of the ISBNs
# listed in `freq_isbn`.
is_frequent = books.isbn.isin(freq_isbn.isbn)
books_freq = books.assign(frequent=is_frequent)[["isbn", "frequent"]]
books_freq

In [None]:
# Write the frequent-book flag to the feature folder, without the row index.
book_freq_path = f"{FEAT_FOLDER}book_freq.csv"
books_freq.to_csv(book_freq_path, index=False)

# Prepare book price feature
----

In [None]:
# Fraction of rows in `books` whose price is missing, followed by the summary
# statistics of the price column.
na_share = books.price.isna().sum() / len(books)
print("share of NA prices: " + str(na_share))
books.price.describe()

In [None]:
# Prices in ascending order plotted against their position.
sorted_prices = books.price.sort_values()
price_position = np.arange(len(books))
plt.scatter(price_position, sorted_prices)

#### Let's build a model to impute the NaNs

In [None]:
# Attach the (possibly missing) price to the tag features of each book.
# `book_tags` had its duplicate rows removed after the title/author columns
# were dropped, so the price lookup is de-duplicated as well. Otherwise an
# ISBN that occurs on several rows of `books` would be multiplied back by the
# merge, and identical rows could land in both the train and the test split.
isbn_price = books[["isbn", "price"]].drop_duplicates()
book_price = book_tags.merge(isbn_price, how="inner", on="isbn")
book_price

In [None]:
# Rows with a known price are used to train and validate the model; rows
# without a price are set aside to be scored later.
has_price = book_price.price.notna()
book_price_tt = book_price[has_price]
book_price_score = book_price[~has_price]

# Everything except the identifier and the label is a feature.
features = book_price_tt.drop(columns=["isbn", "price"])
target = book_price_tt.price

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# validation baseline - predict mean of training label everywhere
mean_price = y_train.mean()
baseline_pred = np.full(y_test.shape[0], mean_price)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated since scikit-learn 1.4 and scheduled for
# removal, while this form works on every version.
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))

# The label is a price, so the printed name says "mean price".
print(
    f"mean price     = {mean_price}\n"
    + f"baseline error = {baseline_rmse}"
)

In [None]:
# Fixed seed so the fitted forest and the reported errors are reproducible on
# a fresh run, matching the seeded train/test split.
model = RandomForestRegressor(min_samples_split=5, random_state=42)
model.fit(X_train, y_train)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated since scikit-learn 1.4 and scheduled for
# removal, while this form works on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
valid_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

# it's not much, but at least a bit better :D
print(
    f"train RMSE = {train_rmse}\n"
    + f"valid RMSE = {valid_rmse}"
)

In [None]:
# Predicted price against the actual price on the held-out rows, with the
# y = x line (drawn over 0..119) as the reference for a perfect prediction.
predicted_test = model.predict(X_test)
diagonal = np.arange(120)
plt.scatter(y_test, predicted_test)
plt.plot(diagonal, diagonal, color="orange")

In [None]:
# Replace the missing prices with the model's predictions, using the same
# feature columns the model was trained on.
score_features = book_price_score.drop(columns=["isbn", "price"])
imputed_price = model.predict(score_features)
book_price_score = book_price_score.assign(price=imputed_price)

# distribution of predicted prices
book_price_score.price.describe()

In [None]:
# Stack the rows with an observed price and the rows with an imputed price
# back into one table, then show the identifier and the price.
price_parts = [book_price_tt, book_price_score]
book_price = pd.concat(price_parts)
book_price.loc[:, ["isbn", "price"]]

#### Save price feature

In [None]:
# Write the ISBN/price pairs to the feature folder, without the row index.
book_price_path = f"{FEAT_FOLDER}book_price.csv"
price_feature = book_price[["isbn", "price"]]
price_feature.to_csv(book_price_path, index=False)