In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Move the working directory to the project root so the relative data paths
# below resolve. The directory the kernel started in is remembered on the
# first run, which keeps this cell idempotent: re-running it lands in the same
# place instead of climbing two more levels each time.
if "_NOTEBOOK_DIR" not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, "../../"))

# PRIM_FOLDER is read from; FEAT_FOLDER is both read from and written to below.
PRIM_FOLDER = "data/2_primary/"
FEAT_FOLDER = "data/3_feature/"

# make sure the feature folder exists before any cell writes into it
os.makedirs(FEAT_FOLDER, exist_ok=True)

In [None]:
# load the ratings and users tables from the primary data folder
ratings, users = (
    pd.read_csv(f"{PRIM_FOLDER}{table}.csv") for table in ("ratings", "users")
)

## Prepare user preferred tag features
----

#### Combine book tags and user ratings

In [None]:
# book tag features stored in the feature folder
book_tags = pd.read_csv(FEAT_FOLDER + "book_tags.csv")

In [None]:
# attach the tag columns of each rated book to its rating
# (left join on isbn, so every rating row is kept)
user_ratings = pd.merge(ratings, book_tags, on="isbn", how="left")
user_ratings

In [None]:
# total number of missing cells in the merged frame - no NaN
user_ratings.isna().to_numpy().sum()

#### Calculate how many times each user rated books with a specific tag

In [None]:
# long format: one row per (rating row, tag column), tag cell stored in `value`
ratings_long = user_ratings.melt(
    id_vars=["user_id", "isbn", "book_rating"], var_name="tag"
)

# per user and tag, add up the tag values over all books the user rated
user_tags_cnt = ratings_long.groupby(["user_id", "tag"], as_index=False)["value"].sum()

In [None]:
top_n = 500

# per-(user, tag) counts from largest to smallest
tag_counts_desc = user_tags_cnt["value"].sort_values(ascending=False)

# somebody reads a lot of similar books
plt.scatter(x=np.arange(top_n), y=tag_counts_desc.iloc[:top_n])

In [None]:
# number of distinct users that have tag counts
user_tags_cnt["user_id"].nunique()

#### Create user tags feature group

In [None]:
# wide format: one row per user, one column per tag,
# 0 where a (user, tag) pair is absent
users_tag = pd.pivot_table(
    user_tags_cnt, values="value", index="user_id", columns="tag", fill_value=0
).reset_index()
users_tag

In [None]:
# persist the user tag feature group
users_tag.to_csv(FEAT_FOLDER + "user_tags.csv", index=False)

## Prepare user favourite books
----

In [None]:
# summary statistics of the raw rating values
ratings["book_rating"].describe()

In [None]:
# frequency of each rating value, as a table and as a histogram
rating_counts = ratings["book_rating"].value_counts()
display(rating_counts)
ratings["book_rating"].hist()

In [None]:
# binary favourite flag: 1 when the rating is above 5, otherwise 0;
# it replaces the raw book_rating column
is_favourite = ratings["book_rating"].gt(5).astype("int")
user_favourite = ratings.drop(columns="book_rating").assign(rating=is_favourite)
user_favourite

In [None]:
# count of good ratings per book
good_ratings_per_book = user_favourite.groupby("isbn")["rating"].sum()
good_ratings_per_book.describe()

In [None]:
# persist the favourite-flag table
user_favourite.to_csv(FEAT_FOLDER + "user_favourite.csv", index=False)

## Prepare user location features
----

In [None]:
# display the users table (the cells below use its user_id, location and age columns)
users

#### Check frequency of full locations

In [None]:
# how often each full location string occurs
locations = users.value_counts("location")

# top 50 most frequent location cover only ~ 16 % of records
top_50_share = locations.head(50).sum() / locations.sum()
print(f"Top 50 locations coverage = {top_50_share}")

# there is over 57 000 unique locations for 278 000 users
locations

#### Try splitting the location into parts (~ city, region, country)

In [None]:
# split every location on ", " and give each part its own row
loc_parts = users["location"].str.split(", ")
users_loc = users.assign(loc_part=loc_parts)[["user_id", "loc_part"]].explode(
    "loc_part"
)

# the 50 most frequent parts, and the rows that use one of them
top_50_loc = users_loc.value_counts("loc_part").head(50)
in_top_50 = users_loc["loc_part"].isin(top_50_loc.index)
users_loc_top = users_loc[in_top_50]

# top 50 partial locations, we can cover 92 % of all users
print(users_loc_top["user_id"].nunique() / len(users))

In [None]:
# USA is used in ~ 140 000 cases
rank = np.arange(len(top_50_loc))
plt.scatter(x=rank, y=top_50_loc)

top_50_loc.head()

#### Merge locations back to the users

In [None]:
# One-hot encode the top location parts: one row per user, one 0/1 column per
# location part. Repeated parts for one user collapse through pivot_table's
# default mean of the constant helper value, which stays 1.
locations = (
    users_loc_top.assign(help=1)
    .pivot_table(index="user_id", columns="loc_part", values="help", fill_value=0)
    .reset_index()
    # a bare "," is not a location; errors="ignore" keeps this cell from
    # raising KeyError when "," did not make it into the top parts
    .drop(columns=",", errors="ignore")
)

# every column after the leading user_id is a location indicator
fill_cols = locations.columns[1:]

# Left-join onto all users so users without any top location part are kept;
# their indicator columns come back as NaN and are set to 0.
users_location = users.drop(["location", "age"], axis=1).merge(
    locations, how="left", on="user_id"
)
users_location[fill_cols] = users_location[fill_cols].fillna(value=0)

#### Check how many location tags we have for each user

In [None]:
# we have location for majority of users
# (row-wise sum over every column after the first)
per_user_location = users_location.iloc[:, 1:].sum(axis=1)

n_without_location = per_user_location.eq(0).sum()
print(f"users without location = {n_without_location}")

per_user_location.describe()

#### Save location feature group

In [None]:
# persist the location feature group
users_location.to_csv(FEAT_FOLDER + "user_locations.csv", index=False)

## Review user age
----

In [None]:
# keep only the id and the raw age
users_age = users.loc[:, ["user_id", "age"]]

In [None]:
# ages in ascending order plotted against their position
sorted_age = users_age["age"].sort_values()
plt.scatter(np.arange(len(users_age)), sorted_age)

In [None]:
# This should be NaN as well
# Unless we have a lot of genius babies
users_age.loc[users_age["age"] == 0, "age"]

In [None]:
# This should be NaN as well
# Oldest living person = 122 (https://en.wikipedia.org/wiki/List_of_the_verified_oldest_people)
users_age.loc[users_age["age"] > 122, "age"]

In [None]:
# keep ages within 1..122 (inclusive) and turn everything else into NaN
plausible_age = users_age["age"].where(users_age["age"].between(1, 122))
users_age = users_age.assign(age=plausible_age)
users_age["age"].describe()

In [None]:
# keep only users that rated at least one book (those that are in users_tag)
age_with_location = pd.merge(users_age, users_location, on="user_id", how="inner")
users_age = pd.merge(age_with_location, users_tag, on="user_id", how="inner")
users_age

#### Let's build a model to impute the NaNs

In [None]:
# rows with a known age are used for training / validation,
# rows without one are scored by the model later
has_age = users_age["age"].notna()
users_age_tt = users_age[has_age]
users_age_score = users_age[~has_age]

# features are every column after the first two (user_id and age)
features_tt = users_age_tt.iloc[:, 2:]
X_train, X_test, y_train, y_test = train_test_split(
    features_tt, users_age_tt["age"], test_size=0.2, random_state=42
)

In [None]:
# distribution of the known ages
users_age_tt["age"].hist()

In [None]:
# validation baseline - predict mean of training label everywhere
mean_age = y_train.mean()
baseline_pred = np.full(y_test.shape[0], mean_age)

# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))

print(f"mean age       = {mean_age}\n" + f"baseline error = {baseline_rmse}")

In [None]:
# Random forest used to impute missing ages. random_state is fixed (same seed
# as the train/test split) so a fresh Restart & Run All gives the same model
# and therefore the same imputed ages.
model = RandomForestRegressor(min_samples_split=80, random_state=42)
model.fit(X_train, y_train)

# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
valid_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

# it's not much, but at least a bit better :D
print(f"train RMSE = {train_rmse}\n" + f"valid RMSE = {valid_rmse}")

In [None]:
# predicted vs. actual age on the validation set, with the y = x line for reference
plt.scatter(y_test, model.predict(X_test))
diagonal = np.arange(120)
plt.plot(diagonal, diagonal, color="orange")

In [None]:
# fill the missing ages with the model's predictions
# (features are every column after the first two)
predicted_age = model.predict(users_age_score.iloc[:, 2:])
users_age_score = users_age_score.assign(age=predicted_age)

# very close to predicting the mean everywhere
users_age_score["age"].describe()

In [None]:
# stack the known-age rows and the model-scored rows back together
users_age = pd.concat([users_age_tt, users_age_score], axis=0)
users_age["age"].describe()

#### Save age feature

In [None]:
# only the id and the age column are written out
users_age[["user_id", "age"]].to_csv(FEAT_FOLDER + "user_age.csv", index=False)