In [None]:
import pandas as pd
import altair as alt

In [None]:
# Load the raw Goodreads library export (path is relative to the notebook).
goodreads_export_path = "../data/goodreads_library_export.csv"
full_df = pd.read_csv(goodreads_export_path)

In [None]:
# Show the column labels of the raw export.
full_df.columns

In [None]:
# Columns of the export that are carried into the working frame.
relevant_cols = [
    "Book Id",
    "Title",
    "Author",
    "ISBN",
    "ISBN13",
    "My Rating",
    "Average Rating",
    "Publisher",
    "Number of Pages",
    "Year Published",
    "Date Read",
    "Date Added",
    "Bookshelves",
    "Exclusive Shelf",
    "Read Count",
]

# Rename map: original label -> snake_case label. The title column is the one
# exception and is stored as "full_title".
cols = {label: label.lower().replace(" ", "_") for label in relevant_cols}
cols["Title"] = "full_title"

In [None]:
# Keep only the mapped columns, then apply the snake_case rename map.
selected_columns = full_df[list(cols)]
df = selected_columns.rename(columns=cols)

In [None]:
# Strip every `=` and `"` character from the two ISBN columns
# (presumably the export wraps the values as ="..." -- confirm against the CSV).
isbn_columns = ["isbn", "isbn13"]
df[isbn_columns] = df[isbn_columns].replace(regex='["=]', value="")

In [None]:
# Parse both date columns with pd.to_datetime. The previous
# astype("datetime64") call has no unit, which pandas >= 2.0 rejects with a
# TypeError. Missing dates become NaT.
df = df.assign(
    date_read=pd.to_datetime(df["date_read"]),
    date_added=pd.to_datetime(df["date_added"]),
)

In [None]:
# Horizontal bar chart: number of books on each exclusive shelf.
book_count = alt.X("count()")
shelf = alt.Y("exclusive_shelf")
alt.Chart(df).mark_bar().encode(x=book_count, y=shelf)

In [None]:
year = 2019

# Both panels share the same input: books on the "read" shelf whose read date
# falls in `year` or later.
read_since_year = alt.Chart(df.loc[df["exclusive_shelf"] == "read"]).transform_filter(
    f"year(datum.date_read) >= {year}"
)

# Heatmap of finished books: day of week across, day of month down,
# cell color encodes the number of books.
main = read_since_year.mark_rect(tooltip=True).encode(
    x=alt.X("day(date_read):N", axis=None),
    y="date(date_read):N",
    color=alt.Color("count():O", legend=None),
)

# Row of text labels with the number of books per day of week.
week_summary = read_since_year.mark_text().encode(
    x="day(date_read):N",
    text="count():Q",
)

# Stack the heatmap on top of the per-weekday totals.
main & week_summary

In [None]:
# Show every row whose full title occurs more than once, grouped by title.
repeated_title = df.duplicated(subset="full_title", keep=False)
df.loc[repeated_title].sort_values(by="full_title")

In [None]:
# Take the part after "read-" from the first bookshelf named "read-<something>".
# The match is anchored to the start of a shelf name (start of the string or
# right after a comma) so shelves such as "to-read-later" are not picked up,
# and the capture group returns the suffix directly instead of stripping every
# "read-" occurrence afterwards.
df["genre_shelf"] = df["bookshelves"].str.extract(
    r"(?:^|,\s*)read-([^,]*)", expand=False
)
# Preview only the first rows rather than dumping the whole frame.
df.head()

In [None]:
# Extract the series name: the text between "(" and the ", #" / " #" marker.
# The raw title lives in "full_title" (the rename map stores "Title" under that
# name, so the frame has no "title" column).
df["series"] = df["full_title"].str.extract(
    r"(?:\()([A-Za-zÀ-ÖØ-öø-ÿ':\- ]*)(?:,? #)", expand=False
)

In [None]:
# Extract the digits that follow the " #" series marker in the raw title
# ("full_title"; the frame has no "title" column). Requiring at least one digit
# yields NaN instead of an empty string when "#" is not followed by a number.
df["series_order"] = df["full_title"].str.extract(r"(?:,? #)([0-9]+)", expand=False)

In [None]:
# Convert the extracted position to float. to_numeric(errors="coerce") maps
# values that are not numbers (e.g. an empty-string match) to NaN instead of
# raising; the trailing astype keeps the dtype float even when nothing is missing.
df["series_order"] = pd.to_numeric(df["series_order"], errors="coerce").astype(float)
df.dtypes

In [None]:
# Heatmap of books that belong to a series: position in the series across,
# series name down, color encodes the average rating.
# The tooltip uses "full_title" because the frame has no "title" column.
alt.Chart(df.loc[df["series"].notna()]).mark_rect().encode(
    x="series_order:O",
    y="series:N",
    color="average_rating:Q",
    tooltip=["full_title", "author", "series", "series_order", "average_rating", "my_rating"],
)

In [None]:
# Titles that contain "(" but for which no series name was extracted.
# Reads "full_title" (the frame has no "title" column); na=False treats a
# missing title as "no parenthesis" so the mask stays purely boolean.
df.loc[df["full_title"].str.contains(r"\(", na=False) & df["series"].isna()]

In [None]:
# Rows for which no series name was extracted.
without_series = df["series"].isna()
df.loc[without_series]

In [None]:
# Display the frame as it stands after the steps above.
df

In [None]:
# Experimental one-pass split of the raw title into trimmed title, series name
# and series number. Reads "full_title" because the frame has no "title" column.
# The separator inside the series number is restricted to "." or "-"; the
# previous unescaped "." accepted any character, which could break the float
# conversion below.
test = df["full_title"].str.extract(
    r"(?P<title_trimmed>.*)(?:\()(?P<series_name>[A-Za-zÀ-ÖØ-öø-ÿ':\- ]*)(?:,? #)(?P<series_number>[0-9]*[.\-]?[0-9])(?:.*)"
)
# Attach the parsed parts to the working frame by row index.
test2 = pd.merge(df, test, left_index=True, right_index=True)
# Side-by-side comparison with the earlier series columns. This is not the last
# expression of the cell, so it is evaluated but not rendered.
test2.loc[~test2["series"].isna()][["full_title","series", "series_order", "title_trimmed", "series_name", "series_number"]][200:250]
# Drop "-" from the parsed series number and convert it to float.
test2["series_number"].str.replace("-", "", regex=False).astype(float)

In [None]:
def remove_series_from_title(s, column="title"):
    """Return the titles in ``s[column]`` with a trailing series suffix removed.

    A series suffix is a final parenthesised group that contains ``#`` followed
    by a digit, e.g. ``"Some Book (Some Series, #2)"`` becomes ``"Some Book"``.
    Titles without such a suffix, and missing values, are returned unchanged.

    Parameters
    ----------
    s : pandas.DataFrame
        Frame (or other mapping of pandas Series) holding the titles.
    column : str, default "title"
        Column to read; pass ``"full_title"`` for a frame that keeps the raw
        title under that name.

    Returns
    -------
    pandas.Series
        The trimmed titles, aligned with ``s[column]``.
    """
    # Optional whitespace, then "(... #<digit> ...)" at the very end of the title.
    series_suffix = r"\s*\([^()]*#[0-9][^()]*\)\s*$"
    return s[column].str.replace(series_suffix, "", regex=True)

In [None]:
# Bar chart: number of read books per genre shelf, one color per genre.
read_books = df.loc[df["exclusive_shelf"] == "read"]
alt.Chart(read_books).mark_bar(tooltip=True).encode(
    x=alt.X("genre_shelf:N"),
    y=alt.Y("count():Q"),
    color=alt.Color("genre_shelf:N"),
)

In [None]:
# Bar chart: mean of the average rating of read books per genre shelf.
read_books = df.loc[df["exclusive_shelf"] == "read"]
alt.Chart(read_books).mark_bar(tooltip=True).encode(
    x=alt.X("genre_shelf:N"),
    y=alt.Y("mean(average_rating):Q"),
    color=alt.Color("genre_shelf:N"),
)

In [None]:
# Scatter of read books: page count against average rating, colored by genre.
# The tooltip uses "full_title" because the frame has no "title" column.
alt.Chart(df.loc[df["exclusive_shelf"] == "read"]).mark_circle(tooltip=True).encode(
    y=alt.Y("average_rating:Q", scale=alt.Scale(zero=False)),
    x="number_of_pages:Q",
    color="genre_shelf:N",
    tooltip=["full_title", "author", "genre_shelf", "average_rating", "number_of_pages"],
).properties(width=1000, height=500).interactive()

In [None]:
# Scatter of read books: page count against my own rating, colored by genre.
# The y domain is padded slightly beyond 0-5 so points at the extremes are not
# clipped. The tooltip uses "full_title" because the frame has no "title" column.
alt.Chart(df.loc[df["exclusive_shelf"] == "read"]).mark_circle(tooltip=True).encode(
    y=alt.Y("my_rating:Q", scale=alt.Scale(domain=[-0.3, 5.3])),
    x="number_of_pages:Q",
    color="genre_shelf:N",
    tooltip=["full_title", "author", "genre_shelf", "number_of_pages"],
).properties(width=1000, height=500).interactive()

In [None]:
# Configure Bokeh to render its plots inline in the notebook output.
from bokeh.io import output_notebook
output_notebook()

In [None]:
# My ratings for the books on the "read" shelf. Computed here directly: the
# bare name `y` is only assigned in a later cell, so referencing it at this
# point fails on a fresh kernel.
df.loc[df["exclusive_shelf"] == "read", "my_rating"].to_list()

In [None]:
from bokeh.plotting import figure, show


# Ratings of the books on the "read" shelf: average rating on x, my rating on y.
read_mask = df["exclusive_shelf"] == "read"
x = df.loc[read_mask, "average_rating"].to_list()
y = df.loc[read_mask, "my_rating"].to_list()

# Figure with fixed axis ranges: x spans 3 to 5, y spans -0.5 to 5.5.
p = figure(y_range=[-0.5, 5.5], x_range=[3,5])

# One circular marker per book. scatter() (default marker: circle) replaces
# circle(), whose size-based form is deprecated in recent Bokeh releases.
p.scatter(x=x, y=y)

# Render the figure.
show(p)