In [None]:
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

# Apply matplotlib's "fivethirtyeight" style sheet to the figures drawn below.
plt.style.use("fivethirtyeight")

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

In [None]:
# Read the library CSV from the Kaggle input directory and preview its first rows.
LIBRARY_CSV_PATH = "/kaggle/input/art-garfunkels-library/Art Garfunkel Library.csv"
library_df = pd.read_csv(LIBRARY_CSV_PATH)
library_df.head()

In [None]:
def get_year_from_date_read_column(date, pivot=41, early_century=1900, late_century=2000):
    """Return the four-digit year encoded in a single 'Date Read' value.

    Two textual layouts are understood:

    * A string containing exactly one hyphen: the part after the hyphen is
      read as a two-digit year. Two-digit years below ``pivot`` are placed in
      ``late_century``; all others are placed in ``early_century``.
    * Any other string is parsed as a whole as an integer year.

    Values that are not strings (e.g. an integer year, which is what pandas
    produces when a column holds only plain years) are converted with ``int``.

    Parameters
    ----------
    date : str or number
        Raw value from the 'Date Read' column.
    pivot : int, default 41
        Two-digit cut-off for choosing the century. The default reflects
        Art's year of birth (1941): he cannot have read anything in 19xx
        with xx < 41, so those years must belong to the 2000s.
    early_century : int, default 1900
        Base added to two-digit years that are >= ``pivot``.
    late_century : int, default 2000
        Base added to two-digit years that are < ``pivot``.

    Returns
    -------
    int
        The four-digit year.

    Raises
    ------
    ValueError
        If the relevant part of ``date`` cannot be parsed as an integer.
    """
    # Numeric input has no hyphenated layout to split; treat it as the year.
    if not isinstance(date, str):
        return int(date)

    parts = date.split('-')
    if len(parts) != 2:
        # No (or more than one) hyphen: the whole value is expected to be a year.
        return int(date)

    two_digit_year = int(parts[1])
    century = late_century if two_digit_year < pivot else early_century
    return century + two_digit_year

# Derive the year each book was read from its 'Date Read' value,
# then list the distinct years as a quick sanity check.
year_read = library_df['Date Read'].apply(get_year_from_date_read_column)
library_df['Year Read'] = year_read
library_df['Year Read'].unique()

In [None]:
# Summary statistics restricted to the object-dtype (text) columns.
library_df.describe(include=["object"])

In [None]:
# Horizontal bar chart: number of books in each 'Favorite' category.
favorite_ax = sns.countplot(data=library_df, y='Favorite')
favorite_ax.set_xlabel("# Favorite Books")
favorite_ax.set_ylabel("Art's Favorite Books?")
plt.show()

In [None]:
# One 15-bin histogram each for the reading year and the page count.
library_df.hist(column=['Year Read', 'Pages'], bins=15)
plt.show()

In [None]:
# Mean page count of the books read in each year, drawn as a line over the years.
pages_per_year = library_df.groupby('Year Read')[['Pages']].mean()
sns.lineplot(data=pages_per_year)
plt.show()

In [None]:
# Books per reading year, stacked by 'Favorite', with a KDE curve overlaid.
sns.histplot(
    data=library_df,
    x="Year Read",
    hue='Favorite',
    multiple='stack',
    kde=True,
)
plt.show()

In [None]:
# 'Books' (titles) is almost entirely unique - the same title can even appear
# under different authors - so it carries no usable signal and is removed.
# 'Date Read' is removed because 'Year Read' was already derived from it.
# errors='ignore' plus rebinding (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError on the columns
# that the first execution already dropped.
library_df = library_df.drop(columns=['Date Read', 'Books'], errors='ignore')

# 'Year Read' is cast to str because it is meant to be treated as categorical
# in the predictive analysis.
# NOTE(review): the preprocessing cell below lists 'Year Read' under
# numerical_cols (StandardScaler) - confirm which treatment is intended.
library_df['Year Read'] = library_df['Year Read'].astype(str)
library_df.head()

In [None]:
# Column groups for preprocessing: the first is one-hot encoded, the second is
# standardised; every remaining column is passed through unchanged.
# NOTE(review): 'Year Read' was cast to str in the previous cell yet is scaled
# as a numeric feature here - confirm this is intended.
# NOTE(review): the transformer is fitted on the full frame before any
# train/test split - confirm this is acceptable for the analysis.
categorical_cols = ['Year Published', 'Author']
numerical_cols = ['Year Read', 'Pages']

one_hot_step = (OneHotEncoder(sparse_output=False), categorical_cols)
scaling_step = (StandardScaler(), numerical_cols)
transformer = make_column_transformer(
    one_hot_step,
    scaling_step,
    remainder="passthrough",
)

# Fit and apply in one go, then restore column labels from the transformer.
transformed = transformer.fit_transform(library_df)
feature_names = transformer.get_feature_names_out()
transformed_df = pd.DataFrame(transformed, columns=feature_names)

In [None]:
# Separate the label from the features BEFORE splitting. Passing the whole
# transformed frame as X would leave 'remainder__Favorite' (the target itself)
# among the features, so any model trained on it would simply read the answer.
target_col = 'remainder__Favorite'
features = transformed_df.drop(columns=[target_col])
target = transformed_df[target_col]

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
X_train.head()