In [None]:
!pip install kneed

In [None]:
import numpy as np 
import pandas as pd 
from kneed import KneeLocator

import matplotlib.pyplot as plt

import seaborn as sns
sns.set_theme(context="notebook", style="whitegrid", palette="dark")

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

Introduction
Who is Art Garfunkel? Where does this data come from? How was it aggregated?
Goal: Explore the dataset thoroughly for trends and insights.

Info about what each row appears to be

In [None]:
# Read the library CSV from the Kaggle input directory and preview the first rows.
library_csv_path = "/kaggle/input/art-garfunkels-library/Art Garfunkel Library.csv"
library_df = pd.read_csv(library_csv_path)
library_df.head()

`Date Read` is in an odd format, so we will split it into standard month and year columns.
- Add back month column + chart from git history

In [None]:
def get_month_from_date_read_column(date):
    """Return the month part of a single ``Date Read`` value.

    Parameters
    ----------
    date : str or missing value
        Raw ``Date Read`` entry. An entry containing exactly one hyphen is
        treated as ``<month>-<year>``.

    Returns
    -------
    str or float
        The text before the hyphen, or ``np.nan`` when the entry is not a
        string or does not split into exactly two hyphen-separated parts.
    """
    # Non-string values (e.g. the float NaN pandas uses for an empty cell) have
    # no .split(); treat them as "month unknown" instead of raising.
    if not isinstance(date, str):
        return np.nan

    month_date_split = date.split('-')
    if len(month_date_split) == 2:
        return month_date_split[0]
    return np.nan

# Derive 'Month Read' element-wise from 'Date Read', then list the distinct
# results as a quick sanity check.
library_df['Month Read'] = library_df['Date Read'].map(get_month_from_date_read_column)
library_df['Month Read'].unique()

In [None]:
def get_year_from_date_read_column(date, pivot_year=41):
    """Return the four-digit year encoded in a single ``Date Read`` value.

    Parameters
    ----------
    date : str or missing value
        Raw ``Date Read`` entry. An entry containing exactly one hyphen is
        treated as ``<month>-<year>``.
    pivot_year : int, default 41
        Two-digit years below this value are placed in the 2000s, all others
        in the 1900s. The default follows Art's year of birth (1941).

    Returns
    -------
    int or object
        The resolved year as an ``int`` for ``<month>-<year>`` entries;
        otherwise the input is returned unchanged.

    Raises
    ------
    ValueError
        If the part after the hyphen is not an integer.
    """
    # Non-string values (e.g. float NaN for an empty cell) have no .split();
    # hand them back untouched, exactly like any other unparseable entry.
    if not isinstance(date, str):
        return date

    year_date_split = date.split('-')
    if len(year_date_split) != 2:
        return date

    year = int(year_date_split[1])
    if year >= 100:
        # Already a full year (e.g. "Jun-1968"); adding a century would corrupt it.
        return year
    if year < pivot_year:
        return 2000 + year
    return 1900 + year

# Resolve the year for every row and keep it as text for now, then list the
# distinct values as a quick sanity check.
library_df['Year Read'] = library_df['Date Read'].map(
    lambda raw_date: str(get_year_from_date_read_column(raw_date))
)
library_df['Year Read'].unique()

In [None]:
# Summarise the object-dtype (text) columns.
library_df.describe(include=["object"])

In [None]:
# Bar chart of how many rows fall under each 'Favorite' value.
ax = sns.countplot(data=library_df, y="Favorite")
ax.set_title("Art Garfunkel's Favorite Books")
ax.set_xlabel("Frequency")
ax.set_ylabel("Is Favorite?")
plt.show()

In [None]:
# Distribution of book length, bucketed into 15 bins.
ax = library_df['Pages'].hist(bins=15)
ax.set_title("How long are the books in Art Garfunkel's library?")
ax.set_xlabel("# Pages")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# 'Year Read' was stored as text; cast it so it sorts and plots numerically.
library_df["Year Read"] = library_df["Year Read"].astype(int)

# Mean of 'Pages' over the rows of each 'Year Read'.
pages_per_year = library_df.groupby("Year Read")[["Pages"]].mean()

ax = sns.lineplot(data=pages_per_year)
ax.set_title("How many pages, on average, did Art read each year?")
ax.set_xlabel("Year Read")
ax.set_ylabel("# Pages")
plt.show()

In [None]:
# Stacked histogram of rows per 'Year Read', split by 'Favorite', with a KDE overlay.
ax = sns.histplot(
    data=library_df,
    x="Year Read",
    hue="Favorite",
    multiple="stack",
    kde=True,
)
ax.set_title("How many books did Art read each year?")
ax.set_xlabel("Year Read")
ax.set_ylabel("# Books")
plt.show()

In [None]:
# Repeat the object-column summary now that 'Year Read' has been cast to int.
library_df.describe(include=["object"])

In [None]:
# Columns to standardise vs. columns to one-hot encode.
numerical_cols = ['Pages']
categorical_cols = ['Date Read', 'Month Read', 'Year Read', 'Year Published']

# Every column not listed above is passed through unchanged.
# verbose_feature_names_out=False makes scikit-learn emit the output names
# without the "<transformer>__" prefix, so no string surgery is needed (splitting
# on "__" would truncate any category value that itself contains "__").
transformer = make_column_transformer(
    (OneHotEncoder(sparse_output=False), categorical_cols),
    (StandardScaler(), numerical_cols),
    remainder="passthrough",
    verbose_feature_names_out=False,
)

transformed = transformer.fit_transform(library_df)
feature_names = list(transformer.get_feature_names_out())

In [None]:
# Wrap the transformed array in a labelled frame and discard the free-text columns.
transformed_df = (
    pd.DataFrame(transformed, columns=feature_names)
    .drop(columns=["Author", "Books"])
)

# Features are everything except 'Favorite'; the target is kept as a one-column frame.
X = transformed_df.drop(columns=["Favorite"])
y = transformed_df[["Favorite"]]

In [None]:
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0
)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

In [None]:
# Shared KMeans settings, reused by every fit in the sweep.
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

# Record the inertia (within-cluster sum of squares) for k = 1..10.
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(X_train)
    sse.append(kmeans.inertia_)

In [None]:
# Display the inertia values collected for k = 1..10.
sse

In [None]:
# Elbow plot: SSE against the number of clusters tried.
plt.style.use("fivethirtyeight")
cluster_counts = range(1, 11)
fig, ax = plt.subplots()
ax.plot(cluster_counts, sse)
ax.set_xticks(cluster_counts)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("SSE")
plt.show()

In [None]:
# Locate the elbow of the (convex, decreasing) SSE curve programmatically.
kl = KneeLocator(
    x=range(1, 11),
    y=sse,
    curve="convex",
    direction="decreasing",
)
kl.elbow

In [None]:
# Silhouette score for k = 2..10 (the loop starts at 2 clusters, not 1).
silhouette_coefficients = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    labels = kmeans.fit_predict(X_train)
    score = silhouette_score(X_train, labels)
    silhouette_coefficients.append(score)

In [None]:
# Silhouette coefficient against the number of clusters tried.
plt.style.use("fivethirtyeight")
cluster_counts = range(2, 11)
fig, ax = plt.subplots()
ax.plot(cluster_counts, silhouette_coefficients)
ax.set_xticks(cluster_counts)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()