In [None]:
import numpy as np 
import pandas as pd 
import requests as res

import matplotlib.pyplot as plt

import seaborn as sns
sns.set_theme(context="notebook", style="whitegrid", palette="dark")

In [None]:
from sklearn import svm
from sklearn.compose import make_column_transformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Introduction
# Who is Art Garfunkel? Where does this data come from? How was it aggregated?
# Goal: Explore the dataset thoroughly for trends and insights

# Info about what each row appears to be

In [None]:
# Read the 'Art Garfunkel Library.csv' file from the Kaggle input directory
# and preview the first rows.
LIBRARY_CSV_PATH = "/kaggle/input/art-garfunkels-library/Art Garfunkel Library.csv"
library_df = pd.read_csv(LIBRARY_CSV_PATH)
library_df.head()

'Date Read' is in an odd format, so we will separate it into standard columns for month and year
- Add back month column + chart from git history

In [None]:
# Extracts the month from 'Date Read' into its own column

def get_month_from_date_read_column(date):
    """Return the month part of a 'Date Read' value.

    A value that splits on '-' into exactly two pieces is treated as
    ``<month>-<year>`` and the text before the hyphen is returned.

    Parameters
    ----------
    date : str or any
        Raw cell value from the 'Date Read' column.

    Returns
    -------
    str or float
        The text before the hyphen, or ``np.nan`` when the value is not a
        string or does not split into exactly two parts.
    """
    # pandas represents empty CSV cells as float NaN, which has no .split();
    # report those as "month unknown" instead of raising AttributeError.
    if not isinstance(date, str):
        return np.nan

    parts = date.split('-')
    return parts[0] if len(parts) == 2 else np.nan

# Map every raw 'Date Read' value through the helper to build the month column,
# then list the distinct values as a quick sanity check.
month_read = library_df['Date Read'].apply(get_month_from_date_read_column)
library_df['Month Read'] = month_read
library_df['Month Read'].unique()

In [None]:
# Extracts the year from 'Date Read' into its own column
def get_year_from_date_read_column(date, century_pivot=41):
    """Return the four-digit year encoded in a 'Date Read' value.

    A value that splits on '-' into exactly two pieces is treated as
    ``<month>-<year>``. A two-digit year below ``century_pivot`` is placed in
    the 2000s, otherwise in the 1900s.

    Parameters
    ----------
    date : str or any
        Raw cell value from the 'Date Read' column.
    century_pivot : int, default 41
        Two-digit years strictly below this value map to 20xx. The default
        comes from Art's year of birth (1941).

    Returns
    -------
    int or original value
        The expanded year as an ``int`` for ``<month>-<year>`` values. Any
        other value (no hyphen, several hyphens, or a non-string such as a
        missing cell) is returned unchanged.

    Raises
    ------
    ValueError
        If the part after the hyphen is not an integer literal.
    """
    # Missing cells arrive as float NaN and have no .split(); hand them back
    # untouched rather than raising AttributeError.
    if not isinstance(date, str):
        return date

    parts = date.split('-')
    if len(parts) != 2:
        return date

    year = int(parts[1])
    # A value of 100 or more is already a full year (e.g. "Jun-1968");
    # adding a century to it would produce a nonsensical result.
    if year >= 100:
        return year

    century = 2000 if year < century_pivot else 1900
    return century + year

# Build the year column as strings (the helper may return an int or the raw
# value, so everything is normalised through str), then list distinct values.
year_read = library_df['Date Read'].apply(
    lambda raw_date: str(get_year_from_date_read_column(raw_date))
)
library_df['Year Read'] = year_read
library_df['Year Read'].unique()

In [None]:
# Summarise only the object-dtype (text) columns: count, unique, top, freq.
library_df.describe(include="object")

In [None]:
# Preview the first rows again, now that 'Month Read' and 'Year Read' have been added.
library_df.head()

In [None]:
# Horizontal bar chart: number of books in each 'Favorite' category.
favorite_ax = sns.countplot(data=library_df, y='Favorite')
favorite_ax.set_title("Art Garfunkel's Favorite Books")
favorite_ax.set_xlabel("Frequency")
favorite_ax.set_ylabel("Is Favorite?")
plt.show()

In [None]:
# Distribution of book length, in 15 equal-width bins.
pages_ax = library_df['Pages'].hist(bins=15)
pages_ax.set(
    title="How long are the books in Art Garfunkel's library?",
    xlabel="# Pages",
    ylabel="Frequency",
)
plt.show()

In [None]:
# 'Year Read' was stored as text; cast it to int so it can serve as a numeric axis.
library_df["Year Read"] = library_df["Year Read"].astype(int)

# Mean of 'Pages' over the books read in each year.
pages_per_year = library_df.groupby("Year Read")[["Pages"]].mean()

pages_per_year_ax = sns.lineplot(data=pages_per_year)
pages_per_year_ax.set(
    title="How many pages, on average, did Art read each year?",
    xlabel="Year Read",
    ylabel="# Pages",
)
plt.show()

In [None]:
# Books per year, stacked by 'Favorite', with a KDE curve overlaid.
books_per_year_ax = sns.histplot(
    data=library_df,
    x="Year Read",
    hue='Favorite',
    multiple='stack',
    kde=True,
)
books_per_year_ax.set(
    title="How many books did Art read each year?",
    xlabel="Year Read",
    ylabel="# Books",
)
plt.show()

In [None]:
# Summarise the object-dtype columns again before modelling.
# NOTE(review): 'Year Read' is cast to int in an earlier cell, so it should no
# longer be part of this summary — confirm against the rendered output.
library_df.describe(include="object")

In [None]:
# Column groups: one-hot encode the date-like columns, standardise page counts.
numerical_cols = ['Pages']
categorical_cols = ['Date Read', 'Month Read', 'Year Read', 'Year Published']

# Columns not listed above are passed through untouched.
# NOTE(review): the encoder and scaler are fitted on every row of library_df
# here, i.e. before the train/test split performed later in the notebook.
one_hot_encoder = OneHotEncoder(sparse_output=False)
standard_scaler = StandardScaler()
transformer = make_column_transformer(
    (one_hot_encoder, categorical_cols),
    (standard_scaler, numerical_cols),
    remainder="passthrough",
)

transformed = transformer.fit_transform(library_df)

# Output names look like "<step>__<column>"; keep only the part after the last "__".
feature_names = [
    qualified_name.rpartition("__")[2]
    for qualified_name in transformer.get_feature_names_out()
]

In [None]:
# Wrap the transformed array in a labelled frame and drop the two passthrough
# columns that are not used as features.
# NOTE(review): the following cell rebuilds `transformed_df` and overwrites `y`;
# `X` (capital) does not appear to be used afterwards in the visible notebook.
transformed_df = (
    pd.DataFrame(transformed, columns=feature_names)
    .drop(columns=["Author", "Books"])
)

# Features are everything except the target; `y` is kept as a one-column DataFrame.
X = transformed_df.drop(columns="Favorite")
y = transformed_df[["Favorite"]]

In [None]:
# Wrap the transformed array in a labelled frame and drop the two passthrough
# columns that are not used as features.
transformed_df = (
    pd.DataFrame(transformed, columns=feature_names)
    .drop(columns=["Author", "Books"])
)

# Feature matrix, and the target as a flat integer NumPy array.
x = transformed_df.drop(columns="Favorite")
y = transformed_df["Favorite"].to_numpy().astype(int)

In [None]:
# Stratify on the target: there are few 1s compared with 0s, so both splits
# need a comparable class distribution for the scores to be meaningful.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)
print(f"X_train shape: {x_train.shape}")
print(f"X_test shape: {x_test.shape}")

In [None]:
# Logarithmic grid shared by the C and gamma hyper-parameters.
log_scale_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Two sub-grids: RBF kernel (C x gamma) and linear kernel (C only).
params = [
    dict(kernel=['rbf'], C=log_scale_values, gamma=log_scale_values),
    dict(kernel=['linear'], C=log_scale_values),
]

# 5-fold cross-validated search over both sub-grids, using all CPU cores.
# NOTE(review): candidates are ranked by plain accuracy although the split cell
# notes the target is imbalanced — confirm this is the intended metric.
grid_search = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=params,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

print('The best model was:', grid_search.best_estimator_)
print('The best accuracy score was:', grid_search.best_score_)

In [None]:
# Score the refitted best estimator on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# zero_division=False: metrics with a zero denominator are reported as 0
# instead of emitting a warning.
report = classification_report(y_test, y_pred, zero_division=False)
print(report)