In [19]:
import scipy.sparse
from sklearn.feature_extraction.text import CountVectorizer

import pickle
import pandas as pd
import numpy as np

In [74]:
train_df = pd.read_csv('project_data_files/book_rating_train.csv')
test_df = pd.read_csv('project_data_files/book_rating_test.csv')

In [75]:
train_df = train_df.drop(columns=['Language', 'PublishDay', 'Name', 'Authors', 'Description', 'PublishYear', 'PublishMonth'], axis=1)
train_df.head()

Unnamed: 0,Publisher,pagesNumber,rating_label
0,Teaching Resources,48,4.0
1,DoubleDay,364,4.0
2,Chronicle Books,32,4.0
3,Bison Books,293,4.0
4,Penguin Books Ltd,352,3.0


In [76]:
author_vec = scipy.sparse.load_npz('project_data_files/book_text_features_countvec/book_text_features_countvec/train_authors_vec.npz')
author_vec = pd.DataFrame.sparse.from_spmatrix(author_vec)
author_vec.columns = ['author_vec_' + str(col) for col in author_vec.columns]

name_vec = scipy.sparse.load_npz('project_data_files/book_text_features_countvec/book_text_features_countvec/train_name_vec.npz')
name_vec = pd.DataFrame.sparse.from_spmatrix(name_vec)
name_vec.columns = ['name_vec_' + str(col) for col in name_vec.columns]

In [77]:
# concat data set
train_df = pd.concat([train_df.reset_index(drop=True), author_vec.reset_index(drop=True), name_vec.reset_index(drop=True)], axis=1)
train_df

Unnamed: 0,Publisher,pagesNumber,rating_label,author_vec_0,author_vec_1,author_vec_2,author_vec_3,author_vec_4,author_vec_5,author_vec_6,...,name_vec_20756,name_vec_20757,name_vec_20758,name_vec_20759,name_vec_20760,name_vec_20761,name_vec_20762,name_vec_20763,name_vec_20764,name_vec_20765
0,Teaching Resources,48,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,DoubleDay,364,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chronicle Books,32,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Bison Books,293,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Penguin Books Ltd,352,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23058,2.13.61,120,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23059,HMH Books for Young Readers,32,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23060,Rowman & Littlefield Publishers,132,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23061,New Amsterdam Books,136,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
train_df['pagesNumber'] = train_df['pagesNumber'].apply(lambda x: np.nan if x < 10 else x)

In [79]:
mean = train_df['pagesNumber'].mean()
sd = train_df['pagesNumber'].std()
train_df = train_df[train_df['pagesNumber'] <= mean + 3 * sd]
train_df = train_df[train_df['pagesNumber'] >= mean - 3 * sd]
train_df


Unnamed: 0,Publisher,pagesNumber,rating_label,author_vec_0,author_vec_1,author_vec_2,author_vec_3,author_vec_4,author_vec_5,author_vec_6,...,name_vec_20756,name_vec_20757,name_vec_20758,name_vec_20759,name_vec_20760,name_vec_20761,name_vec_20762,name_vec_20763,name_vec_20764,name_vec_20765
0,Teaching Resources,48.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,DoubleDay,364.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chronicle Books,32.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Bison Books,293.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Penguin Books Ltd,352.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23058,2.13.61,120.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23059,HMH Books for Young Readers,32.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23060,Rowman & Littlefield Publishers,132.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23061,New Amsterdam Books,136.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# fill nan with mean
mean = train_df['pagesNumber'].mean()
print(mean)
train_df['pagesNumber'] = train_df['pagesNumber'].fillna(mean)
train_df

275.9122690709803


Unnamed: 0,Publisher,pagesNumber,rating_label,author_vec_0,author_vec_1,author_vec_2,author_vec_3,author_vec_4,author_vec_5,author_vec_6,...,name_vec_20756,name_vec_20757,name_vec_20758,name_vec_20759,name_vec_20760,name_vec_20761,name_vec_20762,name_vec_20763,name_vec_20764,name_vec_20765
0,Teaching Resources,48.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,DoubleDay,364.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chronicle Books,32.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Bison Books,293.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Penguin Books Ltd,352.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23058,2.13.61,120.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23059,HMH Books for Young Readers,32.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23060,Rowman & Littlefield Publishers,132.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23061,New Amsterdam Books,136.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
train_df['Publisher'] = train_df['Publisher'].fillna('Unknown')

In [82]:
data_df = train_df['Publisher'].value_counts()

In [83]:
# mark publishers with less than 10 books as 'Other'
train_df['Publisher'] = train_df['Publisher'].apply(lambda x: 'Other' if x in data_df[data_df < 10].index else x)

In [84]:
y_train = train_df['rating_label']
# y_test = test_df['rating_label']

X_train = train_df.drop(['rating_label'], axis=1)

In [85]:
X_train

Unnamed: 0,Publisher,pagesNumber,author_vec_0,author_vec_1,author_vec_2,author_vec_3,author_vec_4,author_vec_5,author_vec_6,author_vec_7,...,name_vec_20756,name_vec_20757,name_vec_20758,name_vec_20759,name_vec_20760,name_vec_20761,name_vec_20762,name_vec_20763,name_vec_20764,name_vec_20765
0,Other,48.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Other,364.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chronicle Books,32.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Bison Books,293.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Penguin Books Ltd,352.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23058,Other,120.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23059,HMH Books for Young Readers,32.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23060,Rowman & Littlefield Publishers,132.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23061,Other,136.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
# fit X_train
enc.fit(X_train[['Publisher']])
# # get feature names
feature_names = enc.get_feature_names_out(['Publisher'])
# # transform X_train
X_train_ohe = enc.transform(X_train[['Publisher']])
X_train_ohe = pd.DataFrame.sparse.from_spmatrix(X_train_ohe)
X_train_ohe.columns = feature_names
X_train_ohe

Unnamed: 0,Publisher_1st World Library - Literary Society,Publisher_ADV Manga,Publisher_AMACOM/American Management Association,Publisher_Abbeville Press,Publisher_Abingdon Press,Publisher_Ace,Publisher_Ace Books,Publisher_Adams Media,Publisher_Addison Wesley Publishing Company,Publisher_Addison-Wesley Professional,...,Publisher_Yearling,Publisher_Zebra,Publisher_Zed Books,Publisher_Zonderkidz,Publisher_Zondervan,Publisher_Zondervan Academic,Publisher_Zondervan Publishing Company,Publisher_eReads.com,Publisher_iBooks,Publisher_iUniverse
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
mi = SelectKBest(mutual_info_classif, k=450)
X_train_ohe_mi = mi.fit_transform(X_train_ohe, y_train)
X_train_ohe_mi = pd.DataFrame.sparse.from_spmatrix(X_train_ohe_mi)
X_train_ohe_mi.columns = X_train_ohe.columns[mi.get_support()]
X_train_ohe_mi.head()

Unnamed: 0,Publisher_1st World Library - Literary Society,Publisher_ADV Manga,Publisher_AMACOM/American Management Association,Publisher_Abbeville Press,Publisher_Abingdon Press,Publisher_Ace,Publisher_Ace Books,Publisher_Adams Media,Publisher_Addison Wesley Publishing Company,Publisher_Addison-Wesley Professional,...,Publisher_Yearling,Publisher_Zebra,Publisher_Zed Books,Publisher_Zonderkidz,Publisher_Zondervan,Publisher_Zondervan Academic,Publisher_Zondervan Publishing Company,Publisher_eReads.com,Publisher_iBooks,Publisher_iUniverse
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
X_train_ohe_mi = pd.concat([X_train_ohe_mi.reset_index(drop=True), X_train.drop(columns = ['Publisher'], axis=1).reset_index(drop=True)], axis=1)
X_train_ohe_mi

Unnamed: 0,Publisher_1st World Library - Literary Society,Publisher_ADV Manga,Publisher_AMACOM/American Management Association,Publisher_Abbeville Press,Publisher_Abingdon Press,Publisher_Ace,Publisher_Ace Books,Publisher_Adams Media,Publisher_Addison Wesley Publishing Company,Publisher_Addison-Wesley Professional,...,name_vec_20756,name_vec_20757,name_vec_20758,name_vec_20759,name_vec_20760,name_vec_20761,name_vec_20762,name_vec_20763,name_vec_20764,name_vec_20765
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
22622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
22623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
22624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


KeyError: 'Publisher'

In [93]:
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
param_grid = {
    'n_neighbors': [5]
}
knn = KNeighborsClassifier(weights='distance')
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'], refit=False, n_jobs=-1)
grid_search.fit(X_train_ohe_mi, y_train)
search_results = pd.DataFrame(grid_search.cv_results_)
search_results



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,std_test_recall_weighted,rank_test_recall_weighted,split0_test_f1_weighted,split1_test_f1_weighted,split2_test_f1_weighted,split3_test_f1_weighted,split4_test_f1_weighted,mean_test_f1_weighted,std_test_f1_weighted,rank_test_f1_weighted
0,40.473634,8.880123,173.729806,6.961022,5,{'n_neighbors': 5},0.634777,0.638453,0.63779,,...,,1,0.612817,0.609218,0.613046,,0.618645,,,1
