In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(font_scale=1.5)

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.style.use('fivethirtyeight')

In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import AgglomerativeClustering

In [3]:
# Load the Amazon bestsellers dataset and swap the CSV's own header for short
# lowercase names. The assignment is positional, so it requires the file to
# have exactly these seven columns in this order.
BOOKS_CSV = '../data-sources/amazon-bestsellers/amazon-bestsellers.csv'
book_columns = ['name', 'author', 'rating', 'reviews', 'price', 'year', 'genre']

amazon_books = pd.read_csv(BOOKS_CSV)
amazon_books.columns = book_columns
amazon_books.head()

Unnamed: 0,name,author,rating,reviews,price,year,genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction


In [4]:
# Replace the textual genre labels with integer codes. The fitted encoder is
# kept in `le` so the codes can be mapped back to the original labels later.
le = LabelEncoder()
genre_codes = le.fit_transform(amazon_books['genre'])
amazon_books['genre'] = genre_codes

In [5]:
# Preview the first five rows to confirm that `genre` now holds integer codes.
amazon_books.head(n=5)

Unnamed: 0,name,author,rating,reviews,price,year,genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,1
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,0
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,1
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,0
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,1


In [6]:
# One-hot encode `author`; every other column is passed through unchanged.
# NOTE(review): drop_first=True removes the first author's indicator column,
# so that author is represented by all zeros. For a distance-based model this
# places the baseline author closer to every other author than they are to
# each other -- confirm that this is intended.
amazon_books = pd.get_dummies(
    data=amazon_books,
    columns=['author'],
    drop_first=True,
)
amazon_books.head()

Unnamed: 0,name,rating,reviews,price,year,genre,author_Adam Gasiewski,author_Adam Mansbach,author_Adir Levy,author_Admiral William H. McRaven,...,author_Todd Burpo,author_Tony Hsieh,author_Tucker Carlson,author_Veronica Roth,author_W. Cleon Skousen,author_Walter Isaacson,author_William Davis,author_William P. Young,author_Wizards RPG Team,author_Zhi Gang Sha
0,10-Day Green Smoothie Cleanse,4.7,17350,8,2016,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,11/22/63: A Novel,4.6,2052,22,2011,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12 Rules for Life: An Antidote to Chaos,4.7,18979,15,2018,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1984 (Signet Classics),4.7,21424,6,2017,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"5,000 Awesome Facts (About Everything!) (Natio...",4.8,7665,12,2019,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Standardise the numeric features (zero mean, unit variance) so that no
# single column dominates the Euclidean distances used for clustering.
numeric_features = amazon_books[['rating', 'reviews', 'price', 'year']]
scaler = StandardScaler()

# fit_transform returns a bare ndarray, so the row labels are lost. Re-attach
# the original index (as well as the column names): the scaled frame is later
# assigned back into `amazon_books`, and pandas aligns that assignment on the
# index. Without `index=` the result silently relies on `amazon_books` having
# a default RangeIndex and would produce NaNs otherwise.
scalable_cols = pd.DataFrame(
    scaler.fit_transform(numeric_features),
    columns=numeric_features.columns,
    index=numeric_features.index,
)
scalable_cols.head()

Unnamed: 0,rating,reviews,price,year
0,0.35999,0.460453,-0.47081,0.632456
1,-0.080978,-0.844786,0.821609,-0.948683
2,0.35999,0.59944,0.1754,1.264911
3,0.35999,0.80805,-0.655441,0.948683
4,0.800958,-0.36588,-0.101547,1.581139


In [8]:
# Overwrite the raw numeric columns with their standardised counterparts.
# The assignment is aligned on the row index of the two frames.
scaled_names = ['rating', 'reviews', 'price', 'year']
amazon_books[scaled_names] = scalable_cols[scaled_names]

In [9]:
# Display the frame (pandas truncates the rendering) to inspect the feature
# matrix after the numeric columns have been overwritten with scaled values.
amazon_books

Unnamed: 0,name,rating,reviews,price,year,genre,author_Adam Gasiewski,author_Adam Mansbach,author_Adir Levy,author_Admiral William H. McRaven,...,author_Todd Burpo,author_Tony Hsieh,author_Tucker Carlson,author_Veronica Roth,author_W. Cleon Skousen,author_Walter Isaacson,author_William Davis,author_William P. Young,author_Wizards RPG Team,author_Zhi Gang Sha
0,10-Day Green Smoothie Cleanse,0.359990,0.460453,-0.470810,0.632456,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,11/22/63: A Novel,-0.080978,-0.844786,0.821609,-0.948683,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12 Rules for Life: An Antidote to Chaos,0.359990,0.599440,0.175400,1.264911,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1984 (Signet Classics),0.359990,0.808050,-0.655441,0.948683,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"5,000 Awesome Facts (About Everything!) (Natio...",0.800958,-0.365880,-0.101547,1.581139,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),1.241926,-0.216739,-0.470810,1.581139,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
546,You Are a Badass: How to Stop Doubting Your Gr...,0.359990,0.202869,-0.470810,0.632456,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
547,You Are a Badass: How to Stop Doubting Your Gr...,0.359990,0.202869,-0.470810,0.948683,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
548,You Are a Badass: How to Stop Doubting Your Gr...,0.359990,0.202869,-0.470810,1.264911,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Build the full agglomerative hierarchy with Ward linkage.
# - n_clusters=1 merges all the way up to a single root cluster.
# - compute_distances=True makes the estimator record the merge distances in
#   `distances_`.
# - The `affinity` argument is intentionally omitted: it was deprecated in
#   scikit-learn 1.2 and removed in 1.4 (passing it raises TypeError). Ward
#   linkage only supports Euclidean distance, which is already the default,
#   so the fitted model is unchanged.
clustering_model = AgglomerativeClustering(
    n_clusters=1,
    linkage='ward',
    compute_distances=True,
)

# Fit on every feature except the free-text book title. The column is dropped
# by name rather than by position so the intent does not depend on column order.
clustering_model.fit(amazon_books.drop(columns=['name']))

AgglomerativeClustering(compute_distances=True, n_clusters=1)

In [12]:
# Inspect the merge distances recorded during fitting (populated because the
# estimator was created with compute_distances=True).
clustering_model.distances_

array([ 0.03481091,  0.09231564,  0.0932103 ,  0.27694691,  0.28254608,
        0.28823978,  0.30118265,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31622777,
        0.31622777,  0.31622777,  0.31622777,  0.31622777,  0.31