In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# Load the dataset from 'df_cleaned.csv' (path is relative to the notebook's
# working directory) and display it as the cell output.
df = pd.read_csv('df_cleaned.csv') 
df

## Categorical columns

In [None]:
# Keep only the object-dtype columns as candidate categorical features.
categorical = df.select_dtypes(include=object)

In [None]:
# List the names of the candidate categorical columns.
categorical.columns.tolist()

We will not focus on the 'bookId', 'title', 'author', 'genres' and 'publishDate' columns because they have too many unique values.\
We keep the 'author' and 'genres' columns for later.\
We also add the 'awards' column since it has only two values: 1 for 'yes' and 0 for 'no'.

In [None]:
# Select the categorical columns that will be analysed below.
# .copy() makes `categorical` an independent frame: the later cells re-assign
# its columns, and doing that on a bare `df[[...]]` selection raises
# SettingWithCopyWarning.
categorical = df[['series', 'language',
                  'bookFormat',
                  'publisher', 'awards']].copy()

In [None]:
# Show the frequency table of each selected categorical column.
for column_name, column_values in categorical.items():
    display(column_values.value_counts())

### 1. Language

Most of the books are in English; all other languages will be grouped into 'Other'.

In [None]:
# Collapse every language that is not English into a single 'Other' label.
is_english = categorical['language'].isin(['English'])
categorical['language'] = np.where(is_english, 'English', 'Other')


In [None]:
# Frequency of each value in the 'language' column.
categorical['language'].value_counts()

### 2. Book format

Most of the books are paperback or hardcover; all other formats will be grouped into 'Other'.

In [None]:
# Keep 'Paperback' and 'Hardcover' as they are; relabel every other format
# as 'Other', then show the resulting frequencies.
main_formats = ['Paperback', 'Hardcover']
is_main_format = categorical['bookFormat'].isin(main_formats)
categorical['bookFormat'] = np.where(is_main_format, categorical['bookFormat'], 'Other')
categorical['bookFormat'].value_counts()

### 3. Publisher

Since readers may be sensitive to the publisher, we will take a closer look at the 10 most famous ones and group the others into 'Other'.

In [None]:
# Frequency table of publishers as a one-column DataFrame, followed by the
# list of publisher names in the order value_counts() returns them.
publisher = categorical['publisher'].value_counts().to_frame()
publisher.index.tolist()

In [None]:
# Publishers that keep their own label; every other publisher becomes 'Other'.
kept_publishers = [
    'Vintage',
    'HarperCollins',
    'Penguin Books',
    'Ballantine Books',
    'Bantam',
    'Createspace Independent Publishing Platform',
    'Pocket Books',
    'Avon',
    'Berkley',
    'Del Rey',
]
is_kept_publisher = categorical['publisher'].isin(kept_publishers)
categorical['publisher'] = np.where(is_kept_publisher, categorical['publisher'], 'Other')
categorical['publisher'].value_counts()

## Numerical columns

In [None]:
# Numeric columns of the dataset, minus 'awards', 'bbeScore' and 'bbeVotes'.
numerical = (
    df.select_dtypes(include=np.number)
      .drop(columns=['awards', 'bbeScore', 'bbeVotes'])
)
numerical

## Outliers

In [None]:
# One box plot per numerical column, each drawn in its own figure.
for feature in numerical.columns:
    numerical[[feature]].boxplot()
    plt.show()

Only the 'pages' column seems to have problematic outliers, so let's remove them.

In [None]:
# Outlier fences for 'pages': quartiles -/+ 5 x IQR.
# np.nanpercentile ignores missing values, so the fences stay finite even if
# 'pages' contains NaN (np.percentile would return NaN and the filter below
# would then keep no rows at all). This also keeps the cell re-runnable.
q1, q3 = np.nanpercentile(numerical['pages'], [25, 75])
iqr = q3 - q1
upper_limit = q3 + 5*iqr
lower_limit = q1 - 5*iqr

# Keep only the rows whose page count lies strictly inside the fences.
# Rows with a missing page count compare False and are excluded as well.
df_no_outliers = numerical[(numerical['pages']>lower_limit) & (numerical['pages']<upper_limit)].copy()

# Distribution of the remaining page counts.
sns.displot(df_no_outliers['pages'])
plt.show()

In [None]:
# Write the filtered page counts back into `numerical`.
# NOTE(review): the assignment aligns on the index, and df_no_outliers holds
# only a subset of the rows, so the excluded rows are presumably kept with
# 'pages' set to NaN rather than dropped — confirm this is intended.
numerical['pages'] = df_no_outliers['pages']

In [None]:
# Box plot of 'pages' after the outlier step, to check the result visually.
numerical[['pages']].boxplot()

### Now that we have removed the outliers, let's check for correlations

In [None]:
# Pairwise correlation between the numerical columns, drawn as a heatmap with
# the coefficient written in each cell.
correlations_matrix = numerical.corr()
sns.heatmap(correlations_matrix, annot=True)

We can see that ratings are correlated with the percentage of liked books, which can be explained by the fact that liked books are those with more than 2 stars.


In [None]:
# Plot the distribution of every numerical column: a density-normalised
# histogram with a KDE curve on top, one figure per column.
# sns.histplot replaces the deprecated sns.distplot.
for column in numerical:
    fig, ax = plt.subplots()
    # Drop missing values so the bins are computed on finite data only.
    sns.histplot(numerical[column].dropna(), kde=True, stat='density', ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)