<a href="https://colab.research.google.com/github/dgadela/ML_HandsOn/blob/master/AmzonTop50BestSellingBooks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Load the bestseller table (relative path, resolved against the working
# directory) and print its column summary.
data = pd.read_csv("bestsellers_with_categories.csv")
data.info()

# Preprocessing
# Keep the stop words in a set: process_name tests membership once per word,
# and a set lookup is O(1) where scanning the NLTK list is linear.
stop_words = set(stopwords.words('english'))
def process_name(name, ignore=None):
    """Strip digits and stop words from a book title.

    Every run of digits is replaced by a space, the title is split on
    whitespace, and the words that are not in the exclusion collection are
    joined back together with single spaces.

    The membership test is case-sensitive: a word is dropped only when it
    appears in the exclusion collection with exactly the same casing.

    Args:
        name: Title text to clean.
        ignore: Optional collection of words to drop. When omitted, the
            module-level ``stop_words`` collection is used, which is the
            behaviour existing single-argument callers get.

    Returns:
        The cleaned title as a single space-separated string (empty when
        nothing survives the filtering).
    """
    excluded = stop_words if ignore is None else ignore
    without_digits = re.sub(r'\d+', ' ', name)
    kept = [word for word in without_digits.split() if word not in excluded]
    return " ".join(kept)
# Clean every title, then let the tokenizer build its vocabulary from them.
names = data['Name'].apply(process_name)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(names)

# Vocabulary size handed to the Embedding layer as input_dim.
# NOTE(review): the +1 presumably reserves index 0 for padding - confirm
# against the Keras Tokenizer documentation.
vocab_length = 1 + len(tokenizer.word_index)

# Turn each title into its sequence of word indices, then pad every sequence
# at the end up to the length of the longest one.
names = tokenizer.texts_to_sequences(names)
max_seq_length = np.max([len(sequence) for sequence in names])
names = pad_sequences(names, padding='post', maxlen=max_seq_length)
names

# The raw title column is no longer needed in the tabular features.
data = data.drop(columns='Name')

# Encode the target as 0/1. Series.map produces a numeric column directly,
# whereas Series.replace on an object column relies on an implicit downcast
# that recent pandas releases deprecate (leaving an object-dtype target).
genre_mapping = {'Non Fiction': 0, 'Fiction': 1}
genre_codes = data['Genre'].map(genre_mapping)
unmapped = genre_codes.isna()
if unmapped.any():
    # Fail here with a clear message instead of passing NaN labels on.
    unknown = sorted(data.loc[unmapped, 'Genre'].astype(str).unique())
    raise ValueError(f"Unmapped Genre values: {unknown}")
data['Genre'] = genre_codes.astype('int64')

# dropna=False keeps the count identical to len(Series.unique()).
print("No of unique authors:", data['Author'].nunique(dropna=False))

def onehot_encode(df, column, prefix):
    """Return a new frame in which `column` is replaced by indicator columns.

    Args:
        df: Source DataFrame; it is left unmodified.
        column: Label of the categorical column to expand.
        prefix: Prefix passed to ``pd.get_dummies`` for the new column names.

    Returns:
        A DataFrame with the indicator columns appended on the right and
        `column` removed.
    """
    indicator_cols = pd.get_dummies(df[column], prefix=prefix)
    widened = pd.concat([df.copy(), indicator_cols], axis=1)
    return widened.drop(columns=column)

# Expand the author column into one indicator column per author.
data = onehot_encode(data, 'Author', 'auth')
print("data", data)

# Splitting / scaling
y = data['Genre'].copy()
X = data.drop(columns='Genre')

# Split before scaling so the scaler's mean/variance come from the training
# rows only; fitting on the full table would leak test-set statistics into
# training. The split itself (sizes, random_state) is unchanged.
names_train, names_test, X_train, X_test, y_train, y_test = train_test_split(
    names, X, y, train_size=0.7, random_state=100
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model / training
embedding_dim = 64

# Input widths are derived from the data instead of being written as
# literals, so the model keeps matching the arrays if the CSV or the
# preprocessing changes.
name_seq_length = int(max_seq_length)
other_feature_count = X_train.shape[1]

# Name features: padded word-index sequences -> embedding -> flat vector.
name_input = tf.keras.Input(shape=(name_seq_length,), name="name_input")
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim,
    input_length=name_seq_length,
    name="name_embedding",
)(name_input)
name_flatten = tf.keras.layers.Flatten(name="name_flatten")(embedding)

# Other features: the scaled tabular columns through two dense layers.
other_input = tf.keras.Input(shape=(other_feature_count,), name="other_input")
hidden_1 = tf.keras.layers.Dense(256, activation='relu', name="other_dense_1")(other_input)
hidden_2 = tf.keras.layers.Dense(256, activation='relu', name="other_dense_2")(hidden_1)

# Concatenate both branches and emit a single sigmoid output.
concat = tf.keras.layers.concatenate([name_flatten, hidden_2], name="concatenate")
outputs = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(concat)
model = tf.keras.Model(inputs=[name_input, other_input], outputs=outputs)

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
model.summary()
tf.keras.utils.plot_model(model)

# Compile for binary classification, tracking accuracy and AUC.
tracked_metrics = ['accuracy', tf.keras.metrics.AUC(name='auc')]
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

batch_size = 32
epochs = 100

# Early stopping watches val_loss with a patience of 3 and is configured to
# restore the best weights it saw.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train on both inputs; validation_split=0.12 is passed so Keras carves the
# validation set out of the training arrays.
history = model.fit(
    [names_train, X_train],
    y_train,
    validation_split=0.12,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stopping],
)

# Final metrics on the held-out test split.
model.evaluate([names_test, X_test], y_test)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         550 non-null    object 
 1   Author       550 non-null    object 
 2   User Rating  550 non-null    float64
 3   Reviews      550 non-null    int64  
 4   Price        550 non-null    int64  
 5   Year         550 non-null    int64  
 6   Genre        550 non-null    object 
dtypes: float64(1), int64(3), object(3)
memory usage: 30.2+ KB
No of unique authors: 248
data      User Rating  Reviews  ...  auth_Wizards RPG Team  auth_Zhi Gang Sha
0            4.7    17350  ...                      0                  0
1            4.6     2052  ...                      0                  0
2            4.7    18979  ...                      0                  0
3            

[0.31249886751174927, 0.8727272748947144, 0.9393571615219116]