In [37]:
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the sample bestsellers dataset (a CSV looked up relative to the
# working directory) and preview its first rows.
bestsellers_df = pd.read_csv(filepath_or_buffer='bestsellers-sampledata.csv')
bestsellers_df.head(n=5)

Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,UMCSENT,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,91.841667,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,67.35,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,98.366667,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,96.766667,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,95.983333,Non Fiction


In [38]:
# Drop the columns that are not used as model features.
# NOTE: this rebinds `bestsellers_df`, so running the cell a second time
# raises KeyError because the columns are already gone.
bestsellers_df = bestsellers_df.drop(
    columns=['Author', 'Name', 'User Rating', 'Reviews']
)
bestsellers_df.head()

Unnamed: 0,Price,Year,UMCSENT,Genre
0,8,2016,91.841667,Non Fiction
1,22,2011,67.35,Fiction
2,15,2018,98.366667,Non Fiction
3,6,2017,96.766667,Fiction
4,12,2019,95.983333,Non Fiction


In [39]:
# Encode the labels: one-hot encode the categorical "Genre" column, producing
# one indicator column per genre value ("Genre_<value>").
#
# The indicator dtype is pinned to uint8 on purpose: pandas >= 2.0 changed the
# get_dummies default from uint8 to bool, which would turn the 0/1 class
# labels used by the rest of the notebook into False/True.
bestsellers_binary_encoded = pd.get_dummies(
    bestsellers_df, columns=["Genre"], dtype="uint8"
)
bestsellers_binary_encoded.head()



Unnamed: 0,Price,Year,UMCSENT,Genre_Fiction,Genre_Non Fiction
0,8,2016,91.841667,0,1
1,22,2011,67.35,1,0
2,15,2018,98.366667,0,1
3,6,2017,96.766667,1,0
4,12,2019,95.983333,0,1


In [40]:
# Keep a single binary indicator: "Genre_Non Fiction" is the exact complement
# of "Genre_Fiction", so it carries no extra information.
bestsellers_binary_encoded = bestsellers_binary_encoded.drop(
    columns=['Genre_Non Fiction']
)
bestsellers_binary_encoded.head()

Unnamed: 0,Price,Year,UMCSENT,Genre_Fiction
0,8,2016,91.841667,0
1,22,2011,67.35,1
2,15,2018,98.366667,0
3,6,2017,96.766667,1
4,12,2019,95.983333,0


In [41]:
# Feature matrix: every encoded column except the target indicator.
# `drop` returns a new DataFrame, so the encoded frame itself is left intact.
X = bestsellers_binary_encoded.drop(columns="Genre_Fiction")
X.head()

Unnamed: 0,Price,Year,UMCSENT
0,8,2016,91.841667
1,22,2011,67.35
2,15,2018,98.366667
3,6,2017,96.766667
4,12,2019,95.983333


In [42]:
# Define the target vector: the "Genre_Fiction" indicator as a 1-D NumPy array.
# `Series.ravel()` is deprecated since pandas 2.2 (a Series is already 1-D);
# `to_numpy()` is the supported way to obtain the same array.
y = bestsellers_binary_encoded["Genre_Fiction"].to_numpy()

In [43]:
# Partition the data into training and testing sets. The fixed random_state
# makes the shuffle, and therefore the partition, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [44]:
# Sanity check: report the shape of each split, one per line
# (train features, test features, train target, test target).
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(412, 3)
(138, 3)
(412,)
(138,)


In [45]:
# Instantiate the standardizer (centering and unit-variance scaling are the
# library defaults, spelled out here for readability).
scaler = StandardScaler(with_mean=True, with_std=True)

In [46]:
# Fit the standard scaler on the training features only, so that no
# statistics from the test split leak into preprocessing.
X_scaler = scaler.fit(X=X_train)

In [47]:
# Apply the fitted scaler to both splits (train first, then test).
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)



In [48]:
# Random forest model
# Build the classifier: 500 trees, with a fixed seed for reproducible results.
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=500,
)

In [49]:
# Train the forest on the scaled training features and their labels.
# `fit` returns the estimator itself, so the name keeps pointing at the model.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [50]:
# Predict the class label for every row of the scaled testing data.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0], dtype=uint8)

In [51]:
# Model evaluation
# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Overall fraction of test rows that were classified correctly.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,53,27
Actual 1,25,33


In [52]:
# Show the evaluation summary: confusion matrix, accuracy, per-class report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,53,27
Actual 1,25,33


Accuracy Score : 0.6231884057971014
Classification Report
              precision    recall  f1-score   support

           0       0.68      0.66      0.67        80
           1       0.55      0.57      0.56        58

    accuracy                           0.62       138
   macro avg       0.61      0.62      0.62       138
weighted avg       0.63      0.62      0.62       138



In [53]:
## Ranking the importance of features
# Read the per-feature importance scores from the fitted random forest.
# The array holds one score per column of the feature matrix, in column order.
importances = rf_model.feature_importances_
importances

array([0.80905998, 0.09566295, 0.09527707])

In [54]:
# Pair each importance score with its column name and rank the pairs from
# most to least important.
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.8090599787928667, 'Price'),
 (0.0956629539688107, 'Year'),
 (0.0952770672383226, 'UMCSENT')]