# Data Processing

## Select Relevant Columns and Sample Randomly

We will now keep only the columns that are relevant to us (the review summary, text, and score), draw a random sample of one million rows, and save the result to a CSV file.

In [1]:
import pandas as pd

# read the raw reviews, keep just the three review fields under shorter names,
# and draw a reproducible random sample of 1 million rows
df = (
    pd.read_csv('data/Books_rating.csv')
    .loc[:, ['review/summary', 'review/text', 'review/score']]
    .rename(columns={
        'review/summary': 'summary',
        'review/text': 'text',
        'review/score': 'score',
    })
    .sample(n=1000000, random_state=1)
)

# persist the sample (without the index column) for the next stage
df.to_csv('data/Books_rating_relevant_columns.csv', index=False)

# preview the first 5 rows
df.head()

Unnamed: 0,summary,text,score
2896109,Best edition of this classic.,I've always recommended this Yale edition of F...,5.0
2381153,Great Book!!,This is required reading for my 16 yr old son....,5.0
1028690,Not just a book for consultant,"Plain-spoken, finished the book only has taken...",4.0
1945977,Outrageously Bad,Wow... this is one of the most ridiculous stor...,1.0
2812693,Cunning and determination,A crew has mutinied and threatens to hang thei...,4.0


## Stopwords and Lemmatization

Process the text to strip unneeded characters, remove stopwords, and lemmatize. The cleaned text is vectorized in the next section.

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from tqdm import tqdm

# register tqdm's `progress_apply` on pandas objects (used further down in this cell)
tqdm.pandas()

lemmatizer = WordNetLemmatizer()
# NOTE: this rebinds the name `stopwords` from the object imported from nltk.corpus
# to a plain set of English stopwords, which `stemmer` reads. Re-running this line
# without re-running the import above would fail, because a set has no `.words`.
stopwords = set(stopwords.words('english'))

def stemmer(text):
  """Normalize a piece of review text into a space-separated string of lemmas.

  Despite its name, this function lemmatizes (via the WordNet lemmatizer) rather
  than stems. Steps: replace every non-alphanumeric character with a space,
  lowercase, tokenize, drop English stopwords, lemmatize the remaining tokens and
  join them with single spaces.

  Args:
    text: The raw text. Missing values (None or NaN) are treated as empty text;
      any other non-string value is converted with ``str()`` first.

  Returns:
    The cleaned text as a string; '' when the input is missing or nothing
    survives the cleaning.
  """
  if not isinstance(text, str):
    # NaN is the only value that is not equal to itself; pandas uses it for
    # missing cells. None is handled as well so the function is safe outside pandas.
    if text is None or text != text:
      return ''
    # e.g. a purely numeric value that was parsed as a number instead of a string
    text = str(text)

  # replace non-alphanumeric characters with space
  text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

  # collapse runs of spaces into one
  text = re.sub(' +', ' ', text)

  # lowercase (the stopword set is lowercase)
  text = text.lower()

  # tokenize
  tokens = word_tokenize(text)

  # remove stopwords, then lemmatize what is left
  tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stopwords]

  # join tokens
  return ' '.join(tokens)

# read the sampled reviews written by the previous step
df = pd.read_csv('data/Books_rating_relevant_columns.csv')

# clean the review body and the summary separately (one progress bar each)
df['stemmed_text'] = df['text'].progress_apply(stemmer)
df['stemmed_summary'] = df['summary'].progress_apply(stemmer)

# summary first, then body; strip the stray space left when either side is empty
df['stemmed_summary_text'] = (
    df['stemmed_summary'] + ' ' + df['stemmed_text']
).progress_apply(lambda x: x.strip())

# blank out remaining NaN cells, keep only rows that still have text, and
# drop every column except the score and the combined cleaned text
df = (
    df.fillna('')
    .loc[lambda frame: frame['stemmed_summary_text'] != '']
    .drop(columns=['text', 'summary', 'stemmed_summary', 'stemmed_text'])
)

# write the processed data for the vectorizing step
df.to_csv('data/Books_rating_stemmed.csv', index=False)

# preview the first 5 rows
df.head()

100%|██████████| 1000000/1000000 [07:43<00:00, 2156.07it/s]
100%|██████████| 1000000/1000000 [00:48<00:00, 20680.71it/s]
100%|██████████| 1000000/1000000 [00:00<00:00, 1865157.22it/s]


Unnamed: 0,score,stemmed_summary_text
0,5.0,best edition classic always recommended yale e...
1,5.0,great book required reading 16 yr old son book...
2,4.0,book consultant plain spoken finished book tak...
3,1.0,outrageously bad wow one ridiculous story ever...
4,4.0,cunning determination crew mutinied threatens ...


## Vectorizing

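Split the data into training and test sets, then vectorize the text using TF-IDF (the vectorizer is fitted on the training set only).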

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

tqdm.pandas()

df = pd.read_csv('data/Books_rating_stemmed.csv')
print(df.shape)

# read_csv parses cells such as "nan" or "null" as missing values, and a cleaned
# review can consist of exactly such a token. TfidfVectorizer rejects NaN
# documents, so drop any such rows before splitting.
df = df.dropna(subset=['stemmed_summary_text'])

vectorizer = TfidfVectorizer(max_features=30_000, sublinear_tf=True)

# create train and test sets (80/20 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df['stemmed_summary_text'], df['score'], test_size=0.2, random_state=1
)

# fit vectorizer on the training set only, so nothing from the test set leaks in
vectorizer.fit(X_train)

# transform training and test sets (from here on X_train / X_test are TF-IDF matrices)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# save vectorizer to disk; the context manager makes sure the file is closed
with open('models/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
print('Vectorizer saved')

# get shape of training and test sets
print(X_train.shape)
print(X_test.shape)

(999999, 2)
Vectorizer saved
(799999, 30000)
(200000, 30000)


## Model Testing

Train a variety of classifiers on the TF-IDF features and evaluate each one on the test set.

In [12]:

from sklearn.metrics import accuracy_score, mean_squared_error

def test_model(model, fit=True):
    """Fit a model on the training split and print its test-set metrics.

    Reads the notebook-level X_train, y_train, X_test and y_test.

    Args:
        model: An estimator exposing ``fit`` and ``predict``.
        fit: When True (the default, matching the previous behaviour) the model
            is fitted on X_train / y_train before scoring. Pass False for an
            estimator that is already fitted to avoid training it a second time.

    Returns:
        None. The model's class name, accuracy, MSE and RMSE are printed.
    """
    if fit:
        model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # Derive RMSE from MSE directly: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in newer scikit-learn
    # releases, and this also avoids computing the error twice.
    rmse = mse ** 0.5

    print('Model:', model.__class__.__name__)
    print('-'*30)
    print('Accuracy:', accuracy)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('')

In [13]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers and the hyperparameter grid searched for each of them.
# Estimators whose fit is stochastic get a fixed random_state (the same seed that
# is used for sampling and for the train/test split) so the comparison is
# reproducible from run to run.
models = {
    'Naive Bayes': {
        'model': MultinomialNB(),
        'param_grid': {'alpha': [0.1, 0.5, 1.0]}
    },
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000, n_jobs=-1, solver='saga', random_state=1),
        'param_grid': {'C': [0.1, 1.0, 10.0], 'penalty': ['l1', 'l2']}
    },
    'Support Vector Machine': {
        'model': LinearSVC(max_iter=1000, random_state=1),
        'param_grid': {'C': [0.1, 1.0, 10.0]}
    },
    'Random Forest': {
        'model': RandomForestClassifier(n_jobs=-1, random_state=1),
        'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
    },
    'K-Nearest Neighbors': {
        'model': KNeighborsClassifier(n_jobs=-1),
        'param_grid': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
    }
}

# model name -> best estimator, its parameters and its best cross-validation score
results = {}

for model_name, model_info in models.items():
    model, param_grid = model_info['model'], model_info['param_grid']

    # 5-fold grid search over this model's grid, scored by accuracy, on all cores
    grid_search = GridSearchCV(
        model,
        param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # winning configuration found by the search
    best_model, best_params = grid_search.best_estimator_, grid_search.best_params_

    # print the held-out metrics (test_model fits the estimator again by default)
    test_model(best_model)

    # keep the winner together with its cross-validated accuracy
    results[model_name] = dict(
        best_model=best_model,
        best_params=best_params,
        accuracy=grid_search.best_score_,
    )

# Plot the results using a line graph. The plotted value is each model's best
# cross-validation accuracy from the grid search (GridSearchCV.best_score_),
# not the test-set accuracy printed by test_model.
fig, ax = plt.subplots()
models_names = list(results.keys())
accuracy_values = [results[name]['accuracy'] for name in models_names]

ax.plot(models_names, accuracy_values, marker='o', linestyle='-')
ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison')
# Restyle the tick labels that plot() already created. Calling set_xticklabels()
# on its own replaces the labels without fixing the tick positions, which makes
# matplotlib emit a warning and can mislabel ticks.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.show()

Model: MultinomialNB
------------------------------
Accuracy: 0.63337
MSE: 1.586955
RMSE: 1.2597440216170903





KeyboardInterrupt: 

### Naive Bayes

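Of the Naive Bayes classifiers, we are going to use Multinomial Naive Bayes, because it is designed for discrete features such as word counts (and, in practice, also works with fractional TF-IDF weights), which fits our task of predicting a rating from review text.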

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with additive smoothing alpha=0.1
naive_bayes = MultinomialNB(alpha=0.1)

# test_model() fits the estimator on the training split before scoring it, so a
# separate fit() call here would only train the model a second time.
test_model(naive_bayes)
print('Multinomial Naive Bayes Trained!')

Multinomial Naive Bayes Trained!
Model: MultinomialNB
Accuracy: 0.63337
MSE: 1.586955
RMSE: 1.2597440216170903


### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the saga solver; the solver is stochastic, so the seed
# is fixed (same seed as the sampling and the train/test split) for reproducibility
logistic_regression = LogisticRegression(max_iter=1000, n_jobs=-1, solver='saga', random_state=1)

# test_model() fits the estimator on the training split before scoring it, so a
# separate fit() call here would only train the model a second time.
test_model(logistic_regression)
print('Logistic Regression Trained!')

Logistic Regression Trained!
Model: LogisticRegression
Accuracy: 0.69451
MSE: 0.837685
RMSE: 0.9152513316024183


### Support Vector Machine

In [11]:
from sklearn.svm import LinearSVC

# Linear support vector classifier; seed fixed (same seed as the sampling and the
# train/test split) so repeated runs give the same model
svc = LinearSVC(max_iter=1000, random_state=1)

# test_model() fits the estimator on the training split before scoring it, so a
# separate fit() call here would only train the model a second time.
test_model(svc)
print('SVC Trained!')



SVC Trained!




Model: LinearSVC
Accuracy: 0.690385
MSE: 0.897875
RMSE: 0.9475626628355509
