# Data Processing

## Remove Rows and Sample Randomly

We keep only the columns that are relevant to us (review summary, review text, and score), take a random sample of 500,000 reviews, and save the result to a new CSV file.

In [8]:
import pandas as pd

# Raw review columns we keep, mapped to the shorter names used in later cells.
column_names = {
    'review/summary': 'summary',
    'review/text': 'text',
    'review/score': 'score',
}

# Load only the columns we need: the raw file is large and every other column
# would be parsed just to be thrown away.
df = pd.read_csv('data/Books_rating.csv', usecols=list(column_names))

# `usecols` keeps the file's own column order, so select explicitly to get
# summary/text/score, then rename by name rather than by position.
df = df[list(column_names)].rename(columns=column_names)

# sample a portion of the data (fixed seed so the sample is reproducible);
# clamp to the row count so a smaller input file does not raise
n = 500_000
df = df.sample(n=min(n, len(df)), random_state=1)

# save to new csv file and view data
df.to_csv('data/Books_rating_relevant_columns.csv', index=False)
df.head()

Unnamed: 0,summary,text,score
2896109,Best edition of this classic.,I've always recommended this Yale edition of F...,5.0
2381153,Great Book!!,This is required reading for my 16 yr old son....,5.0
1028690,Not just a book for consultant,"Plain-spoken, finished the book only has taken...",4.0
1945977,Outrageously Bad,Wow... this is one of the most ridiculous stor...,1.0
2812693,Cunning and determination,A crew has mutinied and threatens to hang thei...,4.0


## Stopwords, Lemmatization, and Vectorization

Process the text: strip unneeded characters, lowercase, remove stopwords, and lemmatize. The cleaned text is vectorized in the next section.

In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from tqdm import tqdm

# enable `.progress_apply(...)` on pandas objects; the stemming cell below
# uses it to show a progress bar over the rows
tqdm.pandas()

In [10]:
lemmatizer = WordNetLemmatizer()
# Keep the stop-word set under its own name. Rebinding `stopwords` would shadow
# the imported NLTK corpus, and this cell would then fail when run a second
# time (a set has no `.words`).
stop_words = set(stopwords.words('english'))


def stemmer(text):
    """Normalize one piece of review text for vectorization.

    Despite the name, this lemmatizes (WordNet) rather than stems.

    Steps: replace every non-alphanumeric character with a space, collapse
    runs of spaces, lowercase, tokenize, drop English stop words, lemmatize
    each remaining token, and join the tokens with single spaces.

    Args:
        text: The raw text. Missing values (``None`` or NaN) are accepted;
            any other non-string value is converted with ``str``.

    Returns:
        The cleaned, space-separated string; ``''`` for missing input.
    """
    # NaN is the only value that is not equal to itself; pandas uses it for
    # missing text cells.
    if text is None or text != text:
        return ''

    # clean the text
    text = re.sub(r'[^a-zA-Z0-9]', ' ', str(text))
    text = re.sub(' +', ' ', text)
    text = text.lower()

    # tokenize, remove stopwords, then lemmatize what is left
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

    return ' '.join(tokens)

In [11]:
df = pd.read_csv('data/Books_rating_relevant_columns.csv')

# apply stemmer to text and summary (missing values come back as '')
stemmed_text = df['text'].progress_apply(stemmer)
stemmed_summary = df['summary'].progress_apply(stemmer)

# Join summary and text. When either side is empty the join leaves a stray
# leading/trailing space, so strip it with the vectorised string accessor
# instead of a per-row Python lambda.
# Building a fresh frame keeps only the columns we save and avoids in-place
# edits on a filtered copy.
df = pd.DataFrame({
    'score': df['score'],
    'stemmed_summary_text': (stemmed_summary + ' ' + stemmed_text).str.strip(),
})

# remove rows that have no text left after cleaning
df = df[df['stemmed_summary_text'] != '']

# Remove rows without a label. Filling a missing score with '' would not clean
# it: it is written to the CSV as an empty field and read back as NaN.
df = df.dropna(subset=['score'])

# save to new csv file and view the data
df.to_csv('data/Books_rating_stemmed.csv', index=False)
df.head()

100%|██████████| 500000/500000 [03:09<00:00, 2642.66it/s]
100%|██████████| 500000/500000 [00:17<00:00, 29370.41it/s]
100%|██████████| 500000/500000 [00:00<00:00, 2473374.53it/s]


Unnamed: 0,score,stemmed_summary_text
0,5.0,best edition classic always recommended yale e...
1,5.0,great book required reading 16 yr old son book...
2,4.0,book consultant plain spoken finished book tak...
3,1.0,outrageously bad wow one ridiculous story ever...
4,4.0,cunning determination crew mutinied threatens ...


## Vectorizing

Vectorize the text using TF-IDF.

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle
from tqdm import tqdm

# enables `.progress_apply(...)` on pandas objects
# NOTE(review): nothing visible in this section calls it — confirm it is still needed
tqdm.pandas()

In [5]:
# Read the text column verbatim. With pandas' default NA parsing, a document
# whose whole cleaned text is a token such as "nan" or "null" would be loaded
# as a float NaN, which TfidfVectorizer rejects. Only an empty `score` field is
# treated as missing.
df = pd.read_csv(
    'data/Books_rating_stemmed.csv',
    dtype={'stemmed_summary_text': str},
    keep_default_na=False,
    na_values={'score': ['']},
)
print(f'Shape of dataframe: {df.shape}')

# create train and test sets (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    df['stemmed_summary_text'], df['score'], test_size=0.2, random_state=1)

# init the vectorizer and fit on the training set only, so no vocabulary or
# IDF statistics leak in from the test set
vectorizer = TfidfVectorizer(max_features=30_000, sublinear_tf=True)
vectorizer.fit(X_train)

# transform training and test sets
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# get shape of training and test sets
print(f'Shape of training set: {X_train.shape}')
print(f'Shape of testing set: {X_test.shape}')

# save vectorizer to disk; the context manager closes (and flushes) the file
# even if pickling fails
with open('models/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
print('Vectorizer saved!')

Shape of dataframe: (499999, 2)
Shape of training set: (399999, 30000)
Shape of testing set: (100000, 30000)
Vectorizer saved!


## Model Testing

We first compare several kinds of models with their default settings, then test the most promising ones further by tuning their hyperparameters.

In [12]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error

In [14]:
# Candidate models, keyed by the label used in the reports and plots.
# The commented-out entries are kept as a record of models that are
# currently excluded from the comparison.
models = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(n_jobs=-1, solver='saga'),
    'LinearSVC': LinearSVC(),
    # 'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    'SGDClassifier': SGDClassifier(n_jobs=-1),
    # 'RandomForestClassifier': RandomForestClassifier(n_jobs=-1),
    # 'GradientBoostingClassifier': GradientBoostingClassifier()
}

# Per-model results, in the same order as `model_names`.
# The lists use plural names so they never collide with the per-model scalars
# (`accuracy`, `mse`, `rmse`) computed inside the loop.
model_names = []
accuracies = []
mses = []
rmses = []

# Fit and evaluate every model on the same train/test split
for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Scores are ordinal, so MSE/RMSE also show how far off the wrong
    # predictions are. RMSE is derived from the MSE instead of using the
    # `squared=False` flag, which recent scikit-learn releases removed.
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5

    # Print classification report for more detailed metrics
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred))

    # Store model name and metrics for plotting
    model_names.append(name)
    accuracies.append(accuracy)
    mses.append(mse)
    rmses.append(rmse)

# One bar chart per metric: (y-axis label, chart title, values)
metric_plots = (
    ('Accuracy', 'Model Accuracies', accuracies),
    ('MSE', 'Model MSE', mses),
    ('RMSE', 'Model RMSE', rmses),
)

for y_label, title, values in metric_plots:
    plt.bar(model_names, values)
    plt.xlabel('Models')
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()

Classification Report for MultinomialNB:
              precision    recall  f1-score   support

         1.0       0.72      0.22      0.34      6704
         2.0       0.47      0.01      0.03      5135
         3.0       0.39      0.03      0.06      8384
         4.0       0.38      0.06      0.10     19548
         5.0       0.63      0.99      0.77     60229

    accuracy                           0.62    100000
   macro avg       0.52      0.26      0.26    100000
weighted avg       0.56      0.62      0.51    100000



AttributeError: 'numpy.float64' object has no attribute 'append'

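At this point we decided to test LogisticRegression and LinearSVC further, since Naive Bayes did not produce favorable results: it predicts 5 stars for almost every review (recall of 0.99 for 5.0, but at most 0.22 for every other score), giving a macro-averaged F1 of only 0.26.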

### Logistic Regression

### LinearSVC