In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_pickle('data/pkl_vector_10k_reviews.pkl')

# Extract only the feature vectors
features = df.vector.to_list()
vector_df = pd.DataFrame(features)

# Prepare features and labels
X = vector_df  # Features
y = df.good_rating  # Labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Train the classifier
gnb.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = gnb.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.982

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.98      0.84        96
           1       1.00      0.98      0.99      1904

    accuracy                           0.98      2000
   macro avg       0.87      0.98      0.91      2000
weighted avg       0.99      0.98      0.98      2000



In [2]:
f1_1_score = 0
f1_0_score = 0
iter_accuracy = 0

num_trials = 20

for test_iter in range(num_trials): 

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    gnb = GaussianNB()

    gnb.fit(X_train, y_train)

    y_pred = gnb.predict(X_test)
    iter_accuracy += accuracy_score(y_test, y_pred)

    result_dict = classification_report(y_test, y_pred, output_dict=True)
    f1_1_score += result_dict['1']['f1-score']
    f1_0_score += result_dict['0']['f1-score']

# Make predictions and evaluate
print(f"Avg Accuracy: {iter_accuracy/num_trials}")
print("\nAvg f1-score for cluster 0:\n", {f1_0_score/num_trials})
print("\nAvg f1-score for cluster 1:\n", {f1_1_score/num_trials})

Avg Accuracy: 0.9819999999999997

Avg f1-score for cluster 0:
 {0.8392857142857139}

Avg f1-score for cluster 1:
 {0.9904661016949149}
