In [1]:
### sklearn code for Naive Bayes

In [31]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import (StandardScaler, LabelEncoder,
                                 MultiLabelBinarizer, OneHotEncoder,
                                 FunctionTransformer)
from sklearn.model_selection import (train_test_split, GridSearchCV, RandomizedSearchCV)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import (MultinomialNB, GaussianNB, BernoulliNB)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
df = pd.read_csv('sample_data/final_final_fr_fr.csv')

# Print unique values for each column
for column in df.columns:
    print(f"Unique values in {column}: {df[column].unique()}")

In [59]:
#### same as Karyna's implementation (for data organization) from sklearn_logistic_regression
# Define vectorized_one_hot function
def vectorized_one_hot(answers, options, attribute_to_index):
    num_samples = len(answers)
    num_attributes = len(options)
    one_hot_matrix = np.zeros((num_samples, num_attributes))

    for i, ans_list in enumerate(answers):
        for ans in ans_list:
            ans = ans.strip()
            if ans in attribute_to_index:
                one_hot_matrix[i, attribute_to_index[ans]] = 1
    return one_hot_matrix

# Question definitions
q1 = "Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)"
q2 = "Q2: How many ingredients would you expect this food item to contain?"
q3 = "Q3: In what setting would you expect this food to be served? Please check all that apply"
q4 = "Q4: How much would you expect to pay for one serving of this food item?"
q5 = "Q5: What movie do you think of when thinking of this food item?"
q6 = "Q6: What drink would you pair with this food item?"
q7 = "Q7: When you think about this food item, who does it remind you of?"
q8 = "Q8: How much hot sauce would you add to this food item?"
t = "Label"

# Data cleaning
df[q2] = df[q2].replace("none", '0').astype(str)
df[q4] = df[q4].replace("none", '0').astype(str)

# Define options for categorical columns
q1_options = [1,2,3,4,5]
q3_options = ['none','Week day lunch','Week day dinner','Weekend lunch','Weekend dinner','At a party', 'Late night snack']
q7_options = ['Parents','Siblings','Friends', 'Teachers', 'Strangers' , 'none']
q8_options = ['I will have some of this food item with my hot sauce', 'A lot (hot)', 'A moderate amount (medium)', 'A little (mild)', 'none']

# Create attribute to index mappings
q3_attribute_to_index = {attr: idx for idx, attr in enumerate(q3_options)}
q7_attribute_to_index = {attr: idx for idx, attr in enumerate(q7_options)}
q8_attribute_to_index = {attr: idx for idx, attr in enumerate(q8_options)}

# Manual feature engineering approach
df[q2] = pd.to_numeric(df[q2])
df[q1] = pd.to_numeric(df[q1])
df[q4] = pd.to_numeric(df[q4])

numerical_features = df[[q1, q2, q4]].values

# Multi-hot encoding
q3_hot = vectorized_one_hot([ans.split(",") for ans in df[q3].astype(str).tolist()], q3_options, q3_attribute_to_index)
q7_hot = vectorized_one_hot([ans.split(",") for ans in df[q7].astype(str).tolist()], q7_options, q7_attribute_to_index)
q8_hot = vectorized_one_hot([ans.split(",") for ans in df[q8].astype(str).tolist()], q8_options, q8_attribute_to_index)

# Frequency encoding
q5_encoded = df[q5].map(df[q5].value_counts(normalize=True)).to_frame()
q6_encoded = df[q6].map(df[q6].value_counts(normalize=True)).to_frame()

# Combine features
X = np.hstack([
    numerical_features,
    q3_hot,
    q7_hot,
    q8_hot,
    q5_encoded,
    q6_encoded
])

# Sparse version
X_sparse = sparse.hstack([
    sparse.csr_matrix(numerical_features),
    sparse.csr_matrix(q3_hot),
    sparse.csr_matrix(q7_hot),
    sparse.csr_matrix(q8_hot),
    sparse.csr_matrix(q5_encoded),
    sparse.csr_matrix(q6_encoded)
])

# Encode labels
le = LabelEncoder()
y = le.fit_transform(df["Label"])

# train-validatin-test split
X_train, X_temp, y_train, y_temp = train_test_split(X_sparse, y, test_size=0.2, random_state=42)  # 80% train, 20% temp (will use temp for val and test)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 10% val, 10% test

In [60]:
#### new stuff for naive bayes
nb_model = MultinomialNB()
# nb_model = GaussianNB()       # these two dont actually work well for our data
# nb_model = BernoulliNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

# Evaluate Performance -- BEFORE ANY TUNING
print("BEFORE TUNING")
train_accuracy = accuracy_score(y_train, nb_model.predict(X_train))
val_accuracy = accuracy_score(y_val, nb_model.predict(X_val))
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

print("\nClassification Report:") # 0= Pizza, 1= Shawarma, 2= Sushi
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n\n")

BEFORE TUNING
Training Accuracy: 0.73
Validation Accuracy: 0.71
Testing Accuracy: 0.73

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.72      0.70        43
           1       0.84      0.80      0.82        70
           2       0.62      0.63      0.63        52

    accuracy                           0.73       165
   macro avg       0.72      0.72      0.72       165
weighted avg       0.73      0.73      0.73       165


Confusion Matrix:
[[31  1 11]
 [ 5 56  9]
 [ 9 10 33]]





In [54]:
#### Tuning parameters to try to get better accuracy
## attempt 1 to tune parameters: use grid search
#  by including 0 as option for alpha, we are allowing MLE as an option (alpha>0 --> MAP)
param_grid = {'alpha': [0, 0.01, 0.1, 1.0, 10.0],
              'fit_prior': [True, False]}

grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
grid_params = grid_search.best_params_

new_grid_model = MultinomialNB(
    alpha = grid_params['alpha'],
    fit_prior = grid_params['fit_prior']
)
new_grid_model.fit(X_train, y_train)
# new_grid_y_pred = new_grid_model.predict(X_test)                                                    NO LONGER NEED HERE WITH NEW ORGANIZATION

# Evaluate on validation set before testing -- USING GRID SEARCH
grid_val_accuracy = accuracy_score(y_val, new_grid_model.predict(X_val))  # Validation set accuracy
print(f"Validation Accuracy after Grid Search: {grid_val_accuracy:.2f}")

# # Evaluate Performance -- USING GRID SEARCH
# print("TUNING USING GRID SEARCH")
# accuracy = accuracy_score(y_test, new_grid_y_pred) # now using the new y_pred
# print(f"Accuracy: {accuracy:.2f}")

# print("\nClassification Report:") # 0= Pizza, 1= Shawarma, 2= Sushi
# print(classification_report(y_test, new_grid_y_pred))

# print("\nConfusion Matrix:")
# print(confusion_matrix(y_test, new_grid_y_pred))
# print("\n\n")


## attempt 2 to tune parameters: use random search
# Define parameter distribution -- by including 0 as option for alpha, we are allowing MLE as an option (alpha>0 --> MAP)
param_dist = {
    'alpha': [0, 0.01, 0.1, 1.0, 10.0],
    'fit_prior': [True, False]
}

# RandomizedSearchCV
random_search = RandomizedSearchCV(MultinomialNB(), param_distributions=param_dist, n_iter=5, cv=10, scoring='accuracy', random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", random_search.best_params_)
rand_params = random_search.best_params_

# Create a model with the best parameters
new_rand_model = MultinomialNB(
    alpha=rand_params['alpha'],
    fit_prior=rand_params['fit_prior']
)

# Train on full training data
new_rand_model.fit(X_train, y_train)


# Evaluate on validation set before testing -- USING RANDOM SEARCH
rand_val_accuracy = accuracy_score(y_val, new_rand_model.predict(X_val))  # Validation set accuracy
print(f"Validation Accuracy after Random Search: {rand_val_accuracy:.2f}")


Best parameters: {'alpha': 0.1, 'fit_prior': True}
Validation Accuracy after Grid Search: 0.71
Best parameters: {'fit_prior': False, 'alpha': 0}
Validation Accuracy after Random Search: 0.70


  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


In [58]:
#### choosing the best tuned parameters and comparing against the un-tuned parameters
# figure out if grid search or random search did a better job tuning
if grid_val_accuracy > rand_val_accuracy:
    print("BETTER TUNING: GRID SEARCH")
    tuned_params = grid_params
    better_model = new_grid_model
else:
    print("BETTER TUNING: RANDOM SEARCH")
    tuned_params = rand_params
    better_model = new_rand_model

tuned_y_pred = new_grid_model.predict(X_test)

# Evaluate Performance -- USING BETTER (of 2) TUNING
accuracy = accuracy_score(y_test, tuned_y_pred) # now using the new y_pred
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:") # 0= Pizza, 1= Shawarma, 2= Sushi
print(classification_report(y_test, tuned_y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, tuned_y_pred))
print("\n\n")

# Evaluate Performance -- RECALL BEFORE ANY TUNING
print("BEFORE TUNING")
train_accuracy = accuracy_score(y_train, nb_model.predict(X_train))
val_accuracy = accuracy_score(y_val, nb_model.predict(X_val))
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

print("\nClassification Report:") # 0= Pizza, 1= Shawarma, 2= Sushi
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n\n")

BETTER TUNING: GRID SEARCH
Accuracy: 0.72

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.72      0.69        43
           1       0.84      0.80      0.82        70
           2       0.61      0.60      0.60        52

    accuracy                           0.72       165
   macro avg       0.70      0.71      0.70       165
weighted avg       0.72      0.72      0.72       165


Confusion Matrix:
[[31  1 11]
 [ 5 56  9]
 [11 10 31]]



BEFORE TUNING
Training Accuracy: 0.73
Validation Accuracy: 0.71
Testing Accuracy: 0.73

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.72      0.70        43
           1       0.84      0.80      0.82        70
           2       0.62      0.63      0.63        52

    accuracy                           0.73       165
   macro avg       0.72      0.72      0.72       165
weighted avg       0.73      0.73      0.73       165


C