In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

In [3]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
def data_split(pandas_df):
  #Creating a randomized vector whose length is the length of our dataset
  total_count = pandas_df.shape[0]
  np.random.seed(0)
  shuffle = np.random.permutation(total_count)

  #Splitting the dataset into 'x' and 'y', so that it can be used in our model
  #'x' represents the reviews and 'y' represents the sentiments
  x = pandas_df.iloc[shuffle, 0]
  y = pandas_df.iloc[shuffle, 1]

  #splitting the dataset in training and testing sets in a 80:20 ratio
  split = int(total_count * 0.8) + 1  #the required 80% split
  x_train = x[:split]
  y_train = y[:split]

  x_test = x[split:]
  y_test = y[split:]

  return x_train, y_train, x_test, y_test

In [7]:
def review_cleaner(review):
  stopwords = nltk.corpus.stopwords.words("english")
  porter = PorterStemmer()
  # Make sure the reviews are not case sensitive
  review = review.lower()
  # Tokenize the words from the review
  words = nltk.word_tokenize(review)
  # Stemming and stopwords removal
  processed_words = [porter.stem(word) for word in words if word not in stopwords]
  # Join back to a single string
  return ' '.join(processed_words)

In [8]:
def MNB_pipeline(param_grid, cleaning_fn, x_train, y_train):
  # Set up the pipeline
  pipeline = Pipeline([
      ('tfidf', TfidfVectorizer(preprocessor=cleaning_fn)),  # Define review cleaner function if needed
      ('model', MultinomialNB())
  ])

  # Perform grid search
  grid_search = GridSearchCV(
      pipeline,
      param_grid=param_grid,
      scoring='f1_weighted',   # Use 'f1_weighted' for multi-class classification
      refit=True,
      cv=3,                    # 3-fold cross-validation
      verbose=2,
  )

  # Fit the grid search on training data
  grid_search.fit(x_train, y_train)

  # Output the best parameters and score
  print("Best parameters found:", grid_search.best_params_)
  print("Best F1 score:", grid_search.best_score_)

  return grid_search.best_estimator_



---

#**Approach 1 - Fuck it, we use all classes.**

In [8]:
# Load your Excel data into a pandas DataFrame
# Replace 'file_path.xlsx' with the actual file path of your Excel sheet
df = pd.read_csv('Review.csv')

# Select only the 'review' and 'rating' columns
df_filtered = df[['Review', 'Rating']]

val = df_filtered['Rating'].value_counts()[1]

# Get 10,000 instances of reviews with rating 1
df_rating_1 = df_filtered[df_filtered['Rating'] == 1].sample(n=val, random_state=1)

# Get 10,000 random samples for ratings 2, 3, 4, and 5 to balance the dataset
df_rating_2 = df_filtered[df_filtered['Rating'] == 2].sample(n=val, random_state=1)
df_rating_3 = df_filtered[df_filtered['Rating'] == 3].sample(n=val, random_state=1)
df_rating_4 = df_filtered[df_filtered['Rating'] == 4].sample(n=val, random_state=1)
df_rating_5 = df_filtered[df_filtered['Rating'] == 5].sample(n=val, random_state=1)

# Combine the samples into a single DataFrame
approach1_df = pd.concat([df_rating_1, df_rating_2, df_rating_3, df_rating_4, df_rating_5])

# Reset the index for neatness
approach1_df.reset_index(drop=True, inplace=True)

# Removing nan values
approach1_df.dropna(inplace=True)

# Display the first few rows of the final DataFrame
print(approach1_df.head())

                                              Review  Rating
0  Our 2008 Town & Country shuts off while drivin...       1
1  I purchased this new in 2012 and paid cash for...       1
2  Update:  12/28/2019 - GPS/INFOTAINMENT SCREEN ...       1
3  I thought I was getting a good deal. A mint fu...       1
4  I have had a rattle in my new VW atlas after t...       1


In [17]:
app1_x_train, app1_y_train, app1_x_test, app1_y_test = data_split(approach1_df)

# Define the parameter grid for tuning
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],   # Unigram, bigram, trigram
    'tfidf__max_features': [10000],       # Vocabulary size
    'model__alpha': [0.1, 0.5, 1.0, 10.0]             # Smoothing parameter
}

app1_best_model = MNB_pipeline(param_grid, review_cleaner, app1_x_train, app1_y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END model__alpha=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 1); total time= 2.6min
[CV] END model__alpha=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 1); total time= 2.9min
[CV] END model__alpha=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 1); total time= 2.7min
[CV] END model__alpha=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 2); total time= 2.8min
[CV] END model__alpha=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 2); total time= 2.1min
[CV] END model__alpha=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 2); total time= 2.1min
[CV] END model__alpha=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 3); total time= 2.3min
[CV] END model__alpha=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 3); total time= 2.3min
[CV] END model__alpha=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 3); total time= 2.3min
[CV] END model__alpha=0.5, tfidf__max_f

In [18]:
# Train the final model with the best parameters
app1_best_model.fit(app1_x_train, app1_y_train)  # X_train_sparse is the precomputed training matrix

# Evaluate on the test set
y_test_pred_app1 = app1_best_model.predict(app1_x_test)  # X_test_sparse is the precomputed test matrix
app1_test_accuracy = accuracy_score(app1_y_test, y_test_pred_app1)
app1_test_f1 = f1_score(app1_y_test, y_test_pred_app1, average='weighted')

print(f"Test Set Performance: Accuracy = {app1_test_accuracy:.4f}, F1 Score = {app1_test_f1:.4f}")

Test Set Performance: Accuracy = 0.4659, F1 Score = 0.4596


In [21]:
class_accuracies = {}

data = confusion_matrix(app1_y_test, y_test_pred_app1)

for i, class_name in enumerate(['1', '2', '3', '4', '5']):
    true_positives = data[i, i]
    false_negatives = np.sum(data[i, :]) - true_positives  # Sum of the row - true positives
    accuracy = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    class_accuracies[class_name] = accuracy

# Print class-wise accuracy
for class_name, accuracy in class_accuracies.items():
    print(f'Accuracy for {class_name}: {accuracy:.4f}')

app1_y_probs = app1_best_model.predict_proba(app1_x_test)  # Probabilities for the positive class

# Calculate ROC-AUC
roc_auc_app1 = roc_auc_score(app1_y_test, app1_y_probs, multi_class = 'ovr')
print(f"ROC-AUC Score: {roc_auc_app1:.4f}")

Accuracy for 1: 0.6118
Accuracy for 2: 0.3250
Accuracy for 3: 0.3225
Accuracy for 4: 0.4553
Accuracy for 5: 0.6134
ROC-AUC Score: 0.8111


#**Approach 2 - consider twice as many data points for neutral as positive or negative**


---

####The dataset that I chose had about 290000 datapoints. However, the data was skewed towards right for Ratings, i.e., more people rated their cars as 4 or 5 than people who rated them 1 or 2. Rating of 1 had the least with approx 11000. Hence, I chose that same amount from ratings 2, 3, 4 and 5. Then I boxed them as pos, neg or nil.




In [9]:
val = df_filtered['Rating'].value_counts()[1]

# Get 10,000 instances of reviews with rating 1
df_rating_1 = df_filtered[df_filtered['Rating'] == 1].sample(n=val, random_state=1)

# Get 10,000 random samples for ratings 2, 3, 4, and 5 to balance the dataset
df_rating_2 = df_filtered[df_filtered['Rating'] == 2].sample(n=val, random_state=1)
df_rating_3 = df_filtered[df_filtered['Rating'] == 3].sample(n=val*2, random_state=1)
df_rating_4 = df_filtered[df_filtered['Rating'] == 4].sample(n=val, random_state=1)
df_rating_5 = df_filtered[df_filtered['Rating'] == 5].sample(n=val, random_state=1)

# Combine the samples into a single DataFrame
approach2_df = pd.concat([df_rating_1, df_rating_2, df_rating_3, df_rating_4, df_rating_5])

# Relabel the ratings
# 1 and 2 -> neg, 3 -> nil, 4 and 5 -> pos
def relabel_rating(rating):
    if rating in [1, 2]:
        return 'neg'
    elif rating == 3:
        return 'nil'
    elif rating in [4, 5]:
        return 'pos'

# Apply the relabeling function to the 'rating' column
approach2_df['Rating'] = approach2_df['Rating'].apply(relabel_rating)

# Reset the index for neatness
approach2_df.reset_index(drop=True, inplace=True)

# Removing nan values
approach2_df.dropna(inplace=True)

# Display the first few rows of the final DataFrame
print(approach2_df.head())

                                              Review Rating
0  Our 2008 Town & Country shuts off while drivin...    neg
1  I purchased this new in 2012 and paid cash for...    neg
2  Update:  12/28/2019 - GPS/INFOTAINMENT SCREEN ...    neg
3  I thought I was getting a good deal. A mint fu...    neg
4  I have had a rattle in my new VW atlas after t...    neg




---
#### Next, from the new dataset, I randomized it and split it in 80:20 ratio as training and test data.


In [10]:
app2_x_train, app2_y_train, app2_x_test, app2_y_test = data_split(approach2_df)

# Define the parameter grid for tuning
param_grid_2 = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],   # Unigram, bigram, trigram
    'tfidf__max_features': [5000, 7000, 10000],       # Vocabulary size
    'model__alpha': [0.1, 0.5, 1.0]             # Smoothing parameter
}

app2_best_model = MNB_pipeline(param_grid_2, review_cleaner, app2_x_train, app2_y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 2.9min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 2.8min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 2.8min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 3.0min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 3.2min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 3.0min
[CV] END model__alpha=0.1, tfidf__max_features=7000, tfidf__ngram_range=(1, 1); total time= 2.9min
[CV] END model__alpha=0.1, tfidf__max_features=7000, tfidf__ngram_range=(1, 1); total time= 2.8min
[CV] END model__alpha=0.1, tfidf__max_features=7000, tfidf__ngram_range=(1, 1); total time= 2.8min
[CV] END model__alpha=0.1, tfidf__max_features=7

In [11]:
# Train the final model with the best parameters
app2_best_model.fit(app2_x_train, app2_y_train)  # X_train_sparse is the precomputed training matrix

# Evaluate on the test set
y_test_pred_app2 = app2_best_model.predict(app2_x_test)  # X_test_sparse is the precomputed test matrix
app2_test_accuracy = accuracy_score(app2_y_test, y_test_pred_app2)
app2_test_f1 = f1_score(app2_y_test, y_test_pred_app2, average='weighted')

print(f"Test Set Performance: Accuracy = {app2_test_accuracy:.4f}, F1 Score = {app2_test_f1:.4f}")

Test Set Performance: Accuracy = 0.6650, F1 Score = 0.6559


In [12]:
class_accuracies = {}

data = confusion_matrix(app2_y_test, y_test_pred_app2)

for i, class_name in enumerate(['pos', 'nil', 'neg']):
    true_positives = data[i, i]
    false_negatives = np.sum(data[i, :]) - true_positives  # Sum of the row - true positives
    accuracy = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    class_accuracies[class_name] = accuracy

# Print class-wise accuracy
for class_name, accuracy in class_accuracies.items():
    print(f'Accuracy for {class_name}: {accuracy:.4f}')

app2_y_probs = app2_best_model.predict_proba(app2_x_test)  # Probabilities for the positive class

# Calculate ROC-AUC
roc_auc_app2 = roc_auc_score(app2_y_test, app2_y_probs, multi_class = 'ovr')
print(f"ROC-AUC Score: {roc_auc_app2:.4f}")

Accuracy for pos: 0.7193
Accuracy for nil: 0.4286
Accuracy for neg: 0.8425
ROC-AUC Score: 0.8382




---


#**Approach 3 - Consider 3 as 'neg' in hopes of priortizing positive datapoints**

In [13]:
# Load your Excel data into a pandas DataFrame
# Replace 'file_path.xlsx' with the actual file path of your Excel sheet
df = pd.read_csv("Review.csv")

# Select only the 'review' and 'rating' columns
df_filtered = df[['Review', 'Rating']]

# Get 10,000 instances of reviews with rating 1
df_rating_1 = df_filtered[df_filtered['Rating'] == 1].sample(n=10000, random_state=1)

# Get 10,000 random samples for ratings 2, 3, 4, and 5 to balance the dataset
df_rating_2 = df_filtered[df_filtered['Rating'] == 2].sample(n=10000, random_state=1)
df_rating_3 = df_filtered[df_filtered['Rating'] == 3].sample(n=10000, random_state=1)
df_rating_4 = df_filtered[df_filtered['Rating'] == 4].sample(n=15000, random_state=1)
df_rating_5 = df_filtered[df_filtered['Rating'] == 5].sample(n=15000, random_state=1)

# Combine the samples into a single DataFrame
approach3_df = pd.concat([df_rating_1, df_rating_2, df_rating_3, df_rating_4, df_rating_5])

# Relabel the ratings
# 1, 2 and 3 -> neg & 4 and 5 -> pos
def relabel_rating(rating):
    if rating in [1, 2, 3]:
        return 'neg'
    elif rating in [4, 5]:
        return 'pos'

# Apply the relabeling function to the 'rating' column
approach3_df['Rating'] = approach3_df['Rating'].apply(relabel_rating)

# Reset the index for neatness
approach3_df.reset_index(drop=True, inplace=True)

# Removing nan values
approach3_df.dropna(inplace=True)

# Display the first few rows of the final DataFrame
print(approach3_df.head())

                                              Review Rating
0  Our 2008 Town & Country shuts off while drivin...    neg
1  I purchased this new in 2012 and paid cash for...    neg
2  Update:  12/28/2019 - GPS/INFOTAINMENT SCREEN ...    neg
3  I thought I was getting a good deal. A mint fu...    neg
4  I have had a rattle in my new VW atlas after t...    neg


In [14]:
app3_x_train, app3_y_train, app3_x_test, app3_y_test = data_split(approach3_df)

# Define the parameter grid for tuning
param_grid_3 = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],   # Unigram, bigram, trigram
    'tfidf__max_features': [5000, 7000, 10000],       # Vocabulary size
    'model__alpha': [0.1, 0.5, 1.0]             # Smoothing parameter
}

app3_best_model = MNB_pipeline(param_grid_3, review_cleaner, app3_x_train, app3_y_train)

# Train the final model with the best parameters
app3_best_model.fit(app3_x_train, app3_y_train)  # X_train_sparse is the precomputed training matrix

# Evaluate on the test set
y_test_pred_app3 = app3_best_model.predict(app3_x_test)  # X_test_sparse is the precomputed test matrix
app3_test_accuracy = accuracy_score(app3_y_test, y_test_pred_app3)
app3_test_f1 = f1_score(app3_y_test, y_test_pred_app3, average='weighted')

print(f"Test Set Performance: Accuracy = {app3_test_accuracy:.4f}, F1 Score = {app3_test_f1:.4f}")

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 2.6min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 2.6min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 2.6min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 2.7min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 2.7min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 2.7min
[CV] END model__alpha=0.1, tfidf__max_features=7000, tfidf__ngram_range=(1, 1); total time= 2.5min
[CV] END model__alpha=0.1, tfidf__max_features=7000, tfidf__ngram_range=(1, 1); total time= 2.5min
[CV] END model__alpha=0.1, tfidf__max_features=7000, tfidf__ngram_range=(1, 1); total time= 2.5min
[CV] END model__alpha=0.1, tfidf__max_features=7

In [15]:
class_accuracies = {}

data = confusion_matrix(app3_y_test, y_test_pred_app3)

for i, class_name in enumerate(['pos', 'neg']):
    true_positives = data[i, i]
    false_negatives = np.sum(data[i, :]) - true_positives  # Sum of the row - true positives
    accuracy = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    class_accuracies[class_name] = accuracy

# Print class-wise accuracy
for class_name, accuracy in class_accuracies.items():
    print(f'Accuracy for {class_name}: {accuracy:.4f}')

app3_y_probs = app3_best_model.predict_proba(app3_x_test)[:, 1]  # Probabilities for the positive class

# Calculate ROC-AUC
roc_auc_app3 = roc_auc_score(app3_y_test, app3_y_probs)
print(f"ROC-AUC Score: {roc_auc_app3:.4f}")

Accuracy for pos: 0.8408
Accuracy for neg: 0.8878
ROC-AUC Score: 0.9335




---


#**Approach 4 - Ignoring Rating 3 altogether**

In [4]:
# Load your Excel data into a pandas DataFrame
# Replace 'file_path.xlsx' with the actual file path of your Excel sheet
df = pd.read_csv('Review.csv')

# Select only the 'review' and 'rating' columns
df_filtered = df[['Review', 'Rating']]

val = df_filtered['Rating'].value_counts()[1]

# Get 10,000 instances of reviews with rating 1
df_rating_1 = df_filtered[df_filtered['Rating'] == 1].sample(n=val, random_state=1)

# Get 10,000 random samples for ratings 2, 3, 4, and 5 to balance the dataset
df_rating_2 = df_filtered[df_filtered['Rating'] == 2].sample(n=val, random_state=1)
df_rating_4 = df_filtered[df_filtered['Rating'] == 4].sample(n=val, random_state=1)
df_rating_5 = df_filtered[df_filtered['Rating'] == 5].sample(n=val, random_state=1)

# Combine the samples into a single DataFrame
approach4_df = pd.concat([df_rating_1, df_rating_2, df_rating_4, df_rating_5])

# Relabel the ratings
# 1 and 2 -> neg, 3 -> nil, 4 and 5 -> pos
def relabel_rating(rating):
    if rating in [1, 2]:
        return 'neg'
    elif rating in [4, 5]:
        return 'pos'

# Apply the relabeling function to the 'rating' column
approach4_df['Rating'] = approach4_df['Rating'].apply(relabel_rating)

# Reset the index for neatness
approach4_df.reset_index(drop=True, inplace=True)

# Removing nan values
approach4_df.dropna(inplace=True)

# Display the first few rows of the final DataFrame
print(approach4_df.head())

                                              Review Rating
0  Our 2008 Town & Country shuts off while drivin...    neg
1  I purchased this new in 2012 and paid cash for...    neg
2  Update:  12/28/2019 - GPS/INFOTAINMENT SCREEN ...    neg
3  I thought I was getting a good deal. A mint fu...    neg
4  I have had a rattle in my new VW atlas after t...    neg


In [9]:
app4_x_train, app4_y_train, app4_x_test, app4_y_test = data_split(approach4_df)

# Define the parameter grid for tuning
param_grid_4 = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],   # Unigram, bigram, trigram
    'tfidf__max_features': [5000, 7000, 10000],       # Vocabulary size
    'model__alpha': [0.1, 0.5, 1.0, 5.0]             # Smoothing parameter
}

app4_best_model = MNB_pipeline(param_grid_4, review_cleaner, app4_x_train, app4_y_train)

# Train the final model with the best parameters
app4_best_model.fit(app4_x_train, app4_y_train)  # X_train_sparse is the precomputed training matrix

# Evaluate on the test set
y_test_pred_app4 = app4_best_model.predict(app4_x_test)  # X_test_sparse is the precomputed test matrix
app4_test_accuracy = accuracy_score(app4_y_test, y_test_pred_app4)
app4_test_f1 = f1_score(app4_y_test, y_test_pred_app4, average='weighted')

print(f"Test Set Performance: Accuracy = {app4_test_accuracy:.4f}, F1 Score = {app4_test_f1:.4f}")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 2.0min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 2.0min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 2.1min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 2.1min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 2.1min
[CV] END model__alpha=0.1, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 2.1min
[CV] END model__alpha=0.1, tfidf__max_features=7000, tfidf__ngram_range=(1, 1); total time= 2.0min
[CV] END model__alpha=0.1, tfidf__max_features=7000, tfidf__ngram_range=(1, 1); total time= 1.9min
[CV] END model__alpha=0.1, tfidf__max_features=7000, tfidf__ngram_range=(1, 1); total time= 2.0min
[CV] END model__alpha=0.1, tfidf__max_features=7

In [10]:
class_accuracies = {}

data = confusion_matrix(app4_y_test, y_test_pred_app4)

for i, class_name in enumerate(['pos', 'neg']):
    true_positives = data[i, i]
    false_negatives = np.sum(data[i, :]) - true_positives  # Sum of the row - true positives
    accuracy = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    class_accuracies[class_name] = accuracy

# Print class-wise accuracy
for class_name, accuracy in class_accuracies.items():
    print(f'Accuracy for {class_name}: {accuracy:.4f}')

app4_y_scores = app4_best_model.predict_proba(app4_x_test)[:, 1]  # Probabilities for the positive class

# Calculate ROC-AUC
roc_auc_app4 = roc_auc_score(app4_y_test, app4_y_scores)
print(f"ROC-AUC Score: {roc_auc_app4:.4f}")

Accuracy for pos: 0.9054
Accuracy for neg: 0.9028
ROC-AUC Score: 0.9637
