In [1]:
import pandas as pd

df = pd.read_csv("camera_dataset.tsv", delimiter="\t", on_bad_lines="skip")
print(df.head())

# list the columns
df.columns

# list the unique values for column "product_category"
df["product_category"].unique()

  marketplace  customer_id       review_id  product_id  product_parent  \
0          US      2975964  R1NBG94582SJE2  B00I01JQJM       860486164   
1          US     23526356  R273DCA6Y0H9V7  B00TCO0ZAA       292641483   
2          US     52764145   RQVOXO7WUOFK6  B00B7733E0        75825744   
3          US     47348933  R1KWKSF21PO6HO  B006ZN4U34       789352955   
4          US     33680700  R38H3UO1J190GI  B00HUEBGMU        19067902   

                                       product_title product_category  \
0  GoPro Rechargeable Battery 2.0 (HERO3/HERO3+ o...           Camera   
1  Professional 58mm Center Pinch Lens Cap for CA...           Camera   
2  Spy Tec Z12 Motion Activated Intelligent Secur...           Camera   
3  Celestron UpClose G2 10x25 Monocular, Black (7...           Camera   
4  Vidpro XM-L Wired Lavalier microphone - 20' Au...           Camera   

   star_rating  helpful_votes  total_votes vine verified_purchase  \
0            5              0            0    N

array(['Camera'], dtype=object)

In [2]:
#remove all rows that have a null value in column "product_category" for dataframe df
df_remove_null = df.dropna(subset=["product_category"])

#count the rows for dataframe df
df_remove_null.count()



marketplace          1800845
customer_id          1800845
review_id            1800845
product_id           1800845
product_parent       1800845
product_title        1800845
product_category     1800845
star_rating          1800845
helpful_votes        1800845
total_votes          1800845
vine                 1800845
verified_purchase    1800845
review_headline      1800827
review_body          1800753
review_date          1800836
dtype: int64

In [3]:
# count how many rows in dataframe df have the value "[null]" in the column "product_category"
null_count = df[df['product_category'] == '-'].shape[0]
# Print the result
print(f"Number of rows with '[null]' value: {null_count}")


Number of rows with '[null]' value: 0


In [4]:
# Columns we need: customer_id, product_id, product_title, product_category, star_rating, total_votes, vine, verified_purchase, review_headline, review_body

trimmed_df = df[["customer_id", "product_id", "product_title", "star_rating", "helpful_votes", "total_votes", "vine", "verified_purchase", "review_headline", "review_body"]]

In [5]:
trimmed_df.head(50)

Unnamed: 0,customer_id,product_id,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body
0,2975964,B00I01JQJM,GoPro Rechargeable Battery 2.0 (HERO3/HERO3+ o...,5,0,0,N,Y,Five Stars,ok
1,23526356,B00TCO0ZAA,Professional 58mm Center Pinch Lens Cap for CA...,5,0,0,N,Y,Love it!!!,"Perfect, even sturdier than the original!"
2,52764145,B00B7733E0,Spy Tec Z12 Motion Activated Intelligent Secur...,2,1,1,N,Y,Another Motion Detect Fail,"If the words, &#34;Cheap Chinese Junk&#34; com..."
3,47348933,B006ZN4U34,"Celestron UpClose G2 10x25 Monocular, Black (7...",5,0,0,N,Y,Exactly what I wanted and expected.,Exactly what I wanted and expected. Perfect fo...
4,33680700,B00HUEBGMU,Vidpro XM-L Wired Lavalier microphone - 20' Au...,5,1,1,N,Y,Good mic at a Good Price...Not Canon Though.,I will look past the fact that they tricked me...
5,30301059,B008MW6Y12,NIX 8 inch Hi-Res Digital Photo Frame with Mot...,3,0,0,N,Y,"The controls are pretty slow, and I can't get ...","The controls are pretty slow, and I can't get ..."
6,28282645,B00TE8XKIS,Polaroid ZIP Mobile Printer Parent ASIN,3,8,8,N,N,"Fun toy for making stickers, but expensive to ...",The printer came in a small fairly plain box w...
7,502818,B00ZKDUFBQ,GeekPro 2.0-Inch WIFI HD 1080P 12MP Sports Cam...,5,0,1,N,Y,Five Stars,Great camera for the price.
8,1481233,B010BZ7S2Q,Sony HDR-AZ1VR Action Cam Mini Camcorder 1080p...,5,0,2,N,Y,Five Stars,Product is very good and satisfactory.<br /><b...
9,27885926,B00HRXSSRA,"ChiliPower DMW-BLC12, DMW-BLC12E, DMW-BLC12PP ...",1,0,0,N,Y,Sucky.,"Lasted a few hours with its first charge, whic..."


In [6]:
# count rows with more than 0 helpful_votes
count = (trimmed_df['helpful_votes'] > 0).sum()
print(count)


707547


In [7]:
count = (trimmed_df['total_votes'] > 0).sum()
print(count)


843673


In [8]:
one_star_sample = trimmed_df[trimmed_df['star_rating'] == 1].sample(n=10000)
two_star_sample = trimmed_df[trimmed_df['star_rating'] == 2].sample(n=10000)
three_star_sample = trimmed_df[trimmed_df['star_rating'] == 3].sample(n=10000)
four_star_sample = trimmed_df[trimmed_df['star_rating'] == 4].sample(n=10000)
five_star_sample = trimmed_df[trimmed_df['star_rating'] == 5].sample(n=10000)

random_sample = pd.concat([one_star_sample, two_star_sample, three_star_sample, four_star_sample, five_star_sample], axis=0)
random_sample.head(30)

Unnamed: 0,customer_id,product_id,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body
1144598,13418941,B006I5MKZY,Pogoplug Series 4 Backup Device,1,2,3,N,N,"Corrupts hard drive, not compatible with MAC /...",This product is fraudulently advertised from P...
737669,38981774,B004Z55VP0,Tiffen Variable ND Filter,1,7,9,N,Y,DO NOT BUY THIS PRODUCT IF YOU INTEND TO USE I...,"Having recently bought this item, I have used ..."
1504924,28139906,B00065L5TE,Canon(r) 9764a001aa Cb-2lv Battery Charger,1,2,6,N,Y,BATTERY = 0 CUSTOMER SERVICE = 1,The batter charger for my Cannon camera batter...
134231,23387162,B00EHA7MBO,GladsBuy Spring 10' x 10' Computer Printed Pho...,1,1,1,N,Y,One Star,at the bottom the colors was fadingrop the col...
99685,14059781,B003USS5JC,Millionaccessories® 52mm Hard Tulip Lens Hood ...,1,0,0,N,Y,I got what I paid for!,Mostly this is a piece of s***. This hood uses...
488322,3122631,B00N9WMBOY,SMO Lens diameter 7cm 20M Waterproof Black Cam...,1,0,0,N,Y,The SAMSUNG NX1000 camera does not easily fall...,The SAMSUNG NX1000 camera does not easily fall...
1146013,14108644,B005DPOTXG,Black Nylon Camera/shoulder / Camcorder quick ...,1,11,12,N,Y,So another lesson learned.......,"Ordered this early March 2013, used it almost ..."
1376596,32162656,B00464HT5K,Swann SW351WSA Window Shock Alarm,1,2,3,N,Y,not working,Since i bought the swann wireless security sys...
1451578,30986423,B002VSM7LE,Neewer 52MM 0.45X Wide Angle High Definition L...,1,1,5,N,Y,lens minus adaptor,I saw this lens would fit a Nikon DSLR. I hav...
64719,15340606,B000FJUMWC,Canon Digital Elph Accessory Kit for Canon SD9...,1,0,0,N,Y,i do not recall if i purchased it.,i do not recall if i purchased it.


In [9]:
# random sampler to pull 10,000 rows from trimmed_df

# random_sample = trimmed_df.sample(n=30000)

# count = random_sample[random_sample['star_rating'] == 1].shape[0]
# print(f"Number of rows = 1: {count}")

In [10]:
# def get_random_samples(df, n=5, frac=None, replace=False, random_state=None):
#   return df.sample(n=n, frac=frac, replace=replace, random_state=random_state)

# random_sample2 = get_random_samples(trimmed_df,n=10000)

# count = random_sample2[random_sample2['star_rating'] == 5].shape[0]
# print(f"Number of rows = 5: {count}")

#remove all null values from "review_body" and "review_headline" columns for random sample
random_sample = random_sample.dropna(subset=['review_body', 'review_headline'])
random_sample.columns

Index(['customer_id', 'product_id', 'product_title', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body'],
      dtype='object')

In [11]:
# #GPT TOKENIZER 

# from nltk.corpus import stopwords
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from transformers import GPT2Tokenizer, GPT2Model
# nltk.download('stopwords')
# # Initialize GPT-2 tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# tokenizer.pad_token = tokenizer.eos_token

# random_sample['review_body'] = random_sample['review_body'].str.lower()
# random_sample['review_body'] = random_sample['review_body'].str.replace('[^\w\s]','', regex=True)
# random_sample['review_body'] = random_sample['review_body'].str.replace('\d+', '')

# random_sample['review_headline'] = random_sample['review_headline'].str.lower()
# random_sample['review_headline'] = random_sample['review_headline'].str.replace('[^\w\s]','',regex =True)
# random_sample['review_headline'] = random_sample['review_headline'].str.replace('\d+', '')

# stop_words = set(stopwords.words('english'))
# random_sample['review_body'] = random_sample['review_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
# random_sample['review_headline'] = random_sample['review_headline'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# random_sample['review_body'] = random_sample['review_body'].apply(lambda x: ' '.join([word for word in x.split() if len(word)>2]))
# random_sample['review_headline'] = random_sample['review_headline'].apply(lambda x: ' '.join([word for word in x.split() if len(word)>2]))


# random_sample['review_body_tokens'] = random_sample['review_body'].apply(lambda x: tokenizer.tokenize(x) if isinstance(x, str) else [])
# random_sample['review_headline_tokens'] = random_sample['review_headline'].apply(lambda x: tokenizer.tokenize(x) if isinstance(x, str) else [])


# encoded_reviews = random_sample['review_body'].apply(lambda x: tokenizer.encode_plus(
#     x,
#     add_special_tokens=True,
#     max_length=1024,  # GPT-2 can handle longer sequences
#     padding='max_length',
#     truncation=True,
#     return_attention_mask=True,
#     return_tensors='pt'
# ))

# encoded_headlines = random_sample['review_headline'].apply(lambda x: tokenizer.encode_plus(
#     x,
#     add_special_tokens=True,
#     max_length=128,
#     padding='max_length',
#     truncation=True,
#     return_attention_mask=True,
#     return_tensors='pt'
# ))

# random_sample['review_input_ids'] = encoded_reviews.apply(lambda x: x['input_ids'].squeeze())
# random_sample['review_attention_mask'] = encoded_reviews.apply(lambda x: x['attention_mask'].squeeze())

# random_sample['headline_input_ids'] = encoded_headlines.apply(lambda x: x['input_ids'].squeeze())
# random_sample['headline_attention_mask'] = encoded_headlines.apply(lambda x: x['attention_mask'].squeeze())



In [12]:
# for TDIDF Tokenizer
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

# Text preprocessing steps remain the same
random_sample['review_body'] = random_sample['review_body'].str.lower()
random_sample['review_body'] = random_sample['review_body'].str.replace('[^\w\s]','', regex=True)
random_sample['review_body'] = random_sample['review_body'].str.replace('\d+', '')

random_sample['review_headline'] = random_sample['review_headline'].str.lower()
random_sample['review_headline'] = random_sample['review_headline'].str.replace('[^\w\s]','',regex =True)
random_sample['review_headline'] = random_sample['review_headline'].str.replace('\d+', '')

# Remove stopwords
stop_words = set(stopwords.words('english'))
random_sample['review_body'] = random_sample['review_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
random_sample['review_headline'] = random_sample['review_headline'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# Remove short words
random_sample['review_body'] = random_sample['review_body'].apply(lambda x: ' '.join([word for word in x.split() if len(word)>2]))
random_sample['review_headline'] = random_sample['review_headline'].apply(lambda x: ' '.join([word for word in x.split() if len(word)>2]))

# Initialize TF-IDF vectorizers
tfidf_vectorizer_body = TfidfVectorizer(max_features=5000)  # adjust max_features as needed
tfidf_vectorizer_headline = TfidfVectorizer(max_features=1000)  # adjust max_features as needed

# Transform the text to TF-IDF matrices
X_review_body = tfidf_vectorizer_body.fit_transform(random_sample['review_body'])
X_review_headline = tfidf_vectorizer_headline.fit_transform(random_sample['review_headline'])

# If you need the feature names (words)
review_body_features = tfidf_vectorizer_body.get_feature_names_out()
review_headline_features = tfidf_vectorizer_headline.get_feature_names_out()

# The resulting X_review_body and X_review_headline will be sparse matrices
# You can convert them to dense arrays if needed (but be careful with memory usage):
# X_review_body_dense = X_review_body.toarray()
# X_review_headline_dense = X_review_headline.toarray()


[nltk_data] Downloading package stopwords to C:\Users\Scotty
[nltk_data]     Horvath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def map_stars_to_sentiment(star_rating):
    if star_rating == 1:
        return 'terrible'
    elif star_rating == 2:
        return 'bad'
    elif star_rating == 3:
        return 'ok'
    elif star_rating == 4:
        return 'good'
    elif star_rating == 5:
        return 'great'
    else:
        return 'unknown'

random_sample['sentiment'] = random_sample['star_rating'].apply(map_stars_to_sentiment)

sentiment_mapping = {
    'terrible': 0,
    'bad': 1,
    'ok': 2,
    'good': 3,
    'great': 4
}

random_sample['sentiment_label'] = random_sample['sentiment'].map(sentiment_mapping)

sentiment_dummies = pd.get_dummies(random_sample['sentiment'], prefix='sentiment')
random_sample = pd.concat([random_sample, sentiment_dummies], axis=1)

print("\nSentiment Distribution:")
print(random_sample['sentiment'].value_counts())

print("\nSentiment Distribution (%):")
print(random_sample['sentiment'].value_counts(normalize=True) * 100)



Sentiment Distribution:
sentiment
terrible    10000
great       10000
bad          9999
good         9999
ok           9998
Name: count, dtype: int64

Sentiment Distribution (%):
sentiment
terrible    20.0016
great       20.0016
bad         19.9996
good        19.9996
ok          19.9976
Name: proportion, dtype: float64


In [14]:
#FOR GPT TOKENIZER

# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, accuracy_score
# import numpy as np

# # Function to pad sequences to the same length
# def pad_sequences(sequences, max_length=None):
#     if max_length is None:
#         max_length = max(len(seq) for seq in sequences)
    
#     padded_sequences = []
#     for seq in sequences:
#         if len(seq) > max_length:
#             padded_seq = seq[:max_length]
#         else:
#             padding = np.zeros(max_length - len(seq))
#             padded_seq = np.concatenate([seq, padding])
#         padded_sequences.append(padded_seq)
    
#     return np.array(padded_sequences)
# # Convert review body input_ids from tensors to numpy arrays
# review_body_arrays = random_sample['review_input_ids'].apply(
#     lambda x: x.detach().cpu().numpy() if hasattr(x, 'numpy') else x.numpy()
# )
# X_review_body = pad_sequences([arr for arr in review_body_arrays.values])

# # Convert review headline input_ids from tensors to numpy arrays
# review_headline_arrays = random_sample['headline_input_ids'].apply(
#     lambda x: x.detach().cpu().numpy() if hasattr(x, 'numpy') else x.numpy()
# )
# X_review_headline = pad_sequences([arr for arr in review_headline_arrays.values])

# # Get sentiment labels
# y = random_sample['sentiment_label']

# # Split data for review body
# X_train_body, X_test_body, y_train_body, y_test_body = train_test_split(
#     X_review_body, 
#     y, 
#     test_size=0.2, 
#     random_state=42
# )

# # Split data for headlines
# X_train_headline, X_test_headline, y_train_headline, y_test_headline = train_test_split(
#     X_review_headline, 
#     y, 
#     test_size=0.2, 
#     random_state=42
# )

# # Initialize models
# logistic_regression_body = LogisticRegression(class_weight="balanced",max_iter=10000, random_state=42)
# random_forest_body = RandomForestClassifier(class_weight="balanced",n_estimators=1024, random_state=42)

# logistic_regression_headline = LogisticRegression(class_weight="balanced",max_iter=10000, random_state=42)
# random_forest_headline = RandomForestClassifier(class_weight="balanced",n_estimators=1024, random_state=42)

# # Train models for review body
# logistic_regression_body.fit(X_train_body, y_train_body)
# random_forest_body.fit(X_train_body, y_train_body)

# # Train models for headlines
# logistic_regression_headline.fit(X_train_headline, y_train_headline)
# random_forest_headline.fit(X_train_headline, y_train_headline)

# # Make predictions for review body
# lr_predictions_body = logistic_regression_body.predict(X_test_body)
# rf_predictions_body = random_forest_body.predict(X_test_body)

# # Make predictions for headlines
# lr_predictions_headline = logistic_regression_headline.predict(X_test_headline)
# rf_predictions_headline = random_forest_headline.predict(X_test_headline)

# # Calculate accuracies for review body
# lr_accuracy_body = accuracy_score(y_test_body, lr_predictions_body)
# rf_accuracy_body = accuracy_score(y_test_body, rf_predictions_body)

# # Calculate accuracies for headlines
# lr_accuracy_headline = accuracy_score(y_test_headline, lr_predictions_headline)
# rf_accuracy_headline = accuracy_score(y_test_headline, rf_predictions_headline)

# # Print classification reports
# print("Review Body - Logistic Regression:")
# print(classification_report(y_test_body, lr_predictions_body))
# print("\nReview Body - Random Forest:")
# print(classification_report(y_test_body, rf_predictions_body))

# print("\nHeadline - Logistic Regression:")
# print(classification_report(y_test_headline, lr_predictions_headline))
# print("\nHeadline - Random Forest:")
# print(classification_report(y_test_headline, rf_predictions_headline))

# # Select best models based on accuracy
# if lr_accuracy_body > rf_accuracy_body:
#     best_body_model = logistic_regression_body
# else:
#     best_body_model = random_forest_body

# if lr_accuracy_headline > rf_accuracy_headline:
#     best_headline_model = logistic_regression_headline
# else:
#     best_headline_model = random_forest_headline

# def predict_sentiment(text, model, max_length=None):
#     encoded = tokenizer.encode_plus(
#         text,
#         add_special_tokens=True,
#         max_length=1024,
#         padding='max_length',
#         truncation=True,
#         return_tensors='pt'
#     )
    
#     input_ids = encoded['input_ids'].numpy().squeeze()
#     # Pad the input to match training data
#     if max_length is not None:
#         input_ids = pad_sequences([input_ids], max_length)[0]
    
#     prediction = model.predict([input_ids])[0]
    
#     sentiment_map = {
#         0: 'terrible',
#         1: 'bad',
#         2: 'ok',
#         3: 'good',
#         4: 'great'
#     }
#     return sentiment_map[prediction]

# # Example predictions
# sample_review = "This product exceeded my expectations"
# sample_headline = "Amazing purchase"

# # Get the lengths used for training data
# body_length = X_review_body.shape[1]
# headline_length = X_review_headline.shape[1]

# review_prediction = predict_sentiment(sample_review, best_body_model, body_length)
# headline_prediction = predict_sentiment(sample_headline, best_headline_model, headline_length)

# print(f"\nSample Review: '{sample_review}'")
# print(f"Predicted Sentiment: {review_prediction}")

# print(f"\nSample Headline: '{sample_headline}'")
# print(f"Predicted Sentiment: {headline_prediction}")



In [16]:
# for TDIDF TOKENIZER

# for TDIDF TOKENIZER
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import numpy as np

# Create TF-IDF vectorizers
tfidf_vectorizer_body = TfidfVectorizer(lowercase=True, max_features=5000)
tfidf_vectorizer_headline = TfidfVectorizer(lowercase=True, max_features=5000)

# Transform the text data
X_review_body = tfidf_vectorizer_body.fit_transform(random_sample['review_body'])
X_review_headline = tfidf_vectorizer_headline.fit_transform(random_sample['review_headline'])

# Get sentiment labels
y = random_sample['sentiment_label']

# Split data for review body
X_train_body, X_test_body, y_train_body, y_test_body = train_test_split(
    X_review_body, 
    y, 
    test_size=0.2, 
    random_state=42
)

# Split data for headlines
X_train_headline, X_test_headline, y_train_headline, y_test_headline = train_test_split(
    X_review_headline, 
    y, 
    test_size=0.2, 
    random_state=42
)

# Initialize models
logistic_regression_body = LogisticRegression(class_weight="balanced", max_iter=10000, random_state=42)
random_forest_body = RandomForestClassifier(class_weight="balanced", n_estimators=1024, random_state=42)
xgboost_body = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    scale_pos_weight=1  # for imbalanced datasets
)

logistic_regression_headline = LogisticRegression(class_weight="balanced", max_iter=10000, random_state=42)
random_forest_headline = RandomForestClassifier(class_weight="balanced", n_estimators=1024, random_state=42)
xgboost_headline = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    scale_pos_weight=1
)

# Train models for review body
logistic_regression_body.fit(X_train_body, y_train_body)
random_forest_body.fit(X_train_body, y_train_body)
xgboost_body.fit(X_train_body, y_train_body)

# Train models for headlines
logistic_regression_headline.fit(X_train_headline, y_train_headline)
random_forest_headline.fit(X_train_headline, y_train_headline)
xgboost_headline.fit(X_train_headline, y_train_headline)

# Make predictions for review body
lr_predictions_body = logistic_regression_body.predict(X_test_body)
rf_predictions_body = random_forest_body.predict(X_test_body)
xgb_predictions_body = xgboost_body.predict(X_test_body)

# Make predictions for headlines
lr_predictions_headline = logistic_regression_headline.predict(X_test_headline)
rf_predictions_headline = random_forest_headline.predict(X_test_headline)
xgb_predictions_headline = xgboost_headline.predict(X_test_headline)

# Calculate accuracies for review body
lr_accuracy_body = accuracy_score(y_test_body, lr_predictions_body)
rf_accuracy_body = accuracy_score(y_test_body, rf_predictions_body)
xgb_accuracy_body = accuracy_score(y_test_body, xgb_predictions_body)

# Calculate accuracies for headlines
lr_accuracy_headline = accuracy_score(y_test_headline, lr_predictions_headline)
rf_accuracy_headline = accuracy_score(y_test_headline, rf_predictions_headline)
xgb_accuracy_headline = accuracy_score(y_test_headline, xgb_predictions_headline)

# Print classification reports
print("Review Body - Logistic Regression:")
print(classification_report(y_test_body, lr_predictions_body))
print("\nReview Body - Random Forest:")
print(classification_report(y_test_body, rf_predictions_body))
print("\nReview Body - XGBoost:")
print(classification_report(y_test_body, xgb_predictions_body))

print("\nHeadline - Logistic Regression:")
print(classification_report(y_test_headline, lr_predictions_headline))
print("\nHeadline - Random Forest:")
print(classification_report(y_test_headline, rf_predictions_headline))
print("\nHeadline - XGBoost:")
print(classification_report(y_test_headline, xgb_predictions_headline))

# Select best models based on accuracy for body
body_accuracies = {
    'logistic_regression': lr_accuracy_body,
    'random_forest': rf_accuracy_body,
    'xgboost': xgb_accuracy_body
}
best_body_model_name = max(body_accuracies, key=body_accuracies.get)
if best_body_model_name == 'logistic_regression':
    best_body_model = logistic_regression_body
elif best_body_model_name == 'random_forest':
    best_body_model = random_forest_body
else:
    best_body_model = xgboost_body

# Select best models based on accuracy for headline
headline_accuracies = {
    'logistic_regression': lr_accuracy_headline,
    'random_forest': rf_accuracy_headline,
    'xgboost': xgb_accuracy_headline
}
best_headline_model_name = max(headline_accuracies, key=headline_accuracies.get)
if best_headline_model_name == 'logistic_regression':
    best_headline_model = logistic_regression_headline
elif best_headline_model_name == 'random_forest':
    best_headline_model = random_forest_headline
else:
    best_headline_model = xgboost_headline

# Print best models
print(f"\nBest model for review body: {best_body_model_name}")
print(f"Best model for headlines: {best_headline_model_name}")


Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



Review Body - Logistic Regression:
              precision    recall  f1-score   support

           0       0.58      0.65      0.61      2041
           1       0.39      0.36      0.38      1992
           2       0.40      0.36      0.38      2048
           3       0.45      0.43      0.44      1981
           4       0.58      0.63      0.60      1938

    accuracy                           0.49     10000
   macro avg       0.48      0.49      0.48     10000
weighted avg       0.48      0.49      0.48     10000


Review Body - Random Forest:
              precision    recall  f1-score   support

           0       0.51      0.71      0.59      2041
           1       0.38      0.29      0.33      1992
           2       0.40      0.29      0.34      2048
           3       0.43      0.41      0.42      1981
           4       0.53      0.62      0.57      1938

    accuracy                           0.46     10000
   macro avg       0.45      0.46      0.45     10000
weighted avg