In [1]:
import pandas as pd
import torch
import numpy as np
import os
import re
from transformers import BertTokenizer, BertModel, BertForMaskedLM, pipeline
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import OneHotEncoder
from transformers import DataCollatorForLanguageModeling
import joblib
import pickle
from sklearn.metrics import silhouette_score
from torch import nn
from datasets import Dataset
from transformers import Trainer, TrainingArguments

# Set directories for saving models and loading datasets
model_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Machine Learning models/Trained Models/Version7.7'
data_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Data/MINDlarge_dev/Cleaned Dataset/News_cleaned.csv'
output_data_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Data/MINDlarge_dev/Cleaned Dataset'
output_data_path = os.path.join(output_data_dir, 'News_Features_Engineered_Dataset.parquet')
embedding_output_path = os.path.join(output_data_dir, 'news_embeddings_BERT_and_Kmeans.npy')

# Ensure model directory exists
os.makedirs(model_dir, exist_ok=True)

# Allowing for parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Set the device to MPS if available, otherwise use CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Helper function to standardize News ID format
def standardize_news_id(nid):
    """Standardize News ID by removing extra spaces."""
    return re.sub(r"\s+", "", nid) if isinstance(nid, str) else nid

# Load the dataset
print("Loading dataset...")
data = pd.read_csv(data_path)
print("Dataset loaded.")

# Standardize News ID
print("Standardizing News IDs...")
data['News ID'] = data['News ID'].apply(standardize_news_id)

# Combine text columns into a single representation for each article
print("Combining text columns...")
data["text"] = (
    data["Category"] + " " +
    data["Subcategory"] + " " +
    data["Title"] + " " +
    data["Abstract"]
)

# Fine-Tune BERT for MLM
def fine_tune_bert_for_mlm(data, model_dir):
    """
    Fine-tune BERT for Masked Language Modeling (MLM).
    Args:
        data (pd.DataFrame): Input data containing a "text" column.
        model_dir (str): Directory to save tokenizer and model.
    """
    print("Step 1: Fine-tuning BERT for Masked Language Modeling...")
    
    # Convert dataset to Hugging Face Dataset format
    print("Converting dataset...")
    dataset = Dataset.from_pandas(data[["text"]])

    # Load or train tokenizer
    tokenizer_path = os.path.join(model_dir, "bert_tokenizer")
    if os.path.exists(tokenizer_path):
        print("Loading existing BERT tokenizer...")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    else:
        print("Training new BERT tokenizer...")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        tokenizer.save_pretrained(tokenizer_path)
        print(f"BERT tokenizer trained and saved to {tokenizer_path}")

    # Tokenize dataset
    print("Tokenizing dataset...")
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    print("Tokenization completed.")

    # Data collator for MLM
    print("Initializing data collator for MLM...")
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    # Load or use existing model
    model_path = os.path.join(model_dir, "bert_mlm_model")
    if os.path.exists(model_path):
        print("Loading existing BERT model for Masked Language Modeling...")
        model = BertForMaskedLM.from_pretrained(model_path)
    else:
        print("Training new BERT model for Masked Language Modeling...")
        model = BertForMaskedLM.from_pretrained("bert-base-uncased")

        # Training arguments
        print("Setting up training arguments...")
        training_args = TrainingArguments(
            output_dir=model_dir,
            eval_strategy="no",
            learning_rate=5e-5,
            per_device_train_batch_size=16,
            num_train_epochs=3,
            save_steps=10_000,
            save_total_limit=2,
            logging_dir="./logs",
            logging_steps=500
        )

        # Trainer
        print("Initializing Trainer...")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer
        )

        # Train the model
        print("Starting training...")
        trainer.train()
        print("Saving trained BERT model...")
        model.save_pretrained(model_path)
        print(f"BERT model trained and saved to {model_path}")

    print("BERT fine-tuning completed successfully.")
    
fine_tune_bert_for_mlm(data, model_dir)

# Generate Embeddings
def generate_embeddings(data, model_dir, output_data_dir, device, output_dim=768):
    """
    Generate 768-dimensional embeddings for text using a fine-tuned BERT model.
    Args:
        data (pd.DataFrame): Input data containing a "text" column.
        model_dir (str): Directory containing the fine-tuned model.
        output_data_dir (str): Directory to save the generated embeddings.
        device (torch.device): Device to run the model on.
        output_dim (int): Dimension of the output embeddings.
    """
    print("Step 2: Generating embeddings...")
    
    # Load BERT model and tokenizer
    bert_model_path = os.path.join(model_dir, "bert_mlm_model")
    print(f"Loading BERT model from {bert_model_path}...")
    bert_model = BertModel.from_pretrained(bert_model_path).to(device)
    print("BERT model loaded successfully.")

    tokenizer_path = os.path.join(model_dir, "bert_tokenizer")
    print(f"Loading tokenizer from {tokenizer_path}...")
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    print("Tokenizer loaded successfully.")

    # Batch processing for embedding generation
    def generate_embeddings_batch(texts, model, tokenizer, device, batch_size=32):
        model.eval()
        embeddings = []
        print("Generating embeddings in batches...")
        with torch.no_grad():
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i + batch_size]
                inputs = tokenizer(
                    batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512
                ).to(device)
                outputs = model(**inputs)
                embeddings.append(outputs.pooler_output.cpu().numpy())
                print(f"Processed batch {i // batch_size + 1} of {len(texts) // batch_size + 1}")
        return np.vstack(embeddings)

    print("Generating embeddings for the dataset...")
    embeddings = generate_embeddings_batch(data["text"].tolist(), bert_model, tokenizer, device)
    print("Embeddings generation completed.")

    # Save embeddings
    print("Saving embeddings...")
    data["BERT-Embeddings"] = embeddings.tolist()
    np.save(os.path.join(output_data_dir, "news_embeddings.npy"), embeddings)
    data[["News ID", "BERT-Embeddings"]].to_parquet(os.path.join(output_data_dir, "news_BERT_Only_embeddings.parquet"), index=False)
    print(f"Embeddings saved to {output_data_dir} successfully.")

generate_embeddings(data, model_dir, output_data_dir, device, output_dim=768)

# Step 3: Sentiment and Emotion Analysis
sentiment_model_dir = os.path.join(model_dir, "sentiment_model")
if not os.path.exists(sentiment_model_dir):
    print("Training and saving sentiment model...")
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
        revision="714eb0f",
        device=0 if torch.backends.mps.is_available() else -1
    )
    sentiment_pipeline.model.save_pretrained(sentiment_model_dir)
    sentiment_pipeline.tokenizer.save_pretrained(sentiment_model_dir)
    print("Sentiment model trained and saved.")
else:
    print("Loading existing sentiment model...")
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=sentiment_model_dir,
        device=0 if torch.backends.mps.is_available() else -1
    )

data['Sentiment'] = data.apply(
    lambda row: sentiment_pipeline(row['Title'][:512] + " " + row['Abstract'][:512])[0]['label'], axis=1
)

emotion_model_dir = os.path.join(model_dir, "emotion_model")
if not os.path.exists(emotion_model_dir):
    print("Training and saving emotion model...")
    emotion_pipeline = pipeline(
        "text-classification",
        model="bhadresh-savani/distilbert-base-uncased-emotion",
        device=0 if torch.backends.mps.is_available() else -1
    )
    emotion_pipeline.model.save_pretrained(emotion_model_dir)
    emotion_pipeline.tokenizer.save_pretrained(emotion_model_dir)
    print("Emotion model trained and saved.")
else:
    print("Loading existing emotion model...")
    emotion_pipeline = pipeline(
        "text-classification",
        model=emotion_model_dir,
        device=0 if torch.backends.mps.is_available() else -1
    )

data['Emotion'] = data.apply(
    lambda row: emotion_pipeline(row['Title'][:512] + " " + row['Abstract'][:512])[0]['label'], axis=1
)

# Step 4: One-Hot Encoding and K-means Clustering
encoder_path = os.path.join(model_dir, 'one_hot_encoder.pkl')

# Combine features as they are
combined_features = np.column_stack((
    data['Category'].astype(str), 
    data['Subcategory'].astype(str), 
    data['Sentiment'].astype(str), 
    data['Emotion'].astype(str)
))

# Train or load the one-hot encoder
if os.path.exists(encoder_path):
    print("Loading existing OneHotEncoder...")
    encoder = joblib.load(encoder_path)
else:
    print("Training new OneHotEncoder...")
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(combined_features)
    joblib.dump(encoder, encoder_path)
    print(f"OneHotEncoder trained and saved to {encoder_path}")

# Apply one-hot encoding
print("Applying one-hot encoding to features...")
one_hot_encoded_features = encoder.transform(combined_features)

# Proceed with K-means clustering
kmeans_path = os.path.join(model_dir, 'kmeans_model.pkl')
if os.path.exists(kmeans_path):
    print("Loading existing K-means model...")
    kmeans = joblib.load(kmeans_path)
else:
    print("Training K-means model...")
    max_k = 100
    best_k = 2
    best_score = -1
    for k in range(2, max_k + 1):
        kmeans_candidate = MiniBatchKMeans(n_clusters=k, batch_size=1000, random_state=0)
        cluster_labels = kmeans_candidate.fit_predict(one_hot_encoded_features)
        score = silhouette_score(one_hot_encoded_features, cluster_labels)
        if score > best_score:
            best_k = k
            best_score = score
    kmeans = MiniBatchKMeans(n_clusters=best_k, batch_size=1000, random_state=0)
    kmeans.fit(one_hot_encoded_features)
    joblib.dump(kmeans, kmeans_path)
    print("K-means model trained and saved.")

# Assign clusters to data
data['K-means Clusters'] = kmeans.predict(one_hot_encoded_features)
centroids = kmeans.cluster_centers_

# Add K-means embeddings to data
data['K-means Embeddings'] = data['K-means Clusters'].apply(lambda x: centroids[x].tolist())

# Save final data and embeddings
data[['News ID', 'BERT-Embeddings', 'Sentiment', 'Emotion', 'K-means Clusters', 'K-means Embeddings']].to_parquet(output_data_path, index=False)
news_embeddings = {
    "News ID": data['News ID'].tolist(),
    "BERT-Embeddings": np.array(data['BERT-Embeddings'].tolist()),
    "K-means Embeddings": np.array(data['K-means Embeddings'].tolist())
}
np.save(embedding_output_path, news_embeddings)

print(f"Data and embeddings saved to {output_data_dir}")

Loading dataset...
Dataset loaded.
Standardizing News IDs...
Combining text columns...
Step 1: Fine-tuning BERT for Masked Language Modeling...
Converting dataset...
Loading existing BERT tokenizer...
Tokenizing dataset...


Map:   0%|          | 0/68395 [00:00<?, ? examples/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Tokenization completed.
Initializing data collator for MLM...
Loading existing BERT model for Masked Language Modeling...


Some weights of BertModel were not initialized from the model checkpoint at /Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Machine Learning models/Trained Models/Version7.7/bert_mlm_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT fine-tuning completed successfully.
Step 2: Generating embeddings...
Loading BERT model from /Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Machine Learning models/Trained Models/Version7.7/bert_mlm_model...
BERT model loaded successfully.
Loading tokenizer from /Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Machine Learning models/Trained Models/Version7.7/bert_tokenizer...
Tokenizer loaded successfully.
Generating embeddings for the dataset...
Generating embeddings in batches...
Processed batch 1 of 2138
Processed batch 2 of 2138
Processed batch 3 of 2138
Processed batch 4 of 2138
Processed batch 5 of 2138
Processed batch 6 of 2138
Processed batch 7 of 2138
Processed batch 8 of 2138
Processed batch 9 of 2138
Processed batch 10 of 2138
Processed batch 11 of 2138
Processed batch 12 of 2138
Processed batch 13 of 2138
Processed batch 14 of 2138
Processed batch 15 of 2138
Processed batch 16 of 2138
Processed batch 17 of 2138
Process

In [2]:
import pandas as pd
import re

# Load the behavior dataset
behavior_data_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Data/MINDlarge_dev/Cleaned Dataset/cleaned_behavior_dataset.csv'
behavior_data = pd.read_csv(behavior_data_path)

# Columns to retain
columns_to_keep = ['User ID', 'Displayed News List', 'Clicked News IDs', 'Not-Clicked News IDs']

# Drop all columns except the ones specified
behavior_data = behavior_data[columns_to_keep]

# Check and display the data types of each column before processing
print("Data types before conversion:\n", behavior_data.dtypes)

# Function to convert to comma-separated and handle concatenated news IDs
def convert_to_comma_separated(value):
    if pd.isna(value):
        return ""  # Handle NaN values
    # Remove brackets, quotes, spaces, and any unwanted characters
    value = re.sub(r"[\[\]'\" ]", "", value)
    # Add comma between concatenated IDs using regex to find patterns like 'N[digits]N[digits]'
    value = re.sub(r"(N\d+)(?=N\d+)", r"\1,", value)
    return value

# Apply conversion to 'Displayed News List', 'Clicked News IDs', and 'Not-Clicked News IDs'
behavior_data['Displayed News List'] = behavior_data['Displayed News List'].apply(convert_to_comma_separated)
behavior_data['Clicked News IDs'] = behavior_data['Clicked News IDs'].apply(convert_to_comma_separated)
behavior_data['Not-Clicked News IDs'] = behavior_data['Not-Clicked News IDs'].apply(convert_to_comma_separated)

# Validation function to confirm no concatenated IDs remain
def check_concatenated_ids(column):
    pattern = r"N\d+N\d+"  # Pattern to detect concatenated IDs
    concatenated_ids = behavior_data[column].str.contains(pattern, regex=True).sum()
    return concatenated_ids == 0

# Check for concatenated IDs in all relevant columns
columns_to_check = ['Displayed News List', 'Clicked News IDs', 'Not-Clicked News IDs']
for col in columns_to_check:
    if check_concatenated_ids(col):
        print(f"No concatenated IDs found in '{col}'.")
    else:
        print(f"Concatenated IDs detected in '{col}' after processing.")

# Ensure 'Clicked News IDs' and 'Not-Clicked News IDs' have the same data type as 'Displayed News List'
displayed_news_type = behavior_data['Displayed News List'].dtype
behavior_data['Clicked News IDs'] = behavior_data['Clicked News IDs'].astype(displayed_news_type)
behavior_data['Not-Clicked News IDs'] = behavior_data['Not-Clicked News IDs'].astype(displayed_news_type)

# Check and display the data types after conversion to confirm changes
print("Data types after conversion:\n", behavior_data.dtypes)

# Display the first few rows to confirm the dataset structure
print("Processed Behavior Dataset:\n", behavior_data.head(10))

# Save the processed dataset if needed
output_behavior_data_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Data/MINDlarge_dev/Cleaned Dataset/processed_behavior_dataset.csv'
behavior_data.to_csv(output_behavior_data_path, index=False)

Data types before conversion:
 User ID                 object
Displayed News List     object
Clicked News IDs        object
Not-Clicked News IDs    object
dtype: object
No concatenated IDs found in 'Displayed News List'.
No concatenated IDs found in 'Clicked News IDs'.
No concatenated IDs found in 'Not-Clicked News IDs'.
Data types after conversion:
 User ID                 object
Displayed News List     object
Clicked News IDs        object
Not-Clicked News IDs    object
dtype: object
Processed Behavior Dataset:
    User ID                                Displayed News List  \
0  U134050  N12246,N128820,N119226,N4065,N67770,N33446,N10...   
1  U254959  N34011,N9375,N67397,N7936,N118985,N109453,N103...   
2  U499841  N63858,N26834,N6379,N85484,N15229,N65119,N1047...   
3  U107107  N12959,N8085,N18389,N3758,N9740,N90543,N129790...   
4  U492344  N109183,N48453,N85005,N45706,N98923,N46069,N35...   
5  U657892                       N66666,N88230,N105366,N67497   
6  U441763         N68325

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.decomposition import PCA
import os
import pickle
from ast import literal_eval
import re
import torch.optim as optim
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from collections import defaultdict


# Directory paths
model_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Machine Learning models/Trained Models/Version7.7'
validation_data_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Data/MINDlarge_dev/Cleaned Dataset/News_Features_Engineered_Dataset.parquet'
behavior_validation_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project Revised/Data/MINDlarge_dev/Cleaned Dataset/processed_behavior_dataset.csv'
user_profiles_path = os.path.join(model_dir, 'user_profiles.pkl')
model_checkpoint_path = os.path.join(model_dir, 'best_model_epoch_13.pt')

# Device setup
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load user profiles
print("Loading saved user profiles...")
with open(user_profiles_path, 'rb') as f:
    user_profiles = pickle.load(f)

# Generate default profile for unseen users
print("Calculating default profile for unseen users...")
all_preferences = [profile['preference_profile'] for profile in user_profiles.values()]
all_non_preferences = [profile['non_preference_profile'] for profile in user_profiles.values()]

default_preference = torch.stack(all_preferences).mean(dim=0)
default_non_preference = torch.stack(all_non_preferences).mean(dim=0)
default_user_profile = {'preference_profile': default_preference, 'non_preference_profile': default_non_preference}

# Helper function to standardize News ID format
def standardize_news_id(nid):
    """Standardize News ID by removing extra spaces."""
    return re.sub(r"\s+", "", nid) if isinstance(nid, str) else nid

# Helper function to standardize User ID format
def standardize_user_id(uid):
    """Standardize User ID by removing extra spaces and applying a consistent format."""
    return re.sub(r"\s+", "", str(uid)) if isinstance(uid, (str, int)) else uid

# Step 1: Load news embeddings
print("Step 1: Loading news embeddings.")

def load_parquet_embeddings_with_error_checking(validation_data_path):
    """
    Load embeddings from a Parquet file, validate them, and count any errors during processing.

    Args:
        validation_data_path (str): Path to the Parquet file containing saved embeddings.

    Returns:
        pd.DataFrame: DataFrame containing News ID, BERT-Embeddings, and K-means Embeddings.
    """
    # Initialize error count
    error_count = 0

    # Load the Parquet file
    try:
        embeddings_df = pd.read_parquet(validation_data_path)
    except FileNotFoundError:
        raise FileNotFoundError(f"The file {validation_data_path} does not exist.")
    except Exception as e:
        raise RuntimeError(f"Failed to load Parquet file: {e}")

    # Check for required columns
    required_columns = ["News ID", "BERT-Embeddings", "K-means Embeddings"]
    for column in required_columns:
        if column not in embeddings_df.columns:
            raise KeyError(f"Missing expected column in the Parquet file: {column}")

    # Helper function to validate and convert embeddings to float32
    def validate_and_convert_embedding(embedding, expected_size):
        nonlocal error_count
        try:
            array = np.array(embedding, dtype=np.float32)  # Convert to float32
            if array.size != expected_size:
                raise ValueError(f"Unexpected size: {array.size}")
            return array
        except (ValueError, TypeError):
            error_count += 1
            return np.zeros(expected_size, dtype=np.float32)  # Default to zero array on error

    # Expected sizes
    bert_embeddings = embeddings_df["BERT-Embeddings"].iloc[0]
    kmeans_embeddings = embeddings_df["K-means Embeddings"].iloc[0]
    bert_size = len(bert_embeddings) if isinstance(bert_embeddings, (list, np.ndarray)) else 768
    kmeans_size = len(kmeans_embeddings) if isinstance(kmeans_embeddings, (list, np.ndarray)) else 304

    # Validate embeddings and convert to float32
    embeddings_df["BERT-Embeddings"] = embeddings_df["BERT-Embeddings"].apply(lambda x: validate_and_convert_embedding(x, bert_size))
    embeddings_df["K-means Embeddings"] = embeddings_df["K-means Embeddings"].apply(lambda x: validate_and_convert_embedding(x, kmeans_size))

    # Print the error count
    print(f"Number of errors encountered during embedding validation: {error_count}")
    return embeddings_df

# Load the embeddings
data = load_parquet_embeddings_with_error_checking(validation_data_path)

# Standardize News IDs
data['News ID'] = data['News ID'].apply(standardize_news_id)

# Inspect the DataFrame
print(data.head())

# Concatenate BERT-Embeddings (768 dimensions) and K-means Embeddings (304 dimensions)
bert_embedding_size = len(data['BERT-Embeddings'].iloc[0])  # Infer size from the first embedding
kmeans_embedding_size = len(data['K-means Embeddings'].iloc[0])  # Infer size from the first embedding

# Validate embedding dimensions before concatenation
print(f"BERT Embedding Size: {bert_embedding_size}")
print(f"K-means Embedding Size: {kmeans_embedding_size}")

# Ensure all embeddings are of expected dimensions
assert all(len(embedding) == bert_embedding_size for embedding in data['BERT-Embeddings']), \
    "Inconsistent BERT embedding size detected."
assert all(len(embedding) == kmeans_embedding_size for embedding in data['K-means Embeddings']), \
    "Inconsistent K-means embedding size detected."

def pad_embeddings(matrix, target_size=1088):
    """
    Pad the concatenated embeddings matrix to a fixed size along the second dimension.
    Adds zero-padding to the end of each row if the size is less than the target size (default: 1088).

    Parameters:
    - matrix (np.ndarray): Input concatenated embeddings matrix.
    - target_size (int, optional): Desired size along the second dimension (default: 1088).

    Returns:
    - np.ndarray: Padded matrix with dimensions [matrix.shape[0], target_size].
    """
    current_size = matrix.shape[1]
    if current_size < target_size:
        # Calculate the padding size
        padding_size = target_size - current_size
        # Apply zero-padding along the second dimension
        padded_matrix = np.pad(matrix, ((0, 0), (0, padding_size)), mode='constant', constant_values=0)
        print(f"Padded embeddings from size {current_size} to {target_size}.")
    else:
        padded_matrix = matrix  # No padding needed if already at or above target size
        print(f"No padding applied. Embeddings already of size {current_size}.")
    return padded_matrix

# Step 2: Concatenate embeddings and news embeddings map generation
print("Step 2: Concatenate embeddings and news embeddings map generation.")
embedding_matrix = np.hstack([
    np.vstack(data['BERT-Embeddings'].values),  # Convert BERT-Embeddings to matrix form
    np.vstack(data['K-means Embeddings'].values)  # Convert K-means Embeddings to matrix form
])

# Pad concatenated embeddings to a fixed size (e.g., 1088 dimensions)
target_size = 1088
padded_embedding_matrix = pad_embeddings(embedding_matrix, target_size)

# Convert to tensors
embedding_tensor = torch.tensor(padded_embedding_matrix, dtype=torch.float32)

# Add the tensor embeddings to the dataset
data['combined_embedding'] = [embedding_tensor[i] for i in range(embedding_tensor.size(0))]

# Create news embeddings map
news_embeddings_map = {row['News ID']: row['combined_embedding'] for _, row in data.iterrows()}

# Print information for verification
print(f"Shape of concatenated embeddings: {embedding_matrix.shape}")
print(f"Shape of padded embeddings: {padded_embedding_matrix.shape}")
print(f"Sample news embeddings map: {list(news_embeddings_map.items())[:5]}")

# Step 3: Load behavior validation data
print("Step 3: Loading behavior validation data...")
behavior_df = pd.read_csv(behavior_validation_path)

# Standardize User IDs
behavior_df['User ID'] = behavior_df['User ID'].apply(standardize_user_id)

# Standardize and process 'Clicked News IDs' and 'Not-Clicked News IDs'
behavior_df['Clicked News IDs'] = behavior_df['Clicked News IDs'].apply(
    lambda x: [standardize_news_id(nid) for nid in x.split(',')] if isinstance(x, str) else []
)
behavior_df['Not-Clicked News IDs'] = behavior_df['Not-Clicked News IDs'].apply(
    lambda x: [standardize_news_id(nid) for nid in x.split(',')] if isinstance(x, str) else []
)

# Drop rows where 'Clicked News IDs' or 'Not-Clicked News IDs' are empty
behavior_df = behavior_df[
    (behavior_df['Clicked News IDs'].map(len) > 0) & (behavior_df['Not-Clicked News IDs'].map(len) > 0)
]

# Combine 'Clicked News IDs' and 'Not-Clicked News IDs' into a randomly ordered 'Displayed News List',
behavior_df['Displayed News List'] = behavior_df.apply(
    lambda row: np.random.permutation(
        row['Clicked News IDs'] + row['Not-Clicked News IDs']
    ).tolist(),
    axis=1
)

# Standardize 'Displayed News List'
behavior_df['Displayed News List'] = behavior_df['Displayed News List'].apply(
    lambda x: [standardize_news_id(nid) for nid in x] if isinstance(x, list) else []
)

# Set display options to avoid truncation
pd.set_option("display.max_colwidth", None)  # Prevent truncation of column content
pd.set_option("display.max_rows", None)  # Prevent truncation of rows

# Print first 5 rows for verification
print("First 5 rows after processing:")
print(behavior_df[['User ID', 'Clicked News IDs', 'Not-Clicked News IDs', 'Displayed News List']].head())

# Print size of validation data after filtering
print(f"Validation data size after filtering: {len(behavior_df)} rows")

Using device: mps
Loading saved user profiles...
Calculating default profile for unseen users...
Step 1: Loading news embeddings.
Number of errors encountered during embedding validation: 0
  News ID                                    BERT-Embeddings Sentiment  \
0  N88753  [-0.03282312, -0.12860948, 0.24515766, -0.2115...  POSITIVE   
1  N23144  [0.15914527, -0.22159758, -0.027823929, -0.175...  NEGATIVE   
2  N93187  [-0.11768901, -0.13728267, 0.26656207, -0.3962...  NEGATIVE   
3  N75236  [0.18441504, -0.23967472, 0.03682767, -0.11220...  NEGATIVE   
4  N99744  [0.13281766, -0.33782175, 0.054901987, 0.10067...  NEGATIVE   

   Emotion  K-means Clusters  \
0    anger                10   
1      joy                66   
2     fear                37   
3  sadness                72   
4      joy                66   

                                  K-means Embeddings  
0  [0.05628373, 0.00539707, 0.07555898, 0.0485736...  
1  [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...  
2  [0.0

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from collections import defaultdict
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Hyperparameters
BATCH_SIZE = 10000  # Dynamically adjustable

# Step 3: Load evaluation matrix using cosine similarity
print("Using cosine similarity for evaluation...")

def calculate_cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.
    """
    return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]

def evaluate_predictions(predicted_not_clicked, actual_not_clicked):
    """
    Evaluate predictions for not-clicked news IDs only.
    Handles edge cases to avoid NaN results.
    """
    # Ensure inputs are valid lists
    predicted_not_clicked = predicted_not_clicked or []
    actual_not_clicked = actual_not_clicked or []

    # Skip evaluation if both lists are empty
    if not predicted_not_clicked and not actual_not_clicked:
        return {"Precision": 0.0, "Recall": 0.0, "F1 Score": 0.0, "Accuracy": 0.0}

    # Generate true and predicted labels
    y_true = [1 if pred in actual_not_clicked else 0 for pred in predicted_not_clicked]
    y_pred = [1] * len(predicted_not_clicked)  # Predicted as not-clicked

    # Calculate metrics safely
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    accuracy = accuracy_score(y_true, y_pred) if y_true and y_pred else 0.0

    return {"Precision": precision, "Recall": recall, "F1 Score": f1, "Accuracy": accuracy}

def parallel_metric_calculation(metrics_list):
    """
    Calculate average metrics across all batches.
    """
    avg_metrics = defaultdict(float)
    total = len(metrics_list)
    if total == 0:
        return {"Precision": 0.0, "Recall": 0.0, "F1 Score": 0.0, "Accuracy": 0.0}
    for metrics in metrics_list:
        for key, value in metrics.items():
            avg_metrics[key] += value
    for key in avg_metrics.keys():
        avg_metrics[key] /= total
    return avg_metrics

# Step 4: Validation loop
print("Starting validation...")
all_metrics = []
news_ids_found = 0
news_ids_not_found = 0

# Check for missing values in the behavior dataframe
if behavior_df.isnull().any().any():
    raise ValueError("Behavior dataset contains missing values. Please clean the dataset before proceeding.")

for batch_start in tqdm(range(0, len(behavior_df), BATCH_SIZE)):
    batch_df = behavior_df.iloc[batch_start:batch_start + BATCH_SIZE]

    # Skip empty batches
    if len(batch_df) == 0:
        continue

    for _, row in batch_df.iterrows():
        user_id = row['User ID']
        displayed_news_ids = row['Displayed News List']
        not_clicked_news_ids = row['Not-Clicked News IDs']

        # Ensure these lists are valid
        displayed_news_ids = displayed_news_ids or []
        not_clicked_news_ids = not_clicked_news_ids or []

        user_profile = user_profiles.get(user_id, default_user_profile)
        preference_profile = user_profile['preference_profile']
        non_preference_profile = user_profile['non_preference_profile']
        predicted_not_clicked = []

        # Filter displayed news IDs to ensure valid embeddings
        displayed_news_ids = [nid for nid in displayed_news_ids if nid in news_embeddings_map]

        for news_id in displayed_news_ids:
            news_embedding = news_embeddings_map.get(news_id)
            if news_embedding is None:
                news_ids_not_found += 1
                continue
            news_ids_found += 1

            # Get cosine similarity scores
            preference_score = calculate_cosine_similarity(preference_profile, news_embedding)
            non_preference_score = calculate_cosine_similarity(non_preference_profile, news_embedding)

            # Basic comparison: Predict as not-clicked if non_preference_score > preference_score
            if non_preference_score > preference_score:
                predicted_not_clicked.append(news_id)

        # Evaluate only not-clicked predictions
        if not_clicked_news_ids:
            metrics = evaluate_predictions(predicted_not_clicked, not_clicked_news_ids)
            all_metrics.append(metrics)

with ThreadPoolExecutor() as executor:
    avg_metrics = executor.submit(parallel_metric_calculation, all_metrics).result()

# Step 5: Printing results
print(f"News IDs Found: {news_ids_found}")
print(f"News IDs Not Found: {news_ids_not_found}")

print("\nOverall Evaluation Metrics :")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")

Using cosine similarity for evaluation...
Starting validation...


100%|███████████████████████████████████████████| 37/37 [54:44<00:00, 88.77s/it]

News IDs Found: 13310314
News IDs Not Found: 0

Overall Evaluation Metrics :
Precision: 0.8760
Recall: 0.9421
F1 Score: 0.9038
Accuracy: 0.8760



