# In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec
from sklearn.cluster import MiniBatchKMeans
import os
import joblib
import multiprocessing
from joblib import Parallel, delayed
from torch.utils.data import DataLoader, TensorDataset

# --------- Step 1: Setup Paths and Variables ---------
print("Setting up paths and variables...")

# Input datasets: the cleaned behaviour log and the cleaned news catalogue.
behavior_file_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets/cleaned_behavior_dataset.csv'
news_file_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets/News_cleaned.csv'

# Output directory for every trained artefact (model weights, encoders,
# Word2Vec and KMeans models). Created up front so later saves cannot fail
# on a missing directory.
save_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Machine Learning Codes/Trained Models'
os.makedirs(save_dir, exist_ok=True)

# Optimisation hyperparameters.
batch_size = 2048
learning_rate = 1e-4
dropout_rate = 0.3

# Model-size hyperparameters. Both are passed to nn.MultiheadAttention
# (embed_dim=embedding_size, num_heads=num_heads) when the model is built.
embedding_size = 64
num_heads = 64  # Adjust based on your available resources

# --------- Step 2: Set Device for PyTorch ---------
# `torch.has_mps` is deprecated and only reports whether PyTorch was *built*
# with MPS support, not whether an MPS device can actually be used on this
# machine. Ask the backend directly, and fall back to CPU when the backend
# module is missing (older PyTorch builds) or the device is unavailable.
_mps_backend = getattr(torch.backends, "mps", None)
_mps_usable = _mps_backend is not None and _mps_backend.is_available()
device = torch.device("mps" if _mps_usable else "cpu")
print(f"Using device: {device}")

# --------- Step 3: Load and Encode Data with Optimization ---------
print("Loading behavior dataset in chunks...")
def load_behavior_data(file_path, chunksize=500000):
    """Read a CSV file piece by piece and return it as one DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        chunksize: Number of rows parsed per chunk.

    Returns:
        A single DataFrame holding every chunk in file order, re-indexed
        from 0 (the per-chunk indices are discarded).
    """
    chunk_reader = pd.read_csv(file_path, chunksize=chunksize)
    frames = list(chunk_reader)
    return pd.concat(frames, ignore_index=True)

behavior_df = load_behavior_data(behavior_file_path)

print("Loading and processing news dataset...")
# Only the four textual columns are needed; they are merged into a single
# space-separated 'Text' column, with missing fields treated as empty strings.
news_df = pd.read_csv(
    news_file_path,
    usecols=['Category', 'Subcategory', 'Title', 'Abstract'],
)
news_df['Text'] = (
    news_df[['Category', 'Subcategory', 'Title', 'Abstract']]
    .fillna('')
    .agg(' '.join, axis=1)
)

print("Creating dictionary-based encoders for users and news items...")
def create_encoding_map(values):
    """Build a value -> integer-code dictionary from a pandas Series.

    Args:
        values: Series whose distinct values should receive codes.

    Returns:
        Dict mapping each distinct value to a consecutive integer starting
        at 0, numbered in the order returned by ``values.unique()``.
    """
    distinct = pd.Series(values.unique())
    return dict(zip(distinct, range(len(distinct))))

user_map = create_encoding_map(behavior_df['User ID'])

# Collect every news ID that appears in either impression column. Each cell
# is a comma-separated string, so it is split and exploded to one ID per row
# before missing entries are dropped and duplicates removed.
all_news_ids = pd.concat(
    [
        behavior_df[id_column].str.split(',').explode()
        for id_column in ('Clicked News IDs', 'Not-Clicked News IDs')
    ]
).dropna().unique()
news_map = dict(zip(all_news_ids, range(len(all_news_ids))))

# Persist both encoders right away.
for encoder, file_name in ((user_map, 'user_encoder.pkl'), (news_map, 'news_encoder.pkl')):
    joblib.dump(encoder, os.path.join(save_dir, file_name))

# Parallelized Safe Encoding Function
def safe_encode_parallel(values, encoding_map):
    """Translate raw values into integer codes, using -1 for unknown values.

    Args:
        values: Iterable of raw (hashable) values to encode.
        encoding_map: Dict mapping raw values to integer codes.

    Returns:
        1-D ``int64`` array with one code per input value; values absent
        from ``encoding_map`` are encoded as -1.

    The dtype is pinned to int64 on purpose: without it an empty input
    produces a float64 array, and stacking that with integer chunks silently
    upcasts every code to float.
    """
    return np.array([encoding_map.get(v, -1) for v in values], dtype=np.int64)

def parallel_safe_encode(series, encoding_map, n_jobs=multiprocessing.cpu_count()):
    """Encode a Series with ``encoding_map`` using several joblib workers.

    Args:
        series: pandas Series of raw values to encode.
        encoding_map: Dict mapping raw values to integer codes.
        n_jobs: Number of joblib workers; defaults to the CPU count measured
            when this function was defined.

    Returns:
        1-D array of codes in the same order as ``series``; values missing
        from ``encoding_map`` are encoded as -1 by ``safe_encode_parallel``.
    """
    values = series.values
    if len(values) == 0:
        # Nothing to dispatch; return an integer array so callers never see
        # a float-typed result for an empty input.
        return np.array([], dtype=np.int64)

    # Never create more chunks than there are values: empty chunks come back
    # as float arrays and would upcast the stacked result to float.
    n_chunks = min(n_jobs, len(values))
    chunks = np.array_split(values, n_chunks)
    encoded_chunks = Parallel(n_jobs=n_jobs)(
        delayed(safe_encode_parallel)(chunk, encoding_map) for chunk in chunks
    )
    return np.hstack(encoded_chunks)

def _explode_news_ids(df, id_column):
    """Return one ('User ID', news ID) row per news ID listed in ``id_column``."""
    pairs = df[['User ID', id_column]].copy()
    # The impression columns hold comma-separated strings (the encoder-building
    # code splits them on ','). explode() only expands list-likes, so strings
    # must be split first; cells that are already lists pass through untouched.
    pairs[id_column] = pairs[id_column].map(
        lambda cell: cell.split(',') if isinstance(cell, str) else cell
    )
    return pairs.explode(id_column).dropna()


def encode_behavior_data(df, user_map, news_map):
    """Turn the behaviour log into encoded (user, news, rating) rows.

    Args:
        df: Behaviour DataFrame with 'User ID', 'Clicked News IDs' and
            'Not-Clicked News IDs' columns.
        user_map: Dict mapping raw user IDs to integer codes.
        news_map: Dict mapping raw news IDs to integer codes.

    Returns:
        DataFrame with the clicked rows (rating 1.0) followed by the
        not-clicked rows (rating 0.0). 'User ID' and 'News ID' hold the
        integer codes; the raw ID column of each group is kept as well.
        Rows whose user or news ID is missing from the maps are dropped,
        because their -1 code is not a valid embedding index.
    """
    encoded_frames = []
    for label, id_column, rating in (
        ('clicked', 'Clicked News IDs', 1.0),
        ('not-clicked', 'Not-Clicked News IDs', 0.0),
    ):
        print(f"Encoding {label} news data with parallel safe encoding...")
        pairs = _explode_news_ids(df, id_column)
        pairs['User ID'] = parallel_safe_encode(pairs['User ID'], user_map)
        pairs['News ID'] = parallel_safe_encode(pairs[id_column], news_map)
        pairs['rating'] = rating

        # -1 marks an ID that is absent from the encoding maps.
        known = (pairs['User ID'] >= 0) & (pairs['News ID'] >= 0)
        n_unknown = int((~known).sum())
        if n_unknown:
            print(f"Dropping {n_unknown} {label} rows with unknown user or news IDs.")
            pairs = pairs[known]
        encoded_frames.append(pairs)

    return pd.concat(encoded_frames, ignore_index=True)

combined_df = encode_behavior_data(behavior_df, user_map, news_map)
num_items = len(news_map)
num_users = len(user_map)

# Prepare dataset for PyTorch
print("Preparing PyTorch dataset...")
# Column 0 carries the encoded user, column 1 the encoded news item.
user_item_pairs = np.column_stack(
    (combined_df['User ID'].values, combined_df['News ID'].values)
)
inputs = torch.tensor(user_item_pairs, dtype=torch.long)
ratings = torch.tensor(combined_df['rating'].values, dtype=torch.float32)
dataset = TensorDataset(inputs, ratings)
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

# --------- Step 4: Define the Deep Cross Network with Multi-Head Attention ---------
class FactorizationMachinesLayer(nn.Module):
    """Parameter-free pairwise interaction of two embedding tensors."""

    def __init__(self, embedding_size):
        super().__init__()
        # Stored for reference only; forward() does not read it.
        self.embedding_size = embedding_size

    def forward(self, user_embedding, item_embedding):
        """Return 0.5 * ((u + i)^2 - (u^2 + i^2)), computed element-wise.

        Args:
            user_embedding: Tensor of user vectors.
            item_embedding: Tensor of item vectors, broadcast-compatible with
                ``user_embedding``.

        Returns:
            Tensor of cross terms with the broadcast shape of the inputs.
        """
        square_of_sum = (user_embedding + item_embedding) ** 2
        sum_of_squares = user_embedding ** 2 + item_embedding ** 2
        return 0.5 * (square_of_sum - sum_of_squares)

class DeepCrossNetworkWithMultiHeadAttention(nn.Module):
    """Predict a click probability for (user index, item index) pairs.

    The user and item embeddings are stacked into a two-token sequence and
    passed through self-attention. The attended user token then goes through
    the FM layer, a dense "cross" layer and an MLP, and a sigmoid head turns
    the result into a probability.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_size: Width of both embedding tables and of the attention.
        dropout_rate: Dropout probability used inside the MLP.
        num_heads: Number of attention heads passed to nn.MultiheadAttention.
    """

    def __init__(self, num_users, num_items, embedding_size, dropout_rate, num_heads):
        super(DeepCrossNetworkWithMultiHeadAttention, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)

        # Multi-Head Attention Layer
        self.multihead_attention = nn.MultiheadAttention(embed_dim=embedding_size, num_heads=num_heads, batch_first=True)

        self.fm_layer = FactorizationMachinesLayer(embedding_size)
        self.dense_cross = nn.Linear(embedding_size, embedding_size)
        self.deep_component = nn.Sequential(
            nn.Linear(embedding_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        self.prediction_layer = nn.Linear(32, 1)

    def forward(self, inputs):
        """Score a batch of (user, item) pairs.

        Args:
            inputs: Integer tensor of shape (batch, 2); column 0 holds user
                indices and column 1 holds item indices.

        Returns:
            Tensor of shape (batch,) with sigmoid outputs in (0, 1).
        """
        user_input, item_input = inputs[:, 0], inputs[:, 1]
        user_vector = self.user_embedding(user_input)
        item_vector = self.item_embedding(item_input)

        # Stack user and item into a (batch, 2, embedding) sequence and apply
        # multi-head self-attention over the two tokens.
        combined_vector = torch.cat((user_vector.unsqueeze(1), item_vector.unsqueeze(1)), dim=1)
        attn_output, _ = self.multihead_attention(combined_vector, combined_vector, combined_vector)
        attn_output = attn_output[:, 0, :]  # Focus on the user context

        # Factorization Machines and Deep Component
        # NOTE(review): both FM arguments are the same tensor, so the cross
        # term reduces to attn_output ** 2 and the attended item token is
        # unused. Left unchanged to keep predictions consistent with existing
        # checkpoints; confirm whether (user token, item token) was intended.
        fm_output = self.fm_layer(attn_output, attn_output)
        cross_output = torch.relu(self.dense_cross(fm_output))
        deep_output = self.deep_component(cross_output)

        # Squeeze only the trailing feature axis. A bare squeeze() would also
        # drop the batch axis for a batch of one, producing a 0-dim tensor
        # that no longer matches the (1,) label tensor in the loss.
        return torch.sigmoid(self.prediction_layer(deep_output)).squeeze(-1)

# --------- Step 5: Train the Model with Learning Rate Scheduler and Early Stopping ---------
print("Starting model training with adaptive learning rate and early stopping...")
model = DeepCrossNetworkWithMultiHeadAttention(num_users, num_items, embedding_size, dropout_rate, num_heads).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Learning Rate Scheduler and Early Stopping
# The scheduler's `verbose` argument is deprecated in PyTorch, so it is not
# passed here; learning-rate changes are reported explicitly after each step.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
early_stop_patience = 5
max_epochs = 70

best_loss = float('inf')
epochs_no_improve = 0

for epoch in range(max_epochs):
    model.train()
    running_loss = 0.0
    # Batch-specific names keep the module-level `inputs` tensor (the full
    # dataset) from being overwritten by the last mini-batch.
    for batch_inputs, batch_labels in dataloader:
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    epoch_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss}")

    # Adjust learning rate if loss plateaus, and report any change.
    previous_lr = optimizer.param_groups[0]['lr']
    scheduler.step(epoch_loss)
    current_lr = optimizer.param_groups[0]['lr']
    if current_lr != previous_lr:
        print(f"Learning rate reduced from {previous_lr} to {current_lr}")

    # Early stopping check (based on the training loss; no validation split
    # is used in this script).
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= early_stop_patience:
        print("Early stopping triggered.")
        break

# --------- Step 6: Save Models and Encoders ---------
print("Saving models and encoders...")
# Only the weights (state_dict) are stored, not the module object itself.
model_weights_path = os.path.join(save_dir, 'deep_cross_network_with_multihead_attention.pt')
torch.save(model.state_dict(), model_weights_path)

# Write the ID encoders next to the weights.
for encoder, file_name in ((user_map, 'user_encoder.pkl'), (news_map, 'news_encoder.pkl')):
    joblib.dump(encoder, os.path.join(save_dir, file_name))

# --------- Step 7: Train and Save Word2Vec and KMeans Models ---------
print("Training Word2Vec model...")
# Tokenise the corpus once and reuse it for both the vocabulary scan and the
# training pass instead of whitespace-splitting every text twice.
tokenized_texts = news_df['Text'].str.split()
word2vec_model = Word2Vec(vector_size=100, window=5, min_count=2, workers=multiprocessing.cpu_count())
word2vec_model.build_vocab(tokenized_texts)
word2vec_model.train(tokenized_texts, total_examples=len(tokenized_texts), epochs=5)
# The token lists are not needed after training; release them.
del tokenized_texts

# Helper: average word vector of a single text entry
def get_mean_embedding(text, model, vector_size=100):
    """Average the vectors of the in-vocabulary words of ``text``.

    Args:
        text: String that is split on whitespace into words.
        model: Object exposing ``model.wv``, which supports ``word in wv``
            and ``wv[word]`` lookups.
        vector_size: Length of the zero vector returned when no word of
            ``text`` is in the vocabulary.

    Returns:
        The element-wise mean of the found word vectors, or
        ``np.zeros(vector_size)`` when none of the words is known.
    """
    vocab = model.wv
    known_vectors = [vocab[token] for token in text.split() if token in vocab]
    if not known_vectors:
        # Empty text or only out-of-vocabulary words.
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

# Build one mean-embedding row per news text
print("Generating mean embeddings for KMeans training...")
news_embeddings = np.vstack(
    [get_mean_embedding(text, word2vec_model, vector_size=100) for text in news_df['Text']]
)

# Cluster the news embeddings with mini-batch KMeans
print("Training KMeans model...")
mini_batch_kmeans = MiniBatchKMeans(n_clusters=70, batch_size=500, n_init='auto')
mini_batch_kmeans.fit(news_embeddings)

# Persist the Word2Vec model (gensim format) and the fitted KMeans (joblib)
word2vec_path = os.path.join(save_dir, 'word2vec_model.model')
kmeans_path = os.path.join(save_dir, 'mini_batch_kmeans_news_model.pkl')
word2vec_model.save(word2vec_path)
joblib.dump(mini_batch_kmeans, kmeans_path)
print("All models saved successfully.")

#--------end--------