In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec
from sklearn.cluster import MiniBatchKMeans
import os
import joblib
import multiprocessing
from joblib import Parallel, delayed
from tensorflow.keras import mixed_precision

# Enable mixed precision for improved GPU utilization.
# The policy is installed globally, before any layer or model is built below,
# so that every Keras layer created later in this script picks it up
# (per the Keras mixed-precision documentation).
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# --------- Step 1: Setup Paths and Variables ---------
print("Setting up paths and variables...")

# Input CSVs: both cleaned datasets live in the same directory.
_cleaned_data_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets'
news_file_path = f'{_cleaned_data_dir}/News_cleaned.csv'
behavior_file_path = f'{_cleaned_data_dir}/cleaned_behavior_dataset.csv'

# Output directory for trained models and encoders; created if missing.
save_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Machine Learning Codes/Trained Models'
os.makedirs(save_dir, exist_ok=True)

# Model / optimisation hyperparameters used by the training code below.
embedding_size, dropout_rate = 64, 0.3
learning_rate, batch_size = 1e-4, 512

# --------- Step 2: GPU Initialization ---------
print("Initializing GPU...")
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    print("No GPU detected. Using CPU.")
else:
    # Only the first visible GPU is configured; memory growth makes
    # TensorFlow allocate device memory on demand instead of up front.
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except Exception as gpu_error:
        print(f"Error enabling GPU: {gpu_error}. Using CPU instead.")
    else:
        print(f"Using GPU: {physical_devices[0]}")

# --------- Step 3: Load and Encode Data with Optimization ---------
print("Loading behavior dataset in chunks...")
def load_behavior_data(file_path, chunksize=500000):
    """Read a CSV file piecewise and return it as a single DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        chunksize: Number of rows parsed per chunk.

    Returns:
        One DataFrame holding all chunks in file order, with a fresh
        0..n-1 index.
    """
    chunk_reader = pd.read_csv(file_path, chunksize=chunksize)
    frames = list(chunk_reader)
    return pd.concat(frames, ignore_index=True)

behavior_df = load_behavior_data(behavior_file_path)

print("Loading and processing news dataset...")
_news_text_columns = ['Category', 'Subcategory', 'Title', 'Abstract']
news_df = pd.read_csv(news_file_path, usecols=_news_text_columns)
# Build one space-separated text field per article; missing fields
# contribute an empty string.
_text_parts = news_df[_news_text_columns].fillna('')
news_df['Text'] = _text_parts.agg(' '.join, axis=1)

# Dictionary-based encoding to reduce LabelEncoder time
print("Creating dictionary-based encoders for users and news items...")
def create_encoding_map(values):
    """Assign an integer code to every distinct entry of a Series.

    Args:
        values: pandas Series whose distinct entries are to be encoded.

    Returns:
        dict mapping each distinct value to 0, 1, 2, ... in order of first
        appearance.
    """
    distinct = pd.Series(values.unique())
    return dict(zip(distinct, range(len(distinct))))

user_map = create_encoding_map(behavior_df['User ID'])

# Collect every news ID mentioned in either interaction column. Each cell is
# a comma-separated string, so it is split and exploded to one ID per row.
_clicked_ids = behavior_df['Clicked News IDs'].str.split(',').explode()
_not_clicked_ids = behavior_df['Not-Clicked News IDs'].str.split(',').explode()
all_news_ids = pd.concat([_clicked_ids, _not_clicked_ids]).dropna().unique()

news_map = dict(zip(all_news_ids, range(len(all_news_ids))))

# Persist both lookup tables next to the trained models.
for _encoder, _filename in ((user_map, 'user_encoder.pkl'),
                            (news_map, 'news_encoder.pkl')):
    joblib.dump(_encoder, os.path.join(save_dir, _filename))

# Parallelized Safe Encoding Function
def safe_encode_parallel(values, encoding_map):
    """Translate raw IDs into integer codes, using -1 for unknown IDs.

    Args:
        values: Iterable of raw IDs (one chunk of a larger column).
        encoding_map: dict mapping raw ID -> integer code.

    Returns:
        1-D int64 NumPy array with one code per input value; values missing
        from ``encoding_map`` are encoded as -1.
    """
    lookup = encoding_map.get
    # The dtype is pinned so that an empty chunk produces an empty *integer*
    # array. Without it NumPy defaults to float64 for an empty list, and
    # stacking that with integer chunks silently turns every ID into a float.
    return np.array([lookup(value, -1) for value in values], dtype=np.int64)

def parallel_safe_encode(series, encoding_map, n_jobs=multiprocessing.cpu_count()):
    """Encode a Series of raw IDs to integer codes across worker processes.

    The values are cut into contiguous chunks, each chunk is encoded by
    ``safe_encode_parallel`` in a joblib worker, and the per-chunk results
    are concatenated in the original order.

    Args:
        series: pandas Series of raw IDs.
        encoding_map: dict mapping raw ID -> integer code.
        n_jobs: Upper bound on the number of chunks / workers.

    Returns:
        1-D NumPy array of codes aligned with ``series``.
    """
    values = series.values
    if len(values) == 0:
        # Nothing to encode; avoid spawning workers for an empty column.
        return np.empty(0, dtype=np.int64)

    # Never create more chunks than there are values: empty chunks would
    # encode to float64 empty arrays and upcast the stacked result to float,
    # besides wasting a worker each.
    n_chunks = max(1, min(int(n_jobs), len(values)))
    chunks = np.array_split(values, n_chunks)
    encoded_chunks = Parallel(n_jobs=n_chunks)(
        delayed(safe_encode_parallel)(chunk, encoding_map) for chunk in chunks
    )
    return np.hstack(encoded_chunks)

def _explode_and_encode(df, id_column, rating, user_map, news_map):
    """Return one encoded (user, news, rating) row per news ID in ``id_column``."""
    # The ID columns hold comma-separated strings (the news encoder in this
    # file is built with ``.str.split(',')``), so each cell must be split
    # into a list before ``explode`` can emit one ID per row. Cells that are
    # already lists, or missing, are passed through unchanged.
    id_lists = df[id_column].map(
        lambda cell: cell.split(',') if isinstance(cell, str) else cell
    )
    pairs = df[['User ID']].assign(**{id_column: id_lists}).explode(id_column).dropna()
    pairs['User ID'] = parallel_safe_encode(pairs['User ID'], user_map)
    pairs['News ID'] = parallel_safe_encode(pairs[id_column], news_map)
    pairs['rating'] = rating
    return pairs


def encode_behavior_data(df, user_map, news_map):
    """Flatten the behaviour log into encoded user/news interaction rows.

    Args:
        df: DataFrame with columns 'User ID', 'Clicked News IDs' and
            'Not-Clicked News IDs'; the latter two hold comma-separated
            news IDs (or lists of IDs).
        user_map: dict mapping raw user ID -> integer code.
        news_map: dict mapping raw news ID -> integer code.

    Returns:
        DataFrame with one row per (user, news) interaction. 'User ID' and
        'News ID' hold integer codes (-1 for IDs absent from the maps) and
        'rating' is 1.0 for clicked and 0.0 for not-clicked items. The raw
        'Clicked News IDs' / 'Not-Clicked News IDs' columns are retained,
        each populated only on the rows that originated from it.
    """
    print("Encoding clicked news data with parallel safe encoding...")
    clicked_df = _explode_and_encode(df, 'Clicked News IDs', 1.0, user_map, news_map)

    print("Encoding not-clicked news data with parallel safe encoding...")
    not_clicked_df = _explode_and_encode(df, 'Not-Clicked News IDs', 0.0, user_map, news_map)

    return pd.concat([clicked_df, not_clicked_df], ignore_index=True)

combined_df = encode_behavior_data(behavior_df, user_map, news_map)
# Vocabulary sizes for the two embedding tables: one entry per encoded ID.
num_users, num_items = len(user_map), len(news_map)

# --------- Step 4: Factorization Machines Layer ---------
print("Defining Factorization Machines Layer...")
class FactorizationMachinesLayer(tf.keras.layers.Layer):
    """Second-order factorization-machine interaction between two embeddings.

    For two fields the FM cross term
    ``0.5 * ((u + v)**2 - (u**2 + v**2))`` is algebraically equal to the
    element-wise product ``u * v``. The product is computed directly: it is
    one op instead of five and avoids the subtraction of two nearly equal
    squares, which loses precision under the float16 compute policy this
    script enables.

    Args:
        embedding_size: Width of the embeddings; stored so the layer can be
            re-created from its config.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.embedding_size = embedding_size

    def call(self, user_embedding, item_embedding):
        """Return the element-wise interaction of the two embeddings."""
        return tf.multiply(user_embedding, item_embedding)

    def get_config(self):
        """Return the constructor arguments so Keras can serialise the layer."""
        config = super().get_config()
        config.update({'embedding_size': self.embedding_size})
        return config

# --------- Step 5: Define the Advanced Model with Single Attention Mechanism ---------
print("Building Deep Cross Network (DCN) model with single attention mechanism...")
class DeepCrossNetwork(tf.keras.Model):
    """Click-probability model over (user, item) index pairs.

    Pipeline: user/item embeddings -> attention gate -> factorization-machine
    interaction -> dense cross layer -> MLP -> sigmoid probability.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_size: Width of both embedding tables.
        dropout_rate: Dropout rate used inside the MLP.
        **kwargs: Standard Keras model arguments (``name``, ...).
    """

    def __init__(self, num_users, num_items, embedding_size, dropout_rate, **kwargs):
        super().__init__(**kwargs)
        # Kept only so get_config() can reproduce the constructor call.
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_size = embedding_size
        self.dropout_rate = dropout_rate

        self.user_embedding = tf.keras.layers.Embedding(num_users, embedding_size)
        self.item_embedding = tf.keras.layers.Embedding(num_items, embedding_size)

        # Attention mechanism with a single dense layer.
        self.attention_dense = tf.keras.layers.Dense(embedding_size, activation='tanh')
        # A softmax over a single unit is identically 1.0, which would make
        # the attention a no-op with zero gradient; a sigmoid gives a
        # learnable gate in (0, 1) with the same output shape.
        self.attention_score = tf.keras.layers.Dense(1, activation='sigmoid')

        self.fm_layer = FactorizationMachinesLayer(embedding_size)
        self.dense_cross = tf.keras.layers.Dense(embedding_size, activation='relu')
        self.deep_component = tf.keras.Sequential([
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(32, activation='relu')
        ])
        # The TensorFlow mixed-precision guide recommends computing the
        # model's final output in float32 for numerical stability of the loss.
        self.prediction_layer = tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')

    def call(self, inputs, training=None):
        """Return one click probability per input row.

        Args:
            inputs: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.
            training: Forwarded to the MLP so its dropout layers are active
                only while training.

        Returns:
            Tensor with one sigmoid output per row (last dimension 1).
        """
        user_input, item_input = inputs[:, 0], inputs[:, 1]
        user_vector = self.user_embedding(user_input)
        item_vector = self.item_embedding(item_input)

        # Apply attention mechanism: one scalar gate per example, computed
        # from both embeddings and applied to the concatenated vector.
        combined_vector = tf.concat([user_vector, item_vector], axis=-1)
        attention_hidden = self.attention_dense(combined_vector)
        attention_weights = self.attention_score(attention_hidden)
        weighted_vector = combined_vector * attention_weights

        # Separate the gated user and item halves again so the FM layer
        # models the user-item interaction rather than squaring one vector.
        weighted_user, weighted_item = tf.split(weighted_vector, 2, axis=-1)
        fm_output = self.fm_layer(weighted_user, weighted_item)
        cross_output = self.dense_cross(fm_output)
        deep_output = self.deep_component(cross_output, training=training)
        return self.prediction_layer(deep_output)

    def get_config(self):
        """Return the constructor arguments so the model can be re-created."""
        return {
            'num_users': self.num_users,
            'num_items': self.num_items,
            'embedding_size': self.embedding_size,
            'dropout_rate': self.dropout_rate,
        }

# --------- Step 6: Prepare Dataset Using tf.data ---------
print("Preparing dataset using tf.data with global shuffling for efficient loading...")
def create_dataset(inputs, ratings, batch_size, shuffle_buffer_size=100_000, seed=None):
    """Build a shuffled, batched and prefetched ``tf.data`` pipeline.

    Args:
        inputs: Array-like of model inputs, one row per example.
        ratings: Array-like of labels aligned with ``inputs``.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the streaming shuffle buffer that adds
            per-epoch variation on top of the initial global permutation.
        seed: Optional seed for both shuffles, for reproducible runs.

    Returns:
        ``tf.data.Dataset`` yielding ``(inputs, ratings)`` batches.

    Raises:
        ValueError: If ``inputs`` and ``ratings`` differ in length.
    """
    inputs = np.asarray(inputs)
    ratings = np.asarray(ratings)
    n_examples = len(inputs)
    if n_examples != len(ratings):
        raise ValueError(
            f"inputs and ratings must have the same length, got {n_examples} and {len(ratings)}"
        )

    # Permute the whole dataset once up front. A small streaming shuffle
    # buffer alone only mixes neighbouring rows, so data laid out as "all
    # positives, then all negatives" would produce single-class batches.
    order = np.random.default_rng(seed).permutation(n_examples)
    dataset = tf.data.Dataset.from_tensor_slices((inputs[order], ratings[order]))

    # No cache()/interleave(): the tensors are already in memory, and an
    # interleave that wraps each element in its own one-element dataset only
    # adds per-example overhead without changing what is produced.
    buffer_size = max(1, min(int(shuffle_buffer_size), n_examples))
    dataset = dataset.shuffle(buffer_size=buffer_size, seed=seed,
                              reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

# --------- Step 7: Train the Model with tf.function ---------
print("Starting model training...")
@tf.function
def train_model_step(inputs, labels, model, optimizer):
    """Run one optimisation step on a batch and return its per-example loss.

    Args:
        inputs: Batch of model inputs.
        labels: Batch of binary targets, one per example.
        model: Keras model called as ``model(inputs, training=True)``.
        optimizer: Keras optimizer used to apply the gradients.

    Returns:
        Tensor of binary cross-entropy values, one per example.
    """
    # NOTE(review): with the mixed_float16 policy the TensorFlow guide
    # recommends a LossScaleOptimizer in custom training loops; that is not
    # applied here - confirm whether it is needed for this setup.
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        # Give the labels the same shape and dtype as the predictions.
        # Flat labels against column-shaped predictions would otherwise
        # broadcast inside the element-wise loss and compare every
        # prediction with every label in the batch.
        labels = tf.reshape(tf.cast(labels, predictions.dtype), tf.shape(predictions))
        loss = tf.keras.losses.binary_crossentropy(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

def train_model(epochs=10, log_every=1):
    """Train a ``DeepCrossNetwork`` on the encoded interactions.

    Reads the module-level ``combined_df``, ``num_users``, ``num_items``,
    ``embedding_size``, ``dropout_rate``, ``learning_rate`` and
    ``batch_size``.

    Args:
        epochs: Number of passes over the dataset.
        log_every: Print the batch loss every this many steps. The default
            of 1 logs every batch; a larger value avoids a host/device
            synchronisation and a print per step.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError(f"log_every must be >= 1, got {log_every}")

    model = DeepCrossNetwork(num_users, num_items, embedding_size, dropout_rate)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    user_ids = combined_df['User ID'].to_numpy()
    news_ids = combined_df['News ID'].to_numpy()
    all_ratings = combined_df['rating'].to_numpy()

    # The encoders mark unknown IDs with -1, which is not a valid row of an
    # embedding table, so such interactions are left out of training.
    known = (user_ids >= 0) & (news_ids >= 0)
    if not known.all():
        print(f"Skipping {int((~known).sum())} interactions with unknown user/news IDs.")

    inputs = np.stack([user_ids[known], news_ids[known]], axis=1)
    ratings = all_ratings[known]
    dataset = create_dataset(inputs, ratings, batch_size)

    for epoch in range(epochs):
        print(f"Starting epoch {epoch + 1}")
        for step, (batch_inputs, batch_labels) in enumerate(dataset, start=1):
            loss = train_model_step(batch_inputs, batch_labels, model, optimizer)
            if step % log_every == 0:
                print(f"Batch loss: {loss.numpy().mean()}")

    return model

model = train_model()

# --------- Step 8: Save Models and Encoders ---------
print("Saving models and encoders...")
_model_path = os.path.join(save_dir, 'deep_cross_network')
model.save(_model_path)
# Write the ID lookup tables again so they sit next to the trained network.
for _filename, _encoder in (('user_encoder.pkl', user_map),
                            ('news_encoder.pkl', news_map)):
    joblib.dump(_encoder, os.path.join(save_dir, _filename))
print("Model and encoders saved successfully.")

# --------- Step 9: Train and Save Word2Vec and KMeans Models ---------
print("Training Word2Vec model...")
# Tokenise once; the same token lists feed the vocabulary build, the
# Word2Vec training and the per-article embeddings below.
tokenized_texts = news_df['Text'].str.split().tolist()
word2vec_model = Word2Vec(vector_size=100, window=5, min_count=2, workers=multiprocessing.cpu_count())
word2vec_model.build_vocab(tokenized_texts)
word2vec_model.train(tokenized_texts, total_examples=len(tokenized_texts), epochs=5)

print("Training KMeans model...")
mini_batch_kmeans = MiniBatchKMeans(n_clusters=70, batch_size=500, n_init='auto')


def _mean_word_vector(tokens, keyed_vectors):
    """Average the vectors of the in-vocabulary tokens of one article."""
    known_tokens = [token for token in tokens if token in keyed_vectors]
    if not known_tokens:
        # No token survived the vocabulary cut-off: use a zero vector so
        # every article still gets a finite embedding of the right width.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    # Look up the word *vectors*; averaging the token strings themselves is
    # not a numeric operation.
    return keyed_vectors[known_tokens].mean(axis=0)


news_embeddings = np.vstack([
    _mean_word_vector(tokens, word2vec_model.wv) for tokens in tokenized_texts
])
mini_batch_kmeans.fit(news_embeddings)

word2vec_model.save(os.path.join(save_dir, 'word2vec_model.model'))
joblib.dump(mini_batch_kmeans, os.path.join(save_dir, 'mini_batch_kmeans_news_model.pkl'))
print("All models saved successfully.")


#--------end--------

Setting up paths and variables...
Initializing GPU...
Using GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
Loading behavior dataset in chunks...
Loading and processing news dataset...
Creating dictionary-based encoders for users and news items...
Encoding clicked news data with parallel safe encoding...
Encoding not-clicked news data with parallel safe encoding...
Defining Factorization Machines Layer...
Building Deep Cross Network (DCN) model with single attention mechanism...
Preparing dataset using tf.data with interleave for efficient loading...
Starting model training...
Starting epoch 1


2024-10-31 11:43:16.829473: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Max
2024-10-31 11:43:16.829503: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2024-10-31 11:43:16.829515: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
2024-10-31 11:43:16.829533: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-31 11:43:16.829544: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
