In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from sklearn.cluster import MiniBatchKMeans
import os
import joblib
import multiprocessing
from joblib import Parallel, delayed

# --------- Step 1: Setup Paths and Variables ---------
# Hyperparameters consumed by the network definition and the training run.
embedding_size = 64     # width of the user and item embedding vectors
num_heads = 4           # head count for the multi-head attention layer
dropout_rate = 0.3      # dropout rate used inside the deep component
learning_rate = 1e-4    # step size handed to the Adam optimizer
batch_size = 512        # mini-batch size passed to model.fit

# Cleaned input datasets read further down in the script.
news_file_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets/News_cleaned.csv'
behavior_file_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets/cleaned_behavior_dataset.csv'

# Destination folder for every trained artifact; created up front so the
# save calls at the end of the script do not fail on a missing directory.
save_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Machine Learning Codes/Trained Models'
os.makedirs(save_dir, exist_ok=True)

# --------- Step 2: GPU Initialization ---------
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# at start-up. The setting is applied to every visible GPU because TensorFlow
# requires memory growth to be configured identically across devices.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    for device in physical_devices:
        try:
            tf.config.experimental.set_memory_growth(device, True)
            print(f"Using GPU: {device}")
        except (RuntimeError, ValueError) as e:
            # RuntimeError: the runtime was already initialized.
            # ValueError: the device was rejected.
            # Either way the GPU itself stays usable; only the on-demand
            # allocation setting was not applied.
            print(f"Could not enable memory growth on {device}: {e}. "
                  "Continuing with TensorFlow's default GPU memory allocation.")
else:
    print("No GPU detected. Using CPU.")

# --------- Step 3: Load and Encode Data with Optimization ---------
print("Loading and encoding datasets...")


# Read the behavior CSV piece by piece, then stitch the pieces together
def load_and_process_behavior(file_path, chunksize=100000):
    """Read a CSV file in fixed-size chunks and return it as one DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        chunksize: Number of rows parsed per chunk.

    Returns:
        A single DataFrame holding every row of the file, with a fresh
        0..n-1 index. All chunks are kept and concatenated, so the complete
        table ends up in memory.
    """
    reader = pd.read_csv(file_path, chunksize=chunksize)
    pieces = list(reader)
    return pd.concat(pieces, ignore_index=True)

behavior_df = load_and_process_behavior(behavior_file_path)

# Build one free-text field per news article out of its descriptive columns;
# missing values become empty strings so the join never sees NaN.
text_columns = ['Category', 'Subcategory', 'Title', 'Abstract']
news_df = pd.read_csv(news_file_path, usecols=text_columns)
news_df['Text'] = news_df[text_columns].fillna('').agg(' '.join, axis=1)

# Encoders are fitted on de-duplicated values only.
user_encoder = LabelEncoder()
user_encoder.fit(behavior_df['User ID'].unique())

# News IDs are stored as comma-separated strings; split and flatten both the
# clicked and the not-clicked columns so the encoder sees every ID once.
impression_columns = ('Clicked News IDs', 'Not-Clicked News IDs')
all_news_ids = pd.concat(
    [behavior_df[name].str.split(',').explode() for name in impression_columns]
)
news_encoder = LabelEncoder()
news_encoder.fit(all_news_ids.dropna().unique())

# --------- Parallelized Safe Encoding Function ---------
def safe_encode_parallel(values, encoder):
    """Encode ``values`` with a fitted label encoder, mapping unknowns to -1.

    Args:
        values: Iterable of raw labels to encode.
        encoder: Fitted encoder exposing ``classes_``; the position of a label
            inside ``classes_`` is its encoded value.

    Returns:
        A 1-D ``int64`` array with one entry per input value. Labels that are
        not present in ``encoder.classes_`` are encoded as -1.
    """
    # Build the label -> index table once. Calling encoder.transform() for
    # every single value repeats the per-call setup work for each element,
    # which dominates the runtime on large columns.
    label_to_index = {label: index for index, label in enumerate(encoder.classes_)}
    # An explicit dtype keeps empty inputs integer-typed as well.
    return np.array([label_to_index.get(v, -1) for v in values], dtype=np.int64)

# --------- Parallel Encoding for Large Dataframes ---------
def parallel_safe_encode(df, column, encoder, n_jobs=multiprocessing.cpu_count()):
    """Encode one DataFrame column in parallel with ``safe_encode_parallel``.

    Args:
        df: DataFrame holding the column to encode.
        column: Name of the column whose values are encoded.
        encoder: Fitted label encoder forwarded to every worker.
        n_jobs: Worker count handed to joblib; also the upper bound on the
            number of chunks the column is split into.

    Returns:
        A 1-D ``int64`` array aligned with ``df[column]``; labels the encoder
        does not know are -1.
    """
    values = df[column].values
    # Never create more chunks than there are values: an empty chunk could
    # come back as a float array and upcast the stacked result. The lower
    # bound of 1 also keeps np.array_split valid for empty columns and for
    # negative n_jobs values.
    n_chunks = max(1, min(n_jobs, len(values)))
    chunks = np.array_split(values, n_chunks)
    encoded_chunks = Parallel(n_jobs=n_jobs)(
        delayed(safe_encode_parallel)(chunk, encoder) for chunk in chunks
    )
    # Encoded labels are indices, so pin the dtype to integers.
    return np.hstack(encoded_chunks).astype(np.int64, copy=False)

# Transform User and News IDs Safely with Parallel Processing
# The news-ID columns hold comma-separated strings, so they must be split into
# lists before explode(); exploding the raw strings leaves each row untouched
# and every multi-ID row would then encode to -1. The split matches the one
# used when fitting news_encoder.
clicked_df = (
    behavior_df[['User ID', 'Clicked News IDs']]
    .assign(**{'Clicked News IDs': behavior_df['Clicked News IDs'].str.split(',')})
    .explode('Clicked News IDs')
    .dropna()
)
not_clicked_df = (
    behavior_df[['User ID', 'Not-Clicked News IDs']]
    .assign(**{'Not-Clicked News IDs': behavior_df['Not-Clicked News IDs'].str.split(',')})
    .explode('Not-Clicked News IDs')
    .dropna()
)

# Positive interactions: one row per (user, clicked article).
clicked_df['User ID'] = parallel_safe_encode(clicked_df, 'User ID', user_encoder)
clicked_df['News ID'] = parallel_safe_encode(clicked_df, 'Clicked News IDs', news_encoder)
clicked_df['rating'] = 1.0

# Negative interactions: one row per (user, shown-but-not-clicked article).
not_clicked_df['User ID'] = parallel_safe_encode(not_clicked_df, 'User ID', user_encoder)
not_clicked_df['News ID'] = parallel_safe_encode(not_clicked_df, 'Not-Clicked News IDs', news_encoder)
not_clicked_df['rating'] = 0.0

# Combine clicked and not-clicked datasets
combined_df = pd.concat([clicked_df, not_clicked_df], ignore_index=True)

# -1 marks an ID the encoder does not know; it is not a valid embedding index,
# so such rows are removed before they can reach the model.
known_ids = (combined_df['User ID'] >= 0) & (combined_df['News ID'] >= 0)
combined_df = combined_df[known_ids].reset_index(drop=True)

# Vocabulary sizes for the embedding tables.
num_users = len(user_encoder.classes_)
num_items = len(news_encoder.classes_)

# --------- Step 4: Factorization Machines Layer ---------
class FactorizationMachinesLayer(tf.keras.layers.Layer):
    """Second-order interaction between a user and an item embedding.

    Computes ``0.5 * ((u + v)^2 - (u^2 + v^2))`` element-wise, which is
    algebraically the element-wise product ``u * v``.
    """

    def __init__(self, embedding_size, **kwargs):
        """Create the layer.

        Args:
            embedding_size: Width of the embeddings this layer is used with;
                stored for reference and serialization only.
            **kwargs: Standard Keras layer arguments (e.g. ``name``,
                ``dtype``) forwarded to the base class.
        """
        super().__init__(**kwargs)
        self.embedding_size = embedding_size

    def call(self, user_embedding, item_embedding):
        """Return the pairwise interaction term of the two embeddings.

        Args:
            user_embedding: Tensor of user embeddings.
            item_embedding: Tensor of item embeddings, same shape as
                ``user_embedding``.

        Returns:
            Tensor of the same shape as the inputs holding the cross term.
        """
        summed_features_emb = tf.add(user_embedding, item_embedding)
        summed_features_emb_square = tf.square(summed_features_emb)

        squared_sum_features_emb = tf.add(tf.square(user_embedding), tf.square(item_embedding))

        cross_term = 0.5 * tf.subtract(summed_features_emb_square, squared_sum_features_emb)
        return cross_term

    def get_config(self):
        """Return the constructor arguments so Keras can rebuild the layer."""
        config = super().get_config()
        config.update({'embedding_size': self.embedding_size})
        return config

# --------- Step 5: Define the Advanced Model with DCN ---------
class DeepCrossNetwork(tf.keras.Model):
    """Click-probability model over (user, item) index pairs.

    Each pair is embedded, combined through a factorization-machine term and
    a multi-head attention term, mixed by a dense "cross" layer, passed
    through a small MLP, and mapped to a sigmoid probability.
    """

    def __init__(self, num_users, num_items, embedding_size, num_heads, dropout_rate):
        """Build all sub-layers.

        Args:
            num_users: Number of distinct encoded user IDs.
            num_items: Number of distinct encoded news IDs.
            embedding_size: Width of both embedding tables.
            num_heads: Number of attention heads.
            dropout_rate: Dropout rate used inside the deep component.
        """
        super().__init__()
        self.user_embedding = tf.keras.layers.Embedding(num_users, embedding_size)
        self.item_embedding = tf.keras.layers.Embedding(num_items, embedding_size)

        self.transformer_encoder = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_size)

        self.fm_layer = FactorizationMachinesLayer(embedding_size)

        self.dense_cross = tf.keras.layers.Dense(embedding_size, activation='relu')

        self.deep_component = tf.keras.Sequential([
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(32, activation='relu')
        ])

        self.prediction_layer = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        """Score a batch of (user index, item index) pairs.

        Args:
            inputs: Rank-2 tensor whose column 0 holds encoded user IDs and
                column 1 holds encoded news IDs.

        Returns:
            Tensor of shape (batch, 1) with the predicted click probability.
        """
        user_input, item_input = inputs[:, 0], inputs[:, 1]

        user_vector = self.user_embedding(user_input)
        item_vector = self.item_embedding(item_input)

        fm_output = self.fm_layer(user_vector, item_vector)

        # MultiHeadAttention expects (batch, sequence, features). The
        # embeddings are (batch, features), so add a length-1 sequence axis
        # for the call and remove it again afterwards.
        user_sequence = tf.expand_dims(user_vector, axis=1)
        item_sequence = tf.expand_dims(item_vector, axis=1)
        attention_output = tf.squeeze(
            self.transformer_encoder(user_sequence, item_sequence), axis=1
        )

        cross_output = self.dense_cross(tf.concat([fm_output, attention_output], axis=1))
        deep_output = self.deep_component(cross_output)

        return self.prediction_layer(deep_output)

# --------- Step 6: Train the Model ---------
def train_model():
    """Build, compile and fit the DeepCrossNetwork on ``combined_df``.

    Reads the module-level hyperparameters, ``num_users``/``num_items`` and
    ``combined_df``.

    Returns:
        The fitted model, carrying the weights of the epoch with the lowest
        validation loss.
    """
    model = DeepCrossNetwork(num_users, num_items, embedding_size, num_heads, dropout_rate)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])

    inputs = np.stack([combined_df['User ID'].values, combined_df['News ID'].values], axis=1)
    ratings = combined_df['rating'].values

    # Negative indices mark IDs the encoders did not know; they are not valid
    # embedding lookups, so drop those rows.
    valid_rows = (inputs >= 0).all(axis=1)
    inputs, ratings = inputs[valid_rows], ratings[valid_rows]

    # validation_split takes the LAST fraction of the arrays before any
    # shuffling. combined_df is clicked rows followed by not-clicked rows, so
    # without this permutation the validation set would contain a single
    # class. A fixed seed keeps the split reproducible.
    order = np.random.default_rng(42).permutation(len(ratings))
    inputs, ratings = inputs[order], ratings[order]

    # Keep the best weights rather than the ones from the last (worse) epochs
    # that triggered the stop.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=3, restore_best_weights=True)
    lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.5)

    model.fit(inputs, ratings, epochs=10, batch_size=batch_size, validation_split=0.1,
              callbacks=[early_stopping, lr_callback])
    return model

print("Training the model...")
model = train_model()

# --------- Step 7: Save Models and Encoders ---------
# Persist the network together with both label encoders, so raw user/news IDs
# can later be mapped to the same indices the model was trained on.
model.save(os.path.join(save_dir, 'deep_cross_network'))
for fitted_encoder, file_name in (
    (user_encoder, 'user_encoder.pkl'),
    (news_encoder, 'news_encoder.pkl'),
):
    joblib.dump(fitted_encoder, os.path.join(save_dir, file_name))

print("Model and encoders saved successfully.")

# --------- Step 8: Train and Save Word2Vec and KMeans Models ---------
print("Training Word2Vec and KMeans models...")

# Tokenize once and reuse the result for vocabulary building, training and
# the per-article embeddings.
tokenized_texts = news_df['Text'].str.split().tolist()

word2vec_model = Word2Vec(vector_size=100, window=5, min_count=2, workers=multiprocessing.cpu_count())
word2vec_model.build_vocab(tokenized_texts)
word2vec_model.train(tokenized_texts, total_examples=len(news_df), epochs=5)


def _mean_word_vector(tokens, keyed_vectors):
    """Average the vectors of the in-vocabulary tokens of one article.

    Args:
        tokens: List of word strings for a single article.
        keyed_vectors: Trained gensim keyed vectors (``model.wv``).

    Returns:
        A 1-D array of length ``keyed_vectors.vector_size``. Articles with no
        in-vocabulary token get a zero vector, so every row has the same
        shape and contains no NaN.
    """
    known_tokens = [token for token in tokens if token in keyed_vectors]
    if not known_tokens:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    # Look up the word VECTORS; averaging the token strings themselves is
    # not a numeric operation.
    return np.mean(keyed_vectors[known_tokens], axis=0)


mini_batch_kmeans = MiniBatchKMeans(n_clusters=70, batch_size=500, n_init='auto')
news_embeddings = np.vstack(
    [_mean_word_vector(tokens, word2vec_model.wv) for tokens in tokenized_texts]
)
mini_batch_kmeans.fit(news_embeddings)

word2vec_model.save(os.path.join(save_dir, 'word2vec_model.model'))
joblib.dump(mini_batch_kmeans, os.path.join(save_dir, 'mini_batch_kmeans_news_model.pkl'))

print("All models saved successfully.")


#--------end--------

Using GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
Loading and encoding datasets...
