In [1]:
import itertools
import os

import joblib
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
from gensim.models import Word2Vec
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

# ----------- Step 1: Spark Session Initialization -----------
# Build (or reuse) the local Spark session used for the ALS fits.
# Every configuration key is set exactly once: the builder keeps one value per
# key, so a repeated key silently replaces the earlier value.
spark = (
    SparkSession.builder
    .appName("NewsRecommendationALS")
    .master("local[*]")
    .config("spark.executor.memory", "20g")
    .config("spark.driver.memory", "20g")
    .config("spark.hadoop.hadoop.security.authentication", "simple")
    .config("spark.hadoop.hadoop.security.authorization", "false")
    .config("spark.network.timeout", "2000s")
    .config("spark.executor.heartbeatInterval", "200s")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0")
    .config("spark.executor.extraLibraryPath", "/opt/homebrew/opt/openblas/lib")
    .config("spark.driver.extraLibraryPath", "/opt/homebrew/opt/openblas/lib")
    # Both driver JVM flags live in ONE value. Previously they were passed in
    # two separate .config() calls for the same key, so only "-XX:+UseG1GC"
    # took effect and the security-manager flag was dropped.
    # NOTE(review): "-Djava.security.manager=allow" is presumably only
    # understood by newer JDKs -- confirm against the installed Java version.
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow -XX:+UseG1GC")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")  # Reduce log verbosity
print("Spark session created successfully!")

# Directory that receives every persisted model artifact of this script.
save_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Machine Learning Codes/Trained Models'
# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(save_dir, exist_ok=True)

# ----------- Step 2: Load News and Behavior Datasets in Batches -----------
batch_size = 2500  # Rows per pandas chunk; kept small to minimize memory issues

# Column names are supplied explicitly for both files.
# NOTE(review): with `names=` and no `header=` argument pandas treats the first
# line of each file as data -- confirm the cleaned CSVs carry no header row.
news_columns = [
    "News ID",
    "Category",
    "Subcategory",
    "Title",
    "Abstract",
    "URL",
    "Entities Mentioned",
    "Entities in Abstract",
]
behavior_columns = [
    "Impression ID",
    "User ID",
    "Timestamp",
    "Displayed News List",
    "Impression List (Clicked Status)",
    "Impression Dictionary",
    "Clicked News IDs",
    "Not-Clicked News IDs",
]

news_file_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets/News_cleaned.csv'
behavior_file_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets/cleaned_behavior_dataset.csv'

# Passing `chunksize` makes read_csv return a reader that yields DataFrames of
# at most `batch_size` rows instead of loading the whole file at once.
news_df_iterator = pd.read_csv(news_file_path, names=news_columns, sep=',', chunksize=batch_size)
behavior_df_iterator = pd.read_csv(behavior_file_path, names=behavior_columns, sep=',', chunksize=batch_size)

# ----------- Step 3: Initialize Label Encoders for User ID and News ID -----------
# Encoders that map the raw user / news id values to the integer ids ALS needs.
user_encoder = LabelEncoder()
news_encoder = LabelEncoder()

# Fit the label encoders on the complete behavior file (to ensure consistency
# across batches). Ids are gathered into sets so an id that shows up in many
# chunks is stored only once; the encoders need the distinct values only, so
# the fitted classes are the same as when fitting on the duplicated lists.
user_ids = set()
news_ids = set()
for behavior_df in behavior_df_iterator:
    user_ids.update(behavior_df['User ID'].unique())
    for id_list_column in ('Clicked News IDs', 'Not-Clicked News IDs'):
        # Comma-separated id lists -> one id per row; missing lists are dropped.
        news_ids.update(behavior_df[id_list_column].str.split(',').explode().dropna().unique())

# Keep the names bound to lists, as before, for anything that reads them later.
user_ids = list(user_ids)
news_ids = list(news_ids)

user_encoder.fit(user_ids)
news_encoder.fit(news_ids)

# Restart iterators after fitting label encoders (the behavior reader above has
# been fully consumed by the loop).
behavior_df_iterator = pd.read_csv(behavior_file_path, sep=',', names=behavior_columns, chunksize=batch_size)
news_df_iterator = pd.read_csv(news_file_path, sep=',', names=news_columns, chunksize=batch_size)

# ----------- Step 4: Counter Function -----------
# Simple batch counter function
def batch_counter(start=0):
    """Yield ``start``, then each following value one step of 1 apart, forever.

    The generator never finishes on its own; callers pull one value at a time
    with ``next()``.
    """
    next_value = start
    while True:
        yield next_value
        next_value += 1

# Create an instance of the counter generator
counter = batch_counter()

# Word2Vec starts without a vocabulary; it is built and extended batch by
# batch further down.
word2vec_model = Word2Vec(vector_size=100, window=5, min_count=2, workers=4)

# MiniBatchKMeans is updated incrementally via partial_fit on each news batch.
best_k = 70
mini_batch_kmeans = MiniBatchKMeans(n_clusters=best_k, batch_size=500, random_state=42)

# First full pass over the news file: collect every article's text so the
# TF-IDF vocabulary is global rather than per batch.
print("Building TF-IDF vocabulary...")
all_text = []
for news_df in news_df_iterator:
    # One text field per article: category, subcategory, title and abstract.
    combined_text = (
        news_df['Category'] + " " + news_df['Subcategory'] + " "
        + news_df['Title'] + " " + news_df['Abstract']
    )
    news_df['Text'] = combined_text
    all_text.extend(combined_text.values)

# fit() returns the vectorizer itself, so construction and fitting are chained.
vectorizer = TfidfVectorizer(max_features=5000).fit(all_text)
print("TF-IDF vocabulary built successfully.")

# Persist the fitted vectorizer so the same vocabulary can be reused later.
joblib.dump(vectorizer, os.path.join(save_dir, 'tfidf_vectorizer.pkl'))
print("TF-IDF model saved successfully.")

# The pass above exhausted the news reader; open a fresh one for batch processing.
news_df_iterator = pd.read_csv(news_file_path, names=news_columns, sep=',', chunksize=batch_size)

# ----------- Step 5: Model Averaging Variables -----------
# Running sums of the ALS latent factors, keyed by encoded id.
user_factors_dict = {}
item_factors_dict = {}
# How many successful ALS fits each id appeared in; this is the divisor for
# that id's average (an id seen in 2 batches must be divided by 2, not by the
# total number of batches).
user_factor_counts = {}
item_factor_counts = {}
batch_count = 0  # number of batches whose ALS fit succeeded


def get_article_embedding(text):
    """Return the mean Word2Vec vector of the words in ``text``.

    Words that are not in the current ``word2vec_model`` vocabulary are
    skipped. When none of the words is known, a zero vector with the model's
    dimensionality is returned.
    """
    word_vecs = [word2vec_model.wv[word] for word in text.split() if word in word2vec_model.wv]
    if not word_vecs:
        return np.zeros(word2vec_model.wv.vector_size, dtype=np.float32)
    return np.mean(word_vecs, axis=0).astype(np.float32)


def _explode_interactions(behavior_batch, list_column, item_column):
    """Return one encoded (user, news) row per id listed in ``list_column``.

    The comma-separated ``list_column`` is split and exploded into
    ``item_column``. Rows without a news id are dropped *before* encoding:
    ``astype(str)`` would otherwise turn NaN into the string ``'nan'``, a label
    the news encoder was not fitted on, and ``transform`` would raise.
    """
    interactions = (
        behavior_batch[['User ID', list_column]]
        .assign(**{item_column: behavior_batch[list_column].str.split(',')})
        .explode(item_column)
        .drop(columns=list_column)
        .dropna(subset=[item_column])
        .copy()
    )
    interactions[item_column] = news_encoder.transform(interactions[item_column].astype(str))
    interactions['User ID'] = user_encoder.transform(interactions['User ID'])
    return interactions


def _accumulate_factors(factor_rows, factor_sums, factor_counts):
    """Add each row's latent vector to ``factor_sums`` and count its occurrence."""
    for row in factor_rows:
        key = row['id']
        features = np.array(row['features'], dtype=np.float32)
        if key in factor_sums:
            factor_sums[key] += features
            factor_counts[key] += 1
        else:
            factor_sums[key] = features
            factor_counts[key] = 1


# The estimator holds only hyper-parameters, so one instance serves every batch.
als = ALS(userCol="User ID", itemCol="News ID", ratingCol="rating", implicitPrefs=True, coldStartStrategy="drop",
          rank=10, maxIter=10, regParam=0.1)

# ----------- Step 6: Process Each Batch -----------
# zip_longest (instead of zip) keeps iterating until BOTH readers are
# exhausted. With plain zip the loop stopped as soon as the shorter file ran
# out, so the remaining chunks of the longer file were never used for training.
# The side that has run out yields None and its section is skipped.
for news_df, behavior_df in itertools.zip_longest(news_df_iterator, behavior_df_iterator):

    batch_number = next(counter)
    print(f"Processing batch number: {batch_number}")

    if news_df is not None:
        # ----------- Preprocess the News Dataset (for Content-Based Filtering) -----------
        news_df['Text'] = news_df['Category'] + " " + news_df['Subcategory'] + " " + news_df['Title'] + " " + news_df['Abstract']

        # Transform the text using the pre-built TF-IDF vectorizer (global vocabulary)
        tfidf_matrix = vectorizer.transform(news_df['Text'])
        # NOTE(review): this within-batch similarity matrix is not read again
        # anywhere in this script -- confirm whether it is still needed.
        cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

        # ----------- Word2Vec for News Embedding (Initial Training and Incremental Training) -----------
        sentences = [text.split() for text in news_df['Text']]
        if batch_number == 0:
            word2vec_model.build_vocab(sentences)
        else:
            word2vec_model.build_vocab(sentences, update=True)

        word2vec_model.train(sentences, total_examples=len(sentences), epochs=5)

        news_df['Article Embedding'] = news_df['Text'].apply(get_article_embedding)

        # ----------- Mini-Batch KMeans Clustering (Incremental Update) -----------
        news_embeddings = np.vstack(news_df['Article Embedding'].values)
        mini_batch_kmeans.partial_fit(news_embeddings)

        joblib.dump(mini_batch_kmeans, os.path.join(save_dir, 'mini_batch_kmeans_news_model.pkl'))
        print("Mini-Batch KMeans Model Updated and Saved successfully")

    if behavior_df is not None:
        # ----------- ALS (Collaborative Filtering) for Clicked and Not-Clicked News -----------
        clicked_df = _explode_interactions(behavior_df, 'Clicked News IDs', 'Clicked_News')
        not_clicked_df = _explode_interactions(behavior_df, 'Not-Clicked News IDs', 'Not_Clicked_News')

        # Clicked impressions get rating 1.0, not-clicked ones 0.0.
        clicked_spark_df = spark.createDataFrame(clicked_df).withColumn('rating', lit(1.0))
        not_clicked_spark_df = spark.createDataFrame(not_clicked_df).withColumn('rating', lit(0.0))

        combined_behavior_spark_df = clicked_spark_df.union(
            not_clicked_spark_df.withColumnRenamed('Not_Clicked_News', 'Clicked_News')
        ).withColumnRenamed('Clicked_News', 'News ID')

        # Repartition the DataFrame to increase parallelism and improve performance
        combined_behavior_spark_df = combined_behavior_spark_df.repartition(200)

        # Best effort per batch: a failed fit is reported and the loop goes on.
        try:
            als_model = als.fit(combined_behavior_spark_df)
            user_factors = als_model.userFactors.collect()
            item_factors = als_model.itemFactors.collect()

            # Accumulate user and item factors (sum + per-id occurrence count)
            _accumulate_factors(user_factors, user_factors_dict, user_factor_counts)
            _accumulate_factors(item_factors, item_factors_dict, item_factor_counts)

            batch_count += 1
            print("ALS Model for batch trained successfully")
        except Exception as e:
            print(f"Failed to train ALS model for batch {batch_number}: {e}")

    if news_df is not None:
        # Word2Vec only changes when a news batch was processed, so it is only
        # re-saved in that case.
        word2vec_model.save(os.path.join(save_dir, 'word2vec_model.model'))
        print("Word2Vec Model Updated and Saved successfully")

    print(f"Finished processing batch number: {batch_number}\n")

# ----------- Step 7: Average Latent Factors and Save Final Model -----------
if batch_count > 0:
    # Average each id's factors over the batches in which that id occurred.
    # NOTE(review): every batch is a separate ALS fit, so the latent spaces of
    # different batches are presumably not aligned with each other -- confirm
    # that averaging factors across fits is the intended way to combine them.
    user_factors_avg = {
        user_id: features / user_factor_counts[user_id]
        for user_id, features in user_factors_dict.items()
    }
    item_factors_avg = {
        item_id: features / item_factor_counts[item_id]
        for item_id, features in item_factors_dict.items()
    }

    # Save user and item factors separately
    user_factors_avg_df = spark.createDataFrame(
        [(user_id, [float(f) for f in features]) for user_id, features in user_factors_avg.items()],
        ["user_id", "user_features"],
    )
    item_factors_avg_df = spark.createDataFrame(
        [(item_id, [float(f) for f in features]) for item_id, features in item_factors_avg.items()],
        ["item_id", "item_features"],
    )

    user_factors_avg_df.write.mode("overwrite").parquet(os.path.join(save_dir, 'als_user_factors_avg.parquet'))
    item_factors_avg_df.write.mode("overwrite").parquet(os.path.join(save_dir, 'als_item_factors_avg.parquet'))

    print("Averaged ALS User and Item Factors Saved successfully")
    print("All Models Updated and Saved successfully")

print("Training Completed and models were saved successfully")
# ----------- End -----------

24/10/22 11:15:21 WARN Utils: Your hostname, Naifs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.20.10.2 instead (on interface en0)
24/10/22 11:15:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/n7/.ivy2/cache
The jars for the packages stored in: /Users/n7/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-76db8f6e-cae7-4d3a-9f90-9f16896355cf;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.0 in central


:: loading settings :: url = jar:file:/Users/n7/Programs/anaconda3/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.amazonaws#aws-java-sdk-bundle;1.11.375 in central
:: resolution report :: resolve 89ms :: artifacts dl 2ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.375 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-76db8f6e-cae7-4d3a-9f90-9f16896355cf
	confs: [default]
	0 artifacts copied, 2 already retrieved (0kB/3ms)
24/10/22 11:15:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
S

Spark session created successfully!
Building TF-IDF vocabulary...
TF-IDF vocabulary built successfully.
TF-IDF model saved successfully.
Processing batch number: 0




Mini-Batch KMeans Model Updated and Saved successfully


24/10/22 11:16:07 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/22 11:16:07 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 0

Processing batch number: 1
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 1

Processing batch number: 2
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 2

Processing batch number: 3
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 3

Processing batch number: 4
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 4

Processing batch number: 5
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 5

Processing batch number: 6
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 6

Processing batch number: 7
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 7

Processing batch number: 8
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 8

Processing batch number: 9
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 9

Processing batch number: 10
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 10

Processing batch number: 11
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 11

Processing batch number: 12
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 12

Processing batch number: 13
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 13

Processing batch number: 14
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 14

Processing batch number: 15
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 15

Processing batch number: 16
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 16

Processing batch number: 17
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 17

Processing batch number: 18
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 18

Processing batch number: 19
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 19

Processing batch number: 20
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 20

Processing batch number: 21
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 21

Processing batch number: 22
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 22

Processing batch number: 23
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 23

Processing batch number: 24
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 24

Processing batch number: 25
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 25

Processing batch number: 26
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 26

Processing batch number: 27
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 27

Processing batch number: 28
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 28

Processing batch number: 29
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 29

Processing batch number: 30
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 30

Processing batch number: 31
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 31

Processing batch number: 32
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 32

Processing batch number: 33
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 33

Processing batch number: 34
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 34

Processing batch number: 35
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 35

Processing batch number: 36
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 36

Processing batch number: 37
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 37

Processing batch number: 38
Mini-Batch KMeans Model Updated and Saved successfully


                                                                                

ALS Model for batch trained successfully
Word2Vec Model Updated and Saved successfully
Finished processing batch number: 38



                                                                                

Averaged ALS User and Item Factors Saved successfully
All Models Updated and Saved successfully
Training Completed and models were saved successfully
