# In [None]:  (Jupyter notebook cell marker kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import lit
import os
import joblib

# ----------- Step 1: Load the News and Behavior Datasets -----------

# Column names are supplied explicitly for both files.
# NOTE(review): because `names=` is passed without `header=`, the first row of
# each file is read as data — confirm the cleaned CSVs carry no header row.
NEWS_COLUMNS = [
    "News ID", "Category", "Subcategory", "Title", "Abstract", "URL",
    "Entities Mentioned", "Entities in Abstract",
]
BEHAVIOR_COLUMNS = [
    "Impression ID", "User ID", "Timestamp", "Displayed News List",
    "Impression List (Clicked Status)", "Impression Dictionary",
    "Clicked News IDs", "Not-Clicked News IDs",
]

news_csv_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets/News_cleaned.csv'
behavior_csv_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets/cleaned_behavior_dataset.csv'

# One row per news article.
news_df = pd.read_csv(news_csv_path, sep=',', names=NEWS_COLUMNS)

# One row per impression log entry.
behavior_df = pd.read_csv(behavior_csv_path, sep=',', names=BEHAVIOR_COLUMNS)

# ----------- Step 2: Preprocess the News Dataset (for Content-Based Filtering) -----------

# Build one 'Text' field per article from Category, Subcategory, Title and Abstract.
# Missing values are replaced with empty strings first: concatenating a string
# with NaN yields NaN, and a NaN 'Text' breaks both the TF-IDF fit below and the
# later `text.split()` calls. Rows without missing values are unaffected.
text_parts = news_df[['Category', 'Subcategory', 'Title', 'Abstract']].fillna('')
news_df['Text'] = (
    text_parts['Category'] + " " + text_parts['Subcategory'] + " "
    + text_parts['Title'] + " " + text_parts['Abstract']
)

# TF-IDF vectorization for content-based recommendations
# (vocabulary limited to the 5000 highest-ranked terms via max_features).
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(news_df['Text'])

# Pairwise cosine similarity between every pair of articles.
# NOTE(review): the result has n_articles x n_articles entries, so memory grows
# quadratically with the number of articles — confirm this fits in RAM for the
# full dataset, or compute similarities per query instead.
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# ----------- Step 3: Word2Vec for News Embedding (Advanced Content-Based) -----------

# Every article's 'Text' is split on whitespace and used as one training "sentence".
sentences = []
for article_text in news_df['Text']:
    sentences.append(article_text.split())

# Train 100-dimensional word vectors; words seen fewer than 2 times are dropped.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# Function to create embeddings for each article by averaging word embeddings
def get_article_embedding(text):
    """
    Return the average Word2Vec vector of the words in `text`.

    The text is split on whitespace; words that are not in the vocabulary of
    the module-level `word2vec_model` are ignored.

    Args:
        text: Article text. Any non-string value (e.g. NaN for a missing
            entry) is treated as empty text.

    Returns:
        numpy.ndarray: element-wise mean of the known word vectors, or an
        all-zero vector with the model's dimensionality when no word of
        `text` is in the vocabulary.
    """
    keyed_vectors = word2vec_model.wv
    words = text.split() if isinstance(text, str) else []
    word_vecs = [keyed_vectors[word] for word in words if word in keyed_vectors]
    if not word_vecs:
        # Size the fallback from the model rather than a literal 100 so the
        # embeddings stay stackable if the model's vector_size is changed.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(word_vecs, axis=0)

# Compute one averaged word-vector embedding per article from its 'Text'
# and keep it alongside the article in a new column.
article_texts = news_df['Text']
news_df['Article Embedding'] = article_texts.apply(get_article_embedding)

# ----------- Step 4: KMeans Clustering with Best K Value Automatically Selected -----------

def find_best_k(data, max_k=15, sample_size=None, random_state=42):
    """
    Determine the best number of clusters using the Silhouette Score.

    A KMeans model is fitted for every k from 2 up to `max_k` and the k with
    the highest silhouette score is returned. On ties the smallest k wins.

    Args:
        data: 2-D array-like of shape (n_samples, n_features) to cluster.
        max_k: Largest number of clusters to try. It is capped at
            n_samples - 1, the largest cluster count for which a silhouette
            score is defined.
        sample_size: Optional number of samples used to estimate the
            silhouette score. The default (None) uses every sample, which
            requires all pairwise distances; pass a value to keep the
            computation tractable on large datasets.
        random_state: Seed for KMeans and for the silhouette subsampling.

    Returns:
        int: the selected number of clusters (2 if no candidate was tried).

    Raises:
        ValueError: if `data` has fewer than 3 samples, so that no k has a
            defined silhouette score.
    """
    n_samples = np.shape(data)[0]
    if max_k >= 2 and n_samples < 3:
        raise ValueError(
            "find_best_k needs at least 3 samples to compute a silhouette "
            "score, got %d" % n_samples
        )

    # Silhouette is only defined for 2 <= k <= n_samples - 1.
    upper_k = min(max_k, n_samples - 1)

    best_k = 2  # Returned unchanged when the candidate range is empty
    best_score = -1  # Lowest possible silhouette score

    for k in range(2, upper_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        labels = kmeans.fit_predict(data)
        score = silhouette_score(
            data, labels, sample_size=sample_size, random_state=random_state
        )
        if score > best_score:
            best_score = score
            best_k = k

    return best_k

# Prepare news embeddings for clustering:
# stack the per-article vectors into a single 2-D array, one row per article.
news_embeddings = np.vstack(news_df['Article Embedding'].values)

# Automatically select the best K using Silhouette Score
best_k = find_best_k(news_embeddings)

# Train KMeans with the best K and record each article's cluster label
kmeans_news = KMeans(n_clusters=best_k, random_state=42)
news_df['News Cluster'] = kmeans_news.fit_predict(news_embeddings)

# Save KMeans model for future use
save_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Machine Learning Codes/Trained Models'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without the race of a separate existence check.
os.makedirs(save_dir, exist_ok=True)

joblib.dump(kmeans_news, os.path.join(save_dir, 'kmeans_news_model.pkl'))

# ----------- Step 5: ALS (Collaborative Filtering) for Clicked and Not-Clicked News -----------

# Initialize Spark for ALS
spark = SparkSession.builder \
    .appName("NewsRecommendationALS") \
    .getOrCreate()


def _explode_interactions(source_df, list_column, rating):
    """
    Return one ('User ID', 'News ID', 'rating') row per news ID in `list_column`.

    Rows with a missing user or a missing list are dropped, and empty entries
    produced by the split are removed, so no placeholder items such as 'nan'
    or '' reach the model.
    """
    # NOTE(review): assumes each cell is a plain comma-separated list of news
    # IDs — confirm against the cleaned behavior file.
    pairs = (
        source_df[['User ID', list_column]]
        .dropna()
        .rename(columns={list_column: 'News ID'})
    )
    pairs['News ID'] = pairs['News ID'].astype(str).str.split(',')
    pairs = pairs.explode('News ID')
    pairs['News ID'] = pairs['News ID'].str.strip()
    pairs = pairs[pairs['News ID'] != ''].copy()
    pairs['rating'] = float(rating)
    return pairs


# Prepare behavior data for ALS with pandas (these are pandas DataFrames, so
# Spark-only calls such as withColumn/union cannot be used on them).
# Clicked news are positive interactions (rating 1.0) ...
clicked_df = _explode_interactions(behavior_df, 'Clicked News IDs', 1.0)

# ... and not-clicked news are negative interactions (rating 0.0).
not_clicked_df = _explode_interactions(behavior_df, 'Not-Clicked News IDs', 0.0)

# Combine both clicked and not-clicked data; both frames share the same columns.
combined_behavior_df = pd.concat([clicked_df, not_clicked_df], ignore_index=True)

# Spark ALS requires numeric user/item ids, so map every distinct ID to an
# integer code. `user_index[code]` / `news_index[code]` give back the original ID.
user_codes, user_index = pd.factorize(combined_behavior_df['User ID'])
news_codes, news_index = pd.factorize(combined_behavior_df['News ID'])
combined_behavior_df['user_idx'] = user_codes.astype('int32')
combined_behavior_df['news_idx'] = news_codes.astype('int32')

# Persist the code -> ID lookups; without them the model's integer ids cannot
# be translated back to users and articles.
joblib.dump(user_index, os.path.join(save_dir, 'als_user_index.pkl'))
joblib.dump(news_index, os.path.join(save_dir, 'als_news_index.pkl'))

# Convert combined data to Spark DataFrame
behavior_spark_df = spark.createDataFrame(combined_behavior_df)

# ALS model initialization (trained on the integer code columns)
als = ALS(userCol="user_idx", itemCol="news_idx", ratingCol="rating", implicitPrefs=True, coldStartStrategy="drop")

# Hyperparameter grid search for ALS: 3 ranks x 3 regParams x 2 maxIters
paramGrid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 50, 100]) \
    .addGrid(als.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(als.maxIter, [10, 20]) \
    .build()

# NOTE(review): with implicitPrefs=True the predictions are preference scores
# rather than reconstructed ratings — confirm RMSE against the 0/1 ratings is
# the intended model-selection metric.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
crossval = CrossValidator(estimator=als, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)

# Fit ALS model with cross-validation
cv_model = crossval.fit(behavior_spark_df)
best_als_model = cv_model.bestModel

# Save ALS model; overwrite() lets the script be re-run when the path already exists.
best_als_model.write().overwrite().save(os.path.join(save_dir, 'best_als_model'))

# Persist the fitted TF-IDF vectorizer and the Word2Vec model into save_dir
# so they can be reloaded later without refitting.
tfidf_vectorizer_path = os.path.join(save_dir, 'tfidf_vectorizer.pkl')
word2vec_path = os.path.join(save_dir, 'word2vec.model')

joblib.dump(vectorizer, tfidf_vectorizer_path)
word2vec_model.save(word2vec_path)


# ----------- end -----------