# In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.ml.recommendation import ALSModel
import joblib
from functools import partial
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import lit
import os

# ----------- Initialize Spark Session -----------
# A single session is created here and reused by every Spark call below.
spark = SparkSession.builder.appName("NewsRecommendationGUI").getOrCreate()

# ----------- Step 1: Load Pre-trained Models -----------

# Directory that holds every persisted model artifact
save_dir = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Machine Learning Codes/Trained Models'

# NOTE(security): joblib.load (and gensim's load) unpickle arbitrary Python
# objects. Only point save_dir at artifacts produced by a trusted training run.

# Load the TF-IDF Vectorizer model
vectorizer = joblib.load(f'{save_dir}/tfidf_vectorizer.pkl')
print("TF-IDF Vectorizer model loaded successfully.")

# Load the Word2Vec model
word2vec_model = Word2Vec.load(f'{save_dir}/word2vec.model')
print("Word2Vec model loaded successfully.")

# Load the KMeans clustering model
kmeans_news = joblib.load(f'{save_dir}/kmeans_news_model.pkl')
print("KMeans clustering model loaded successfully.")

# Load the ALS model using Spark. The session created above is reused: calling
# the builder again with another appName would only hand back that same
# session, so no second getOrCreate() is needed here.
als_model_path = f'{save_dir}/best_als_model'
best_als_model = ALSModel.load(als_model_path)
print("ALS model loaded successfully.")

# ----------- Step 2: Load Validation and Test Datasets -----------

# Column layouts shared by the validation and test splits.
# NOTE(review): `names=` is passed without `header=`, so pandas treats the
# first line of each file as data. If the cleaned CSVs were written with a
# header row, that row ends up as the first record -- confirm against the files.
_news_columns = [
    "News ID", "Category", "Subcategory", "Title", "Abstract", "URL",
    "Entities Mentioned", "Entities in Abstract",
]
_behavior_columns = [
    "Impression ID", "User ID", "Timestamp", "Displayed News List",
    "Impression List (Clicked Status)", "Impression Dictionary",
    "Clicked News IDs", "Not-Clicked News IDs",
]

# Validation split (MINDlarge_dev)
validation_news_df = pd.read_csv(
    '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_dev/Cleaned Dataset/News_cleaned.csv',
    sep=',',
    names=list(_news_columns),
)
validation_behavior_df = pd.read_csv(
    '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_dev/Cleaned Dataset/cleaned_behavior_dataset.csv',
    sep=',',
    names=list(_behavior_columns),
)

# Test split (MINDlarge_test)
test_news_df = pd.read_csv(
    '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_test/Cleaned Dataset/News_cleaned.csv',
    sep=',',
    names=list(_news_columns),
)
test_behavior_df = pd.read_csv(
    '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_test/Cleaned Dataset/cleaned_behavior_dataset.csv',
    sep=',',
    names=list(_behavior_columns),
)

# ----------- Step 3: Preprocess Validation and Test Datasets -----------

def _explode_interactions(behavior_df, source_column, rating):
    """Turn one comma-separated ID column into (User ID, News ID, rating) rows."""
    # Rows without any IDs would otherwise explode into a literal 'nan' ID.
    pairs = behavior_df[['User ID', source_column]].dropna(subset=[source_column])
    pairs = pairs.assign(**{'News ID': pairs[source_column].astype(str).str.split(',')})
    pairs = pairs.explode('News ID').drop(columns=source_column)
    pairs['News ID'] = pairs['News ID'].astype(str).str.strip()
    # An empty cell splits into [''], which is not a real news ID.
    pairs = pairs[pairs['News ID'] != '']
    return pairs.assign(rating=rating)


def preprocess_datasets(news_df, behavior_df, vectorizer, word2vec_model, kmeans_news):
    """
    Derive the features and interaction table used by the recommenders.

    Parameters:
    - news_df: News DataFrame with 'Category', 'Subcategory', 'Title' and 'Abstract'
      columns. It is modified in place: 'Text', 'Article Embedding' and
      'News Cluster' columns are added.
    - behavior_df: Behavior DataFrame with 'User ID', 'Clicked News IDs' and
      'Not-Clicked News IDs' columns (comma-separated ID strings).
    - vectorizer: Pre-trained TF-IDF vectorizer (only `transform` is called).
    - word2vec_model: Pre-trained Word2Vec model (its `wv` vectors are used).
    - kmeans_news: Pre-trained KMeans model (only `predict` is called).

    Returns:
    - (news_df, behavior_df, behavior_spark_df, cosine_sim_matrix) where
      behavior_spark_df holds one (User ID, News ID, rating) row per interaction
      (1.0 = clicked, 0.0 = not clicked) and cosine_sim_matrix is the pairwise
      cosine similarity of the TF-IDF rows.
    """
    # Missing fields are treated as empty text; a single NaN would otherwise
    # turn the whole concatenated string into NaN.
    text_parts = news_df[['Category', 'Subcategory', 'Title', 'Abstract']].fillna('').astype(str)
    news_df['Text'] = (
        text_parts['Category'] + " " + text_parts['Subcategory'] + " "
        + text_parts['Title'] + " " + text_parts['Abstract']
    )

    # Transform text using the pre-trained TF-IDF vectorizer.
    # NOTE(review): the pairwise matrix has one entry per pair of articles, so
    # its size grows quadratically with the number of rows in news_df.
    tfidf_matrix = vectorizer.transform(news_df['Text'])
    cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Word2Vec embeddings for news: mean of the vectors of in-vocabulary words.
    # The zero fallback takes its width from the model instead of assuming 100.
    embedding_dim = word2vec_model.wv.vector_size

    def get_article_embedding(text):
        word_vecs = [word2vec_model.wv[word] for word in text.split() if word in word2vec_model.wv]
        return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(embedding_dim)

    news_df['Article Embedding'] = news_df['Text'].apply(get_article_embedding)

    # Assign news clusters using the pre-trained KMeans model
    news_embeddings = np.vstack(news_df['Article Embedding'].values)
    news_df['News Cluster'] = kmeans_news.predict(news_embeddings)

    # Prepare behavior data for ALS: combine clicked and not-clicked news IDs
    spark = SparkSession.builder \
        .appName("NewsRecommendationValidation") \
        .getOrCreate()

    print("Spark session initialized for validation.")

    # Clicked items are positive interactions (1.0), not-clicked are negative (0.0)
    clicked_df = _explode_interactions(behavior_df, 'Clicked News IDs', 1.0)
    not_clicked_df = _explode_interactions(behavior_df, 'Not-Clicked News IDs', 0.0)
    combined_behavior_df = pd.concat([clicked_df, not_clicked_df], ignore_index=True)

    # Convert combined behavior data to Spark DataFrame
    behavior_spark_df = spark.createDataFrame(combined_behavior_df)

    print("Behavior data prepared for ALS, including both clicked and not-clicked interactions.")

    return news_df, behavior_df, behavior_spark_df, cosine_sim_matrix


# ----------- Step 4: Recommendation Generation Function (Hybrid) -----------

def hybrid_recommendation(user_id, behavior_df, top_n_per_component=15):
    """
    Hybrid recommendation function that takes into account the user's clicked news articles
    and generates recommendations based on collaborative filtering, content-based filtering,
    and clustering.

    Parameters:
    - user_id: ID of the user for whom to generate recommendations.
    - behavior_df: The behavior dataset that contains 'User ID' and 'Clicked News IDs'
      (comma-separated ID strings).
    - top_n_per_component: Number of recommendations to generate per technique
      (per clicked article for the content-based and clustering techniques).

    Returns:
    - De-duplicated list of recommendations, in the order: collaborative,
      content-based, clustering. Users without any clicked article get the
      result of generate_new_user_recommendations instead.
    """
    # Step 1: Collect the clicked news IDs from every impression row of the user.
    # Missing (NaN) cells and empty tokens are skipped so they are never looked
    # up as if they were real news IDs.
    clicked_cells = behavior_df.loc[behavior_df['User ID'] == user_id, 'Clicked News IDs'].dropna()
    user_clicked_news = []
    already_listed = set()
    for cell in clicked_cells:
        for token in str(cell).split(','):
            news_id = token.strip()
            if news_id and news_id not in already_listed:
                already_listed.add(news_id)
                user_clicked_news.append(news_id)

    # If the user has not clicked on any news, handle it (new user scenario)
    if not user_clicked_news:
        print(f"No clicked articles found for User {user_id}. Providing recommendations without historical data.")
        return generate_new_user_recommendations(user_id)  # Function to handle new users

    # Step 2: Collaborative Filtering Recommendations (based on user_id)
    recommendations = list(collaborative_recommendations(user_id, top_n=top_n_per_component))

    # Step 3: Content-Based Recommendations (based on user's clicked news)
    for news_id in user_clicked_news:
        recommendations.extend(content_based_recommendations(news_id, top_n=top_n_per_component))

    # Step 4: Clustering-Based Recommendations (based on user's clicked news)
    for news_id in user_clicked_news:
        recommendations.extend(get_articles_in_same_cluster(news_id, top_n=top_n_per_component))

    # Step 5: Remove duplicates while preserving order (dicts keep insertion order)
    return list(dict.fromkeys(recommendations))

#--------Side Step:  Function to generate recommendations for new users--------

def generate_new_user_recommendations(user_id, top_n=15):
    """
    Produce fallback recommendations for a user with no click history.

    One article is drawn at random from every 'News Cluster' group of the
    module-level news_df, and the first top_n of those draws are returned.

    Parameters:
    - user_id: ID of the new user (only used in the log message).
    - top_n: Maximum number of recommendations to return.

    Returns:
    - List of news IDs, at most one per cluster and at most top_n long.
    """
    print(f"Generating recommendations for new user (User ID: {user_id})")

    def _draw_one(cluster_rows):
        # A single random representative of the cluster
        return cluster_rows.sample(n=1)

    one_per_cluster = news_df.groupby('News Cluster').apply(_draw_one)
    one_per_cluster = one_per_cluster.reset_index(drop=True)

    chosen_ids = one_per_cluster['News ID'].head(top_n)
    return chosen_ids.tolist()

#--------Side Step:  Function to generate recommendations based on collaborative filtering--------

def collaborative_recommendations(user_id, top_n=15):
    """
    Generate top N collaborative filtering recommendations for the given user using the pre-trained ALS model.

    Parameters:
    - user_id: The ID of the user for whom to generate recommendations.
    - top_n: The number of recommendations to generate.

    Returns:
    - List of recommended news IDs based on ALS predictions; an empty list when
      the model returns no row for this user.
    """
    # Wrap the single user ID in a one-row Spark DataFrame, as required by
    # recommendForUserSubset.
    # NOTE(review): the column names 'User ID' / 'News ID' are presumably the
    # userCol / itemCol the ALS model was trained with -- confirm against training.
    user_df = spark.createDataFrame([(user_id,)], ["User ID"])

    # Ask the ALS model for the top_n items of this user
    user_recommendations = best_als_model.recommendForUserSubset(user_df, top_n)

    # The result can be empty (no row for this user); indexing [0] blindly
    # would raise IndexError in that case.
    result_rows = user_recommendations.collect()
    if not result_rows:
        return []

    return [recommended['News ID'] for recommended in result_rows[0].recommendations]

#--------Side Step:  Function to generate recommendations based on content-based filtering--------

def content_based_recommendations(news_id, top_n=15):
    """
    Generate top N content-based recommendations for a given news article using cosine similarity.

    Reads the module-level news_df and cosine_sim_matrix; row i of the matrix
    is taken to describe the article at position i of news_df.

    Parameters:
    - news_id: The ID of the news article based on which recommendations will be generated.
    - top_n: The number of recommendations to generate.

    Returns:
    - List of recommended news IDs based on content similarity, most similar
      first and never containing news_id's own row. Empty when news_id is not
      in news_df or top_n is not positive.
    """
    # Locate the article by position (not by index label) so the value is
    # valid both as a matrix row and for .iloc.
    positions = np.flatnonzero((news_df['News ID'] == news_id).to_numpy())
    if positions.size == 0 or top_n <= 0:
        return []
    idx = int(positions[0])

    # Rank all articles by similarity, highest first; the stable sort keeps
    # the original row order among equal scores.
    scores = np.asarray(cosine_sim_matrix[idx]).ravel()
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query article explicitly: with tied scores (e.g. duplicated
    # articles) it is not guaranteed to be the first entry of the ranking.
    ranked = ranked[ranked != idx][:top_n]

    return news_df.iloc[ranked]['News ID'].tolist()

#--------Side Step:  Function to generate cluster-based recommendations--------

def get_articles_in_same_cluster(news_id, top_n=15):
    """
    Generate up to top N clustering-based recommendations for a given news article by
    randomly sampling articles from the same cluster.

    Reads the module-level news_df ('News ID' and 'News Cluster' columns).

    Parameters:
    - news_id: The ID of the news article based on which recommendations will be generated.
    - top_n: The maximum number of recommendations to generate.

    Returns:
    - List of recommended news IDs from the same cluster (fewer than top_n when the
      cluster is small; empty when news_id is unknown or has no cluster-mates).
    """
    is_query_article = news_df['News ID'] == news_id
    if not is_query_article.any():
        # Unknown article: there is no cluster to sample from
        return []

    # Cluster label of the given news_id (from the preprocessed dataset)
    cluster_label = news_df.loc[is_query_article, 'News Cluster'].iloc[0]

    # Every other article that shares the cluster
    same_cluster = news_df['News Cluster'] == cluster_label
    candidate_ids = news_df.loc[same_cluster & ~is_query_article, 'News ID']

    # Sampling without replacement cannot draw more rows than exist
    sample_size = min(top_n, len(candidate_ids))
    if sample_size <= 0:
        return []

    return candidate_ids.sample(n=sample_size, replace=False).tolist()


# ----------- Step 5: Evaluation Metrics -----------

def evaluate_recommendations(recommended_clicks, actual_clicks, recommended_not_clicks, actual_not_clicks):
    """
    Score predicted clicked / not-clicked sets against the ground truth.

    Parameters:
    - recommended_clicks: Set of news IDs that were predicted as clicked.
    - actual_clicks: Set of news IDs that were actually clicked.
    - recommended_not_clicks: Set of news IDs that were predicted as not-clicked.
    - actual_not_clicks: Set of news IDs that were actually not clicked.

    Returns:
    - Tuple (precision, recall, F1) for clicked followed by (precision, recall, F1)
      for not-clicked. A metric whose denominator would be zero is reported as 0.
    """
    def _precision_recall_f1(predicted, actual):
        # Precision: share of predictions that are correct
        if predicted:
            precision = len(predicted.intersection(actual)) / len(predicted)
        else:
            precision = 0

        # Recall: share of the truth that was predicted
        if actual:
            recall = len(predicted.intersection(actual)) / len(actual)
        else:
            recall = 0

        # F1: harmonic mean of the two, 0 when both are 0
        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0
        return precision, recall, f1

    clicked_metrics = _precision_recall_f1(recommended_clicks, actual_clicks)
    not_clicked_metrics = _precision_recall_f1(recommended_not_clicks, actual_not_clicks)
    return clicked_metrics + not_clicked_metrics


# ----------- Step 6: Predict and Evaluate on Validation Set -----------

def predict_clicks_for_user(user_id, displayed_news_list, behavior_df, top_n_per_component=5):
    """
    Predict whether a user will click on the news articles in their displayed news list.

    An article is predicted as clicked when it appears in the user's hybrid
    recommendations, and as not clicked otherwise.

    Parameters:
    - user_id: ID of the user
    - displayed_news_list: List of news articles displayed to the user (from Displayed News List)
    - behavior_df: Behavior dataset passed on to hybrid_recommendation
    - top_n_per_component: Number of recommendations per technique passed on to hybrid_recommendation

    Returns:
    - clicked_news_ids: List of news IDs predicted to be clicked by the user
    - not_clicked_news_ids: List of news IDs predicted to not be clicked by the user
    """
    clicked_news_ids = []
    not_clicked_news_ids = []

    if not displayed_news_list:
        return clicked_news_ids, not_clicked_news_ids

    # The recommendations depend only on the user, not on the displayed item,
    # so they are computed once per call instead of once per displayed article.
    # A set makes each membership test O(1).
    recommended_ids = set(hybrid_recommendation(
        user_id=user_id, behavior_df=behavior_df, top_n_per_component=top_n_per_component
    ))

    # Keep the displayed order in both output lists
    for news_id in displayed_news_list:
        if news_id in recommended_ids:
            clicked_news_ids.append(news_id)
        else:
            not_clicked_news_ids.append(news_id)

    return clicked_news_ids, not_clicked_news_ids


def _split_news_ids(cell):
    """Split a comma-separated ID cell into a list; missing or empty cells give []."""
    # NaN is truthy and has no .split, so it must be tested for explicitly.
    if not isinstance(cell, str):
        return []
    return [token.strip() for token in cell.split(',') if token.strip()]


def validate_predictions_on_validation_set(validation_behavior_df, validation_news_df):
    """
    Predict the clicked and not-clicked news for the users in the validation dataset, and compare
    the predictions to the actual clicked and not-clicked news for evaluation.

    Parameters:
    - validation_behavior_df: DataFrame containing the validation behavior dataset
      ('User ID', 'Displayed News List', 'Clicked News IDs', 'Not-Clicked News IDs').
    - validation_news_df: DataFrame containing the validation news dataset
      (accepted for interface compatibility; not read by this function).

    Returns:
    - Average precision, recall, and F1 scores for clicked news followed by the same
      three averages for not-clicked news (all 0.0 when the behavior frame is empty).
    """
    # One (precision, recall, f1, precision, recall, f1) tuple per impression row
    per_row_scores = []

    for _, row in validation_behavior_df.iterrows():
        user_id = row['User ID']
        displayed_news_list = _split_news_ids(row['Displayed News List'])

        # Predict clicks for this user and their displayed news.
        # NOTE(review): the same behavior frame that holds the ground-truth
        # clicks is handed to the predictor as click history -- confirm this
        # is intended.
        predicted_clicked_news_ids, predicted_not_clicked_news_ids = predict_clicks_for_user(
            user_id, displayed_news_list, validation_behavior_df
        )

        # Ground truth for this impression
        actual_clicked_news_ids = _split_news_ids(row['Clicked News IDs'])
        actual_not_clicked_news_ids = _split_news_ids(row['Not-Clicked News IDs'])

        per_row_scores.append(evaluate_recommendations(
            set(predicted_clicked_news_ids), set(actual_clicked_news_ids),
            set(predicted_not_clicked_news_ids), set(actual_not_clicked_news_ids)
        ))

    # Averaging nothing would yield NaN plus a runtime warning
    if not per_row_scores:
        return (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

    # Column-wise mean keeps the six metrics in their original order
    averages = np.mean(np.asarray(per_row_scores, dtype=float), axis=0)
    return tuple(averages)

# Preprocess validation datasets
validation_news_df, validation_behavior_df, behavior_spark_df_val, cosine_sim_matrix_val = preprocess_datasets(
    validation_news_df, validation_behavior_df, vectorizer, word2vec_model, kmeans_news
)

# The recommendation functions (content-based, cluster-based, new-user) read
# the module-level names `news_df` and `cosine_sim_matrix`. Bind them to the
# validation artifacts; without this the first recommendation call raises
# NameError.
news_df = validation_news_df
cosine_sim_matrix = cosine_sim_matrix_val

# Run the validation process for all users in the validation set
(avg_precision_clicks_val, avg_recall_clicks_val, avg_f1_clicks_val, 
 avg_precision_not_clicks_val, avg_recall_not_clicks_val, avg_f1_not_clicks_val) = validate_predictions_on_validation_set(
    validation_behavior_df, validation_news_df
)

# Display validation results
print(f'Validation Precision (Clicked): {avg_precision_clicks_val:.4f}')
print(f'Validation Recall (Clicked): {avg_recall_clicks_val:.4f}')
print(f'Validation F1-Score (Clicked): {avg_f1_clicks_val:.4f}')

print(f'Validation Precision (Not-Clicked): {avg_precision_not_clicks_val:.4f}')
print(f'Validation Recall (Not-Clicked): {avg_recall_not_clicks_val:.4f}')
print(f'Validation F1-Score (Not-Clicked): {avg_f1_not_clicks_val:.4f}')


# ----------- Step 7: Testing Phase -----------

def predict_clicks_for_user(user_id, displayed_news_list, top_n_per_component=5, behavior_df=None):
    """
    Predict whether a user will click on the news articles in their displayed news list.

    This definition replaces the validation-phase function of the same name
    from this point of the script onward.

    Parameters:
    - user_id: ID of the user
    - displayed_news_list: List of news articles displayed to the user (from Displayed News List)
    - top_n_per_component: Number of recommendations per technique passed on to hybrid_recommendation
    - behavior_df: Behavior dataset used as click history by hybrid_recommendation;
      defaults to the module-level test_behavior_df when omitted.

    Returns:
    - clicked_news_ids: List of news IDs predicted to be clicked by the user
    - not_clicked_news_ids: List of news IDs predicted to not be clicked by the user
    """
    clicked_news_ids = []
    not_clicked_news_ids = []

    if not displayed_news_list:
        return clicked_news_ids, not_clicked_news_ids

    # hybrid_recommendation requires a behavior frame and has no `news_id`
    # parameter, so the history frame is supplied here.
    if behavior_df is None:
        behavior_df = test_behavior_df

    # The recommendations depend only on the user, so compute them once rather
    # than once per displayed article; a set gives O(1) membership tests.
    recommended_ids = set(hybrid_recommendation(
        user_id=user_id, behavior_df=behavior_df, top_n_per_component=top_n_per_component
    ))

    # If the news_id is part of the recommendations, assume it is clicked
    for news_id in displayed_news_list:
        if news_id in recommended_ids:
            clicked_news_ids.append(news_id)
        else:
            not_clicked_news_ids.append(news_id)

    return clicked_news_ids, not_clicked_news_ids

def test_click_predictions(test_behavior_df):
    """
    Apply the click prediction process to all users in the test dataset.

    Parameters:
    - test_behavior_df: DataFrame containing the test behavior dataset
      ('User ID' and 'Displayed News List' are read).

    Returns:
    - The same test_behavior_df, with 'Clicked News IDs' and 'Not-Clicked News IDs'
      overwritten by the predicted, comma-separated IDs.
    """
    predicted_clicked = []
    predicted_not_clicked = []

    for _, row in test_behavior_df.iterrows():
        user_id = row['User ID']

        # A missing (NaN) cell has no .split; treat it as "nothing displayed"
        raw_displayed = row['Displayed News List']
        if isinstance(raw_displayed, str) and raw_displayed:
            displayed_news_list = raw_displayed.split(',')
        else:
            displayed_news_list = []

        # Predict clicks for this user and their displayed news
        clicked_news_ids, not_clicked_news_ids = predict_clicks_for_user(user_id, displayed_news_list)

        predicted_clicked.append(','.join(clicked_news_ids))
        predicted_not_clicked.append(','.join(not_clicked_news_ids))

    # Write the predictions only after every row has been processed, so the
    # click columns are not altered while predictions are still being made.
    test_behavior_df['Clicked News IDs'] = predicted_clicked
    test_behavior_df['Not-Clicked News IDs'] = predicted_not_clicked

    return test_behavior_df

# Preprocess Test Set
test_news_df, test_behavior_df, behavior_spark_df_test, cosine_sim_matrix_test = preprocess_datasets(
    test_news_df, test_behavior_df, vectorizer, word2vec_model, kmeans_news
)

# The recommendation functions read the module-level names `news_df` and
# `cosine_sim_matrix`; point them at the test artifacts so test articles are
# looked up in the test catalogue.
news_df = test_news_df
cosine_sim_matrix = cosine_sim_matrix_test

# Run the prediction process for all users in the test set
updated_test_behavior_df = test_click_predictions(test_behavior_df)

# Save the updated test behavior dataset with the predictions; create the
# output directory first so to_csv does not fail on a missing folder.
predictions_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_test/Behavior Predictions Test Results/updated_test_behavior_dataset.csv'
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
updated_test_behavior_df.to_csv(predictions_path, index=False)

print("Testing completed and predictions saved to 'updated_test_behavior_dataset.csv'.")


# ----------- end -----------