In [2]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux

In [3]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate
from surprise import accuracy
from joblib import Parallel, delayed

# Seed every RNG this notebook draws from. NumPy drives the interaction and tag
# sampling, while the stdlib `random` module picks each post's content; seeding
# only NumPy would leave the generated post content different on every run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Size of the synthetic dataset
num_users = 100
num_posts = 500

# Generate user and post IDs (1-based, contiguous)
user_ids = np.arange(1, num_users + 1)
post_ids = np.arange(1, num_posts + 1)

# Vocabulary from which each post's tags are sampled
tags_list = ['politics', 'sports', 'technology', 'movies', 'science', 'health', 'fashion']

# Pool of sample sentences used as synthetic post text; each generated post's
# `content` is drawn at random from this list.
post_content_samples = [
    "The latest breakthrough in AI is set to change the industry.",
    "A thrilling sports match ended with an unexpected twist!",
    "Scientists have discovered a new planet that may support life.",
    "The upcoming movie is getting fantastic early reviews.",
    "Health experts discuss the importance of mental well-being.",
    "A guide to the best travel destinations for food lovers.",
    "The history behind one of the most famous ancient civilizations.",
    "New gaming consoles are redefining the entertainment experience.",
    "An inside look at the fashion trends for the next season.",
    "Top musicians collaborate for a charity event.",
    "A revolutionary way to combat climate change has emerged.",
    "New studies suggest a healthy diet improves productivity.",
    "The most anticipated movie of the year is finally here.",
    "Exploring the wonders of deep-sea exploration.",
    "The role of AI in shaping modern healthcare systems.",
    "The top 5 gadgets you need for 2025.",
    "How to travel on a budget without missing out.",
    "The future of space exploration and human colonization.",
    "The impact of social media on modern relationships.",
    "Fashion trends to expect this winter season.",
    "Understanding the complexities of modern politics.",
    "How technology is transforming education for the better.",
    "The latest developments in the tech industry.",
    "Exploring the intersection of art and technology.",
    "The top fitness trends to try this year.",
    "Mindfulness techniques that can reduce stress levels.",
    "The best food destinations around the world.",
    "A deep dive into the most famous scientific experiments.",
    "The influence of historical events on today's society.",
    "Sports and mental health: A connection worth exploring.",
    "Future trends in environmental sustainability.",
    "How virtual reality is revolutionizing gaming.",
    "Top 10 places to visit before you die.",
    "The challenges of modern medicine in the 21st century.",
    "The effects of global warming on wildlife.",
    "Exploring the role of women in science and technology.",
    "How to stay active while working from home.",
    "Understanding the basics of cryptocurrency.",
    "The importance of sleep in maintaining health.",
    "How to build a successful online business."
]

# Build the synthetic user-post interaction log.
# Interaction codes: 0 = no interaction, 1 = like, 2 = comment, 3 = share,
# drawn with probabilities 40% / 30% / 20% / 10%.
interaction_levels = [0, 1, 2, 3]
interaction_probs = [0.4, 0.3, 0.2, 0.1]

interactions = []
for user_id in user_ids:
    # randint's upper bound is exclusive, so every user touches 5 to 19 distinct posts.
    posts_for_user = np.random.choice(post_ids, size=np.random.randint(5, 20), replace=False)
    for post_id in posts_for_user:
        interaction_value = np.random.choice(interaction_levels, p=interaction_probs)
        interactions.append((user_id, post_id, interaction_value))

# One row per (user, post) pair
interaction_data = pd.DataFrame(interactions, columns=["user_id", "post_id", "interaction"])

# Post metadata: 2 or 3 distinct tags (comma-separated) plus one sample sentence per post.
# All tags are sampled before any content, so each RNG is consumed in a fixed order.
post_tags = [
    ', '.join(np.random.choice(tags_list, size=np.random.randint(2, 4), replace=False))
    for _ in post_ids
]
post_contents = [random.choice(post_content_samples) for _ in post_ids]
post_data = pd.DataFrame({
    "post_id": post_ids,
    "tags": post_tags,
    "content": post_contents,
})

# Persist both tables to CSV (optional)
interaction_data.to_csv("user_interactions.csv", index=False)
post_data.to_csv("post_tags_content.csv", index=False)

# Show the first and last rows of each table
previews = [
    ("User-Post Interactions (Top 5 Rows):", interaction_data.head()),
    ("User-Post Interactions (Top 5 tails):", interaction_data.tail()),
    ("\nPost Data (Top 5 Rows):", post_data.head()),
    ("\nPost Data (Top 5 tails):", post_data.tail()),
]
for heading, frame in previews:
    print(heading)
    print(frame)

# Reload both tables from the CSV files just written
interaction_data = pd.read_csv("user_interactions.csv")
post_data = pd.read_csv("post_tags_content.csv")

# Wrap the interaction table in a Surprise dataset; the "rating" is the 0-3 interaction code.
reader = Reader(rating_scale=(0, 3))
rating_columns = ['user_id', 'post_id', 'interaction']
data = Dataset.load_from_df(interaction_data[rating_columns], reader)

# SVD hyperparameter grid: three values for each of four parameters,
# scored by RMSE with 3-fold cross-validation.
param_grid = dict(
    n_factors=[50, 100, 200],
    n_epochs=[30, 50, 70],
    lr_all=[0.0001, 0.0005, 0.001],
    reg_all=[0.2, 0.3, 0.4],
)

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

# Report the winning configuration and keep the corresponding estimator
best_rmse = gs.best_score['rmse']
best_params = gs.best_params['rmse']
print(f"Best RMSE: {best_rmse}")
print(f"Best Parameters: {best_params}")
best_svd = gs.best_estimator['rmse']

# Re-evaluate the selected estimator on fresh 3-fold splits (RMSE and MAE)
cv_results = cross_validate(best_svd, data, measures=['rmse', 'mae'], cv=3, verbose=True)
print(f"CV Results: {cv_results}")

# Final fit on every available interaction; this is the model used for recommendations
trainset = data.build_full_trainset()
best_svd.fit(trainset)

# Post-to-post content similarity from TF-IDF features
def advanced_content_based_filtering(post_data):
    """Return the pairwise cosine-similarity matrix between posts.

    Each post's text is its `tags` string and its `content` string joined by a
    single space. The texts are TF-IDF vectorised with English stop words
    removed, and every post vector is compared with every other.

    Args:
        post_data: DataFrame with string columns `tags` and `content`.

    Returns:
        The result of `cosine_similarity` on the TF-IDF matrix with itself;
        row/column order follows the row order of `post_data`.
    """
    post_text = post_data['tags'] + " " + post_data['content']
    vectorizer = TfidfVectorizer(stop_words='english')
    features = vectorizer.fit_transform(post_text)
    return cosine_similarity(features, features)

# Precompute the post-to-post cosine similarity matrix once, up front.
# `hybrid_recommendation` reads this module-level `content_sim` and passes it to
# the content-based and diversification helpers instead of recomputing it per user.
content_sim = advanced_content_based_filtering(post_data)

# Get content-based recommendations
def get_content_based_recommendations(user_interactions, post_data, cosine_sim, num_recommendations=5):
    """Return unseen posts most similar to anything the user has interacted with.

    Every post the user has not interacted with is scored by its highest cosine
    similarity to any interacted post, and the best-scoring posts are returned.
    Scoring against the whole history (rather than filling the quota from the
    first interacted post) keeps one post from dominating the candidates, and
    the cap below is enforced over the full result.

    Args:
        user_interactions: Iterable of post ids the user already interacted with.
        post_data: DataFrame with a `post_id` column; row order must match the
            row/column order of `cosine_sim`.
        cosine_sim: Square post-to-post similarity matrix.
        num_recommendations: Requested number of recommendations; up to twice
            this many candidates are returned so callers can re-rank.

    Returns:
        A list of at most ``num_recommendations * 2`` distinct post ids, best
        match first. Empty when none of the interacted posts are in `post_data`.
    """
    post_ids = post_data['post_id'].tolist()

    # Map post id -> row position (first occurrence), so matrix rows are addressed
    # by position and never by the DataFrame's index labels.
    position_of = {}
    for position, pid in enumerate(post_ids):
        position_of.setdefault(pid, position)

    seen = set(user_interactions)
    source_positions = [position_of[pid] for pid in user_interactions if pid in position_of]
    if not source_positions:
        return []

    # Best similarity of every post to any post in the user's history.
    best_similarity = np.asarray(cosine_sim)[source_positions].max(axis=0)

    limit = num_recommendations * 2  # Get more candidates
    candidates = []
    # Stable sort keeps the original post order among equally similar posts.
    for position in np.argsort(-best_similarity, kind='stable'):
        if len(candidates) >= limit:
            break
        pid = post_ids[position]
        if pid in seen:
            continue
        candidates.append(pid)
        seen.add(pid)  # guards against duplicate post ids in post_data
    return candidates

# Cold start handling: fall back to the most popular posts
def cold_start_recommendation(post_data, num_recommendations=5):
    """Return the ids of the posts with the most positive interactions.

    Popularity is the number of rows in the module-level `interaction_data`
    with `interaction > 0` for each post. `post_data` is accepted for interface
    symmetry but is not used.

    Args:
        post_data: Unused.
        num_recommendations: Maximum number of post ids to return.

    Returns:
        A list of post ids, most frequently engaged first.
    """
    engaged = interaction_data['interaction'] > 0
    engagement_counts = interaction_data.loc[engaged, 'post_id'].value_counts()
    return engagement_counts.index.tolist()[:num_recommendations]

# Diversify recommendations using Maximal Marginal Relevance (MMR)
def diversify_recommendations_mmr(recommendations, post_data, cosine_sim, lambda_param=0.5):
    """Reorder a ranked recommendation list, trading relevance against redundancy.

    Posts are picked greedily. At each step the remaining post with the highest

        lambda_param * relevance - (1 - lambda_param) * max_similarity_to_selected

    is chosen. Relevance comes from the position in `recommendations` (first
    item = 1.0, decreasing linearly), so the incoming ranking is respected
    instead of being discarded. ``lambda_param=1`` returns the input order
    unchanged; ``lambda_param=0`` picks purely for diversity.

    Args:
        recommendations: Post ids ordered from most to least relevant. Not mutated.
        post_data: DataFrame with a `post_id` column; row order must match the
            row/column order of `cosine_sim`.
        cosine_sim: Square post-to-post similarity matrix.
        lambda_param: Weight of relevance versus diversity, expected in [0, 1].

    Returns:
        A list with the same post ids as `recommendations`, in diversified order.
    """
    candidates = list(recommendations)
    total = len(candidates)
    if total == 0:
        return []

    # Map post id -> row position once (first occurrence) instead of filtering the
    # DataFrame inside the nested loops.
    position_of = {}
    for position, pid in enumerate(post_data['post_id'].tolist()):
        position_of.setdefault(pid, position)
    positions = [position_of.get(pid) for pid in candidates]

    # Rank-derived relevance: 1.0 for the first candidate down to 1/total for the last.
    relevance = [1.0 - rank / total for rank in range(total)]

    def similarity(i, j):
        # Posts missing from post_data have no similarity information; treat as 0.
        if positions[i] is None or positions[j] is None:
            return 0.0
        return cosine_sim[positions[i]][positions[j]]

    selected = []
    remaining = list(range(total))
    while remaining:
        best_index = None
        best_score = None
        for i in remaining:
            redundancy = max((similarity(i, j) for j in selected), default=0.0)
            score = lambda_param * relevance[i] - (1 - lambda_param) * redundancy
            # Strict '>' keeps the earlier (more relevant) candidate on ties.
            if best_score is None or score > best_score:
                best_index, best_score = i, score
        selected.append(best_index)
        remaining.remove(best_index)

    return [candidates[i] for i in selected]

# Hybrid recommendation combining collaborative and content-based filtering
def hybrid_recommendation(user_id, num_recommendations=5, collaborative_weight=0.7, content_weight=0.3):
    """Recommend posts for a user by blending SVD predictions with content similarity.

    Reads the module-level `interaction_data`, `post_data`, `best_svd` and
    `content_sim`.

    Steps:
      1. Users with no interactions get `cold_start_recommendation`.
      2. Candidates are the top SVD-predicted unseen posts plus the top
         content-similar unseen posts.
      3. Each candidate is scored as
         ``collaborative_weight * cf + content_weight * content`` where `cf` is
         the SVD estimate rescaled to [0, 1] using the training rating scale and
         `content` is the highest cosine similarity to any post in the user's
         history. Rescaling puts both terms on the same range so the weights
         act as stated.
      4. The best ``num_recommendations * 2`` candidates are diversified and
         truncated to `num_recommendations`.

    Args:
        user_id: Raw user id as it appears in `interaction_data`.
        num_recommendations: Number of post ids to return.
        collaborative_weight: Weight of the collaborative score.
        content_weight: Weight of the content score.
            Both weights are replaced by 0.5 for users with fewer than 10
            interactions.

    Returns:
        A list of at most `num_recommendations` post ids; posts the user has
        already interacted with are never included (except via the cold-start
        path, which only runs when there is no history).
    """
    user_rows = interaction_data['user_id'] == user_id
    user_interactions = interaction_data.loc[user_rows, 'post_id'].tolist()

    if not user_interactions:
        return cold_start_recommendation(post_data, num_recommendations)

    # Adjust weights based on user behavior
    if len(user_interactions) < 10:  # Cold start or low interaction
        collaborative_weight = 0.5
        content_weight = 0.5

    seen = set(user_interactions)
    pool_size = num_recommendations * 2  # Get more candidates

    # --- Collaborative filtering -------------------------------------------------
    rating_min, rating_max = best_svd.trainset.rating_scale
    rating_span = (rating_max - rating_min) or 1

    def collaborative_score(post_id):
        # SVD estimate mapped from the rating scale onto [0, 1].
        estimate = best_svd.predict(user_id, post_id).est
        return (estimate - rating_min) / rating_span

    # A few hundred predictions are far cheaper than spawning worker processes,
    # so score serially. Posts the user already interacted with are not candidates.
    known_post_ids = interaction_data['post_id'].unique().tolist()
    collaborative_scores = {
        pid: collaborative_score(pid) for pid in known_post_ids if pid not in seen
    }
    collaborative_recommendations = sorted(
        collaborative_scores, key=collaborative_scores.get, reverse=True
    )[:pool_size]

    # --- Content-based filtering -------------------------------------------------
    content_recommendations = get_content_based_recommendations(
        user_interactions, post_data, content_sim, pool_size
    )

    # Merge both candidate lists, keeping first-seen order so ties resolve deterministically.
    merged = collaborative_recommendations + list(content_recommendations)
    candidates = [pid for pid in dict.fromkeys(merged) if pid not in seen]

    # Row positions in the similarity matrix, addressed by position rather than index label.
    similarity = np.asarray(content_sim)
    position_of = {}
    for position, pid in enumerate(post_data['post_id'].tolist()):
        position_of.setdefault(pid, position)
    history_positions = [position_of[pid] for pid in user_interactions if pid in position_of]

    # --- Weighted ranking --------------------------------------------------------
    ranked_recommendations = []
    for post_id in candidates:
        if post_id not in collaborative_scores:
            # Content-only candidate (no interactions from anyone): ask the model
            # rather than silently scoring it 0.
            collaborative_scores[post_id] = collaborative_score(post_id)

        position = position_of.get(post_id)
        if position is None or not history_positions:
            content_score = 0.0
        else:
            content_score = float(similarity[position, history_positions].max())

        weighted_score = (
            collaborative_weight * collaborative_scores[post_id]
            + content_weight * content_score
        )
        ranked_recommendations.append((post_id, weighted_score))

    ranked_recommendations.sort(key=lambda item: item[1], reverse=True)
    final_recommendations = [post_id for post_id, _ in ranked_recommendations[:pool_size]]

    # Diversify recommendations
    diversified_recommendations = diversify_recommendations_mmr(final_recommendations, post_data, content_sim)
    return diversified_recommendations[:num_recommendations]

# Generate recommendations for sample users
# Demo run: print the hybrid recommendations for user ids 1 through 10
# (range's upper bound is exclusive), using the function's default settings.
for user in range(1, 11):
    recommendations = hybrid_recommendation(user)
    print(f"Recommended posts for user {user}: {recommendations}")

User-Post Interactions (Top 5 Rows):
   user_id  post_id  interaction
0        1      363            3
1        1       74            1
2        1      376            1
3        1      156            1
4        1      105            1
User-Post Interactions (Top 5 tails):
      user_id  post_id  interaction
1302      100       86            2
1303      100      108            0
1304      100      479            0
1305      100       84            2
1306      100      298            1

Post Data (Top 5 Rows):
   post_id                        tags  \
0        1            politics, movies   
1        2              movies, health   
2        3   movies, politics, fashion   
3        4             movies, fashion   
4        5  science, fashion, politics   

                                             content  
0  Exploring the role of women in science and tec...  
1  The latest breakthrough in AI is set to change...  
2  Understanding the complexities of modern polit...  
3  The challe