In [2]:
# from elasticsearch import Elasticsearch
# import pandas as pd
# import numpy as np

# class YouTubeRecommendationSystem:
#     def __init__(self, video_data_path, user_profile_data_path, user_history_data_path, es_endpoint, index_name):
#         self.es = Elasticsearch([es_endpoint], port=443, use_ssl=True)
#         self.index_name = index_name

#         # Load CSV data
#         self.video_data = pd.read_csv(video_data_path)
#         self.user_profile_data = pd.read_csv(user_profile_data_path)
#         self.user_history_data = pd.read_csv(user_history_data_path)

#         # Preprocess video data
#         self.video_data['features'] = self.video_data['title'] + ' ' + self.video_data['description']
        
#         # Initialize vectorizer for content-based filtering
#         self.vectorizer = TfidfVectorizer(stop_words='english')
#         self.video_features_matrix = self.vectorizer.fit_transform(self.video_data['features'].values.astype('U'))

#         # Calculate user-item matrix for collaborative filtering
#         self.user_item_matrix = self.user_history_data.pivot_table(index='user_id', columns='video_id', values='watched', fill_value=0)

#     def get_user_embedding(self, user_id):
#         # Retrieve user embedding from user profile data
#         user_profile = self.user_profile_data[self.user_profile_data['user_id'] == user_id]
#         user_embedding = np.array(user_profile.drop('user_id', axis=1))
#         return user_embedding

#     def collaborative_filtering_recommendations(self, user_id, num_recommendations=5):
#         user_embedding = self.get_user_embedding(user_id)
        
#         # Calculate cosine similarity between user and other users
#         user_similarity = 1 - pairwise_distances(self.user_item_matrix, metric='cosine')
#         user_similarity = pd.DataFrame(user_similarity, index=self.user_item_matrix.index, columns=self.user_item_matrix.index)

#         # Get most similar users to the target user
#         similar_users = user_similarity[user_id].sort_values(ascending=False)[1:]

#         # Find videos watched by similar users but not watched by the target user
#         watched_videos = self.user_history_data[self.user_history_data['user_id'] == user_id]['video_id']
#         recommended_videos = []
#         for similar_user_id, similarity_score in similar_users.iteritems():
#             if similarity_score > 0:
#                 similar_user_watched_videos = self.user_history_data[self.user_history_data['user_id'] == similar_user_id]['video_id']
#                 new_videos = similar_user_watched_videos[~similar_user_watched_videos.isin(watched_videos)]
#                 recommended_videos.extend(new_videos)
#                 if len(recommended_videos) >= num_recommendations:
#                     break

#         return recommended_videos[:num_recommendations]

#     def content_based_filtering_recommendations(self, user_id, num_recommendations=5):
#         user_embedding = self.get_user_embedding(user_id)

#         # Calculate cosine similarity between user and videos
#         video_user_similarity = cosine_similarity(self.video_features_matrix, user_embedding)
#         video_user_similarity = np.squeeze(video_user_similarity)

#         # Get indices of videos sorted by similarity
#         video_indices = np.argsort(video_user_similarity)[::-1]

#         # Find videos not watched by the user
#         watched_videos = self.user_history_data[self.user_history_data['user_id'] == user_id]['video_id']
#         recommended_videos = []
#         for idx in video_indices:
#             video_id = self.video_data.iloc[idx]['video_id']
#             if video_id not in watched_videos.values:
#                 recommended_videos.append(video_id)
#             if len(recommended_videos) >= num_recommendations:
#                 break

#         return recommended_videos

#     def store_user_embedding(self, user_id, embedding):
#         embedding_dict = {'embedding': embedding.tolist()}
#         self.es.index(index=self.index_name, body=embedding_dict, id=user_id)

#     def store_video_embedding(self, video_id, embedding):
#         embedding_dict = {'embedding': embedding.tolist()}
#         self.es.index(index=self.index_name, body=embedding_dict, id=video_id)

#     def get_recommendations(self, user_id, num_recommendations=5):
#         collaborative_filtering_recommendations = self.collaborative_filtering_recommendations(user_id, num_recommendations)
#         content_based_filtering_recommendations = self.content_based_filtering_recommendations(user_id, num_recommendations)

#         # Combine recommendations from both methods (can be adjusted based on performance)
#         combined_recommendations = collaborative_filtering_recommendations + content_based_filtering_recommendations
#         unique_recommendations = list(set(combined_recommendations))[:num_recommendations]

#         return unique_recommendations

# # Example usage
# video_data_path = 'video_data.csv'
# user_profile_data_path = 'user_profile_data.csv'
# user_history_data_path = 'user_history_data.csv'
# es_endpoint = 'your-es-endpoint'
# index_name = 'embeddings'

# recommendation_system = YouTubeRecommendationSystem(video_data_path, user_profile_data_path, user_history_data_path, es_endpoint, index_name)

# # Get recommendations for a user
# user_id = 'user123'
# recommended_videos = recommendation_system.get_recommendations(user_id)
# print("Recommended Videos:", recommended_videos)


In [3]:
import time
from elasticsearch import Elasticsearch
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

In [4]:

class VideoRecommendationSystem:
    def __init__(self, video_data_path, user_profile_data_path, user_history_data_path, es_endpoint, index_name, update_interval=86400):
        self.es = Elasticsearch([es_endpoint], port=443, use_ssl=True)
        self.index_name = index_name
        self.update_interval = update_interval  # Update interval in seconds
        
        # Load CSV data
        self.video_data = pd.read_csv(video_data_path)
        self.user_profile_data = pd.read_csv(user_profile_data_path)
        self.user_history_data = pd.read_csv(user_history_data_path)

        # Preprocess video data
        self.video_data['features'] = self.video_data['title'] + ' ' + self.video_data['description']
        
        # Initialize vectorizer for content-based filtering
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.video_features_matrix = self.vectorizer.fit_transform(self.video_data['features'].values.astype('U'))

        # Calculate user-item matrix for collaborative filtering
        self.user_item_matrix = self.user_history_data.pivot_table(index='user_id', columns='video_id', values='watched', fill_value=0)

        # Initialize time tracker for last update
        self.last_update_time = time.time()

    def should_update_embeddings(self):
        current_time = time.time()
        return current_time - self.last_update_time >= self.update_interval

    def update_embeddings(self):
        # Logic to update embeddings
        print("Updating embeddings...")
        self.last_update_time = time.time()

    def get_user_embedding(self, user_id):
        # Retrieve user embedding from user profile data
        user_profile = self.user_profile_data[self.user_profile_data['user_id'] == user_id]
        user_embedding = np.array(user_profile.drop('user_id', axis=1))
        return user_embedding

    def collaborative_filtering_recommendations(self, user_id, num_recommendations=5):
        user_embedding = self.get_user_embedding(user_id)
        
        # Calculate cosine similarity between user and other users
        user_similarity = 1 - pairwise_distances(self.user_item_matrix, metric='cosine')
        user_similarity = pd.DataFrame(user_similarity, index=self.user_item_matrix.index, columns=self.user_item_matrix.index)

        # Get most similar users to the target user
        similar_users = user_similarity[user_id].sort_values(ascending=False)[1:]

        # Find videos watched by similar users but not watched by the target user
        watched_videos = self.user_history_data[self.user_history_data['user_id'] == user_id]['video_id']
        recommended_videos = []
        for similar_user_id, similarity_score in similar_users.iteritems():
            if similarity_score > 0:
                similar_user_watched_videos = self.user_history_data[self.user_history_data['user_id'] == similar_user_id]['video_id']
                new_videos = similar_user_watched_videos[~similar_user_watched_videos.isin(watched_videos)]
                recommended_videos.extend(new_videos)
                if len(recommended_videos) >= num_recommendations:
                    break

        return recommended_videos[:num_recommendations]

    def content_based_filtering_recommendations(self, user_id, num_recommendations=5):
        user_embedding = self.get_user_embedding(user_id)

        # Calculate cosine similarity between user and videos
        video_user_similarity = cosine_similarity(self.video_features_matrix, user_embedding)
        video_user_similarity = np.squeeze(video_user_similarity)

        # Get indices of videos sorted by similarity
        video_indices = np.argsort(video_user_similarity)[::-1]

        # Find videos not watched by the user
        watched_videos = self.user_history_data[self.user_history_data['user_id'] == user_id]['video_id']
        recommended_videos = []
        for idx in video_indices:
            video_id = self.video_data.iloc[idx]['video_id']
            if video_id not in watched_videos.values:
                recommended_videos.append(video_id)
            if len(recommended_videos) >= num_recommendations:
                break

        return recommended_videos

    def store_user_embedding(self, user_id, embedding):
        embedding_dict = {'embedding': embedding.tolist()}
        self.es.index(index=self.index_name, body=embedding_dict, id=user_id)

    def store_video_embedding(self, video_id, embedding):
        embedding_dict = {'embedding': embedding.tolist()}
        self.es.index(index=self.index_name, body=embedding_dict, id=video_id)

    def get_recommendations(self, user_id, num_recommendations=5):
        if self.should_update_embeddings():
            self.update_embeddings()

        collaborative_filtering_recommendations = self.collaborative_filtering_recommendations(user_id, num_recommendations)
        content_based_filtering_recommendations = self.content_based_filtering_recommendations(user_id, num_recommendations)

        # Combine recommendations from both methods (can be adjusted based on performance)
        combined_recommendations = collaborative_filtering_recommendations + content_based_filtering_recommendations
        unique_recommendations = list(set(combined_recommendations))[:num_recommendations]

        return unique_recommendations

In [5]:
# Example usage
video_data_path = 'video_data.csv'
user_profile_data_path = 'user_profile_data.csv'
user_history_data_path = 'user_history_data.csv'
es_endpoint = 'your-es-endpoint'
index_name = 'embeddings'

recommendation_system = VideoRecommendationSystem(video_data_path, user_profile_data_path, user_history_data_path, es_endpoint, index_name)

# Get recommendations for a user
user_id = 'user123'
recommended_videos = recommendation_system.get_recommendations(user_id)
print("Recommended Videos:", recommended_videos)


TypeError: __init__() got an unexpected keyword argument 'port'