# Import libraries

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import torch
from torch import nn
from sklearn.metrics.pairwise import cosine_similarity

# Loading Datasets

In [None]:
# Read the three CSV tables (movie metadata, user ratings, external-id links)
# from the working directory, in that order.
movies, ratings, links = (
    pd.read_csv(csv_name)
    for csv_name in ('movies.csv', 'ratings.csv', 'links.csv')
)

# Merging links into movies to get IMDb IDs (for web scraping)

In [None]:
# Attach the columns of `links` to each movie row, matching on the shared
# 'movieId' key (default inner join).
movies = pd.merge(movies, links, on='movieId')

# Scraping movie descriptions

In [None]:
def scrape_movie_description(imdb_id):
    """Fetch the plot summary for a title from its IMDb page.

    Args:
        imdb_id: Numeric IMDb identifier, with or without leading zeros
            (int or str). It is left-padded with zeros to seven digits
            before being placed in the URL.

    Returns:
        The stripped summary text, or the string
        'No description available' when the request fails, the server
        answers with a non-200 status, or the summary element is missing.
        The function always returns a string.
    """
    fallback = 'No description available'

    # pandas typically parses the id column as integers, which drops the
    # leading zeros that IMDb title URLs require (tt0114709, not tt114709).
    url = f"https://www.imdb.com/title/tt{str(imdb_id).zfill(7)}/"

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return fallback

    if response.status_code != 200:
        # Previously this path fell through and returned None.
        return fallback

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): this selector depends on IMDb's page markup — verify it
    # still matches the live site.
    description_tag = soup.find('div', attrs={'class': 'summary_text'})
    return description_tag.text.strip() if description_tag else fallback

# Apply scraping

In [None]:
# Look up a description for every movie, one HTTP request per row, and store
# the results in a new 'description' column.
movies['description'] = movies['imdbId'].map(scrape_movie_description)

# Using GloVe Embeddings

In [None]:
def load_glove_embeddings(path):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``the 0.1 0.2 0.3``.

    Args:
        path: Path of the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each token (str) to a 1-D float32 ``numpy.ndarray``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank lines (e.g. a trailing newline) carry no token;
                # indexing values[0] on them would raise IndexError.
                continue
            word, *components = values
            embeddings_dict[word] = np.asarray(components, dtype="float32")
    return embeddings_dict

# Location of the pre-trained GloVe vectors file, relative to the working
# directory.
# NOTE(review): the filename suggests 100-dimensional vectors — confirm this
# matches the vector size expected wherever these embeddings are consumed.
glove_path = 'glove.6B.100d.txt'
# Parse the whole file once into a word -> vector dictionary.
glove_embeddings = load_glove_embeddings(glove_path)

# Convert descriptions to embeddings

In [None]:
def description_to_embedding(description):
    """Represent a description as the mean of its known word vectors.

    The text is lower-cased and split on whitespace. Tokens absent from the
    module-level ``glove_embeddings`` dictionary are ignored.

    Args:
        description: Free-text description. Non-string values (``None``,
            NaN, ...) are treated as an empty description.

    Returns:
        1-D float32 ``torch.Tensor``. When no token is recognised the result
        is a zero vector whose length matches the loaded embeddings (100 if
        the dictionary is empty).
    """
    # Missing descriptions reach this function as None/NaN; calling .lower()
    # on those would raise AttributeError.
    words = description.lower().split() if isinstance(description, str) else []
    vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]

    if not vectors:
        # Size the zero vector from the loaded embeddings so that all results
        # have one common length and can be stacked together later.
        sample = next(iter(glove_embeddings.values()), None)
        dim = len(sample) if sample is not None else 100
        return torch.zeros(dim, dtype=torch.float32)

    # Stack in NumPy first: building a tensor directly from a list of
    # ndarrays is slow and emits a warning.
    stacked = np.stack(vectors).astype(np.float32, copy=False)
    return torch.from_numpy(stacked).mean(dim=0)

# Turn every description into its embedding tensor and keep the tensors in a
# new 'embedding' column.
movies['embedding'] = movies['description'].map(description_to_embedding)

# Cosine Similarity matrix for CBF

In [None]:
# Stack the per-movie embedding tensors into one tensor (one row per row of
# `movies`) and convert it to a NumPy array.
embeddings_matrix = torch.stack(movies['embedding'].tolist()).numpy()
# Pairwise cosine similarity between the rows; entry [i, j] compares the
# descriptions of the i-th and j-th rows of `movies`.
item_similarity = cosine_similarity(embeddings_matrix)

# Cosine Similarity matrix for CF (User-Item)

In [None]:
# Build the dense user x movie rating table: one row per userId, one column
# per movieId, and 0 wherever a user has not rated a movie.
#
# The columns are reindexed to follow the row order of `movies`, so column j
# here refers to the same film as row j of `movies`. Without this, movies
# that nobody rated are missing from the pivot and the table cannot be
# combined with a per-movie matrix built from `movies`.
# Ratings for movieIds that do not appear in `movies` are dropped.
user_item_matrix = (
    ratings.pivot(index='userId', columns='movieId', values='rating')
    .reindex(columns=pd.Index(movies['movieId'], name='movieId'))
    .fillna(0)
)
# Pairwise cosine similarity between users' rating rows. The extra all-zero
# columns do not change these values.
user_similarity = cosine_similarity(user_item_matrix.to_numpy())

# Get CF & CBF scores

In [None]:
def get_cf_scores(user_id, user_similarity_matrix, user_item_matrix):
    """Score every item for a user from similar users' ratings.

    Each item's score is the similarity-weighted sum of all users' ratings
    for it, divided by the sum of absolute similarities.

    Args:
        user_id: Label of the user in ``user_item_matrix.index``. For inputs
            without an ``index`` attribute (plain arrays) the row position
            ``user_id - 1`` is used, as before.
        user_similarity_matrix: Square user-by-user similarity array whose
            row order matches ``user_item_matrix``.
        user_item_matrix: User-by-item rating table (DataFrame or array).

    Returns:
        1-D ``numpy.ndarray`` with one score per item column. All zeros when
        the user's similarities sum to zero.

    Raises:
        KeyError: If ``user_id`` is not a label of ``user_item_matrix.index``.
    """
    index = getattr(user_item_matrix, 'index', None)
    # Resolve the row by label: user ids are not guaranteed to be the
    # contiguous sequence 1..N that ``user_id - 1`` assumes.
    row = index.get_loc(user_id) if index is not None else user_id - 1

    sim_scores = np.asarray(user_similarity_matrix)[row]
    weighted_scores = np.dot(sim_scores, np.asarray(user_item_matrix))

    sim_scores_sum = np.abs(sim_scores).sum()
    if sim_scores_sum == 0:
        # No similar users at all: report neutral scores instead of NaN/inf.
        return np.zeros_like(weighted_scores, dtype=float)
    return weighted_scores / sim_scores_sum

def get_cbf_scores(user_id, item_similarity_matrix, user_item_matrix):
    """Score every item for a user from the user's own ratings.

    Each item's score is the similarity-weighted sum of the user's ratings,
    divided by that item's total similarity (its row sum).

    Args:
        user_id: Label of the user in ``user_item_matrix.index``. For inputs
            without ``.loc`` (plain arrays) the row position ``user_id - 1``
            is used, as before.
        item_similarity_matrix: Square item-by-item similarity array whose
            order matches the columns of ``user_item_matrix``.
        user_item_matrix: User-by-item rating table (DataFrame or array).

    Returns:
        1-D float ``numpy.ndarray`` with one score per item. Items whose
        similarity row sums to zero get a score of 0.

    Raises:
        KeyError: If ``user_id`` is not a label of ``user_item_matrix.index``.
    """
    if hasattr(user_item_matrix, 'loc'):
        # Label lookup: user ids need not be the contiguous sequence 1..N.
        user_ratings = user_item_matrix.loc[user_id]
    else:
        user_ratings = user_item_matrix[user_id - 1]

    similarities = np.asarray(item_similarity_matrix, dtype=float)
    weighted_scores = similarities @ np.asarray(user_ratings, dtype=float)
    row_totals = similarities.sum(axis=1)

    # Divide only where the denominator is non-zero; an all-zero similarity
    # row would otherwise yield 0/0 = NaN.
    scores = np.zeros_like(weighted_scores)
    np.divide(weighted_scores, row_totals, out=scores, where=row_totals != 0)
    return scores

# Using XGBoost to combine the scores

In [None]:
def get_recommendations(user_id, top_n=10, w_cf=0.5, w_cbf=0.5):
    """Recommend unseen movies for a user by combining CF and CBF scores.

    For every movie, a collaborative-filtering score and a content-based
    score are computed. An XGBoost regressor is fitted on the movies the user
    has already rated (features: the two scores; target: the user's rating)
    and then predicts a rating for each movie the user has not rated. When
    the user has too few ratings to fit a model, the two scores are blended
    linearly with ``w_cf`` and ``w_cbf`` instead.

    Uses the module-level ``movies``, ``user_item_matrix``,
    ``user_similarity`` and ``item_similarity`` objects.

    Args:
        user_id: Label of the user in ``user_item_matrix.index``.
        top_n: Maximum number of movies to return.
        w_cf: Weight of the collaborative score in the fallback blend.
        w_cbf: Weight of the content-based score in the fallback blend.

    Returns:
        DataFrame with the rows of ``movies`` for the recommended titles,
        ordered from highest to lowest predicted score. Movies the user has
        already rated are never included.
    """
    min_training_ratings = 5  # below this, a fitted model is not meaningful

    # Put the rating columns in the row order of `movies` so that position j
    # means the same film in the ratings, in `item_similarity`, and in the
    # `movies` table itself. Never-rated films become all-zero columns.
    aligned = user_item_matrix.reindex(
        columns=pd.Index(movies['movieId'], name='movieId'), fill_value=0
    )

    cf_scores = np.asarray(
        get_cf_scores(user_id, user_similarity, aligned), dtype=float
    ).ravel()
    cbf_scores = np.asarray(
        get_cbf_scores(user_id, item_similarity, aligned), dtype=float
    ).ravel()
    # One row per movie, two columns (CF, CBF); NaNs from empty
    # denominators are treated as a neutral 0.
    features = np.nan_to_num(np.column_stack((cf_scores, cbf_scores)))

    # The rating table uses 0 as its "not rated" filler.
    user_ratings = aligned.loc[user_id].to_numpy(dtype=float)
    rated = user_ratings > 0

    if rated.sum() >= min_training_ratings:
        model = xgb.XGBRegressor(
            objective='reg:squarederror', n_estimators=100, learning_rate=0.1
        )
        model.fit(features[rated], user_ratings[rated])
        predicted = model.predict(features)
    else:
        predicted = w_cf * features[:, 0] + w_cbf * features[:, 1]

    # Rank only the movies the user has not rated yet.
    candidates = np.flatnonzero(~rated)
    ranking = np.argsort(predicted[candidates])[::-1][:max(top_n, 0)]
    return movies.iloc[candidates[ranking]]

# Recommendations for a user

In [None]:
# Ask for a user id, validate it, and print that user's recommendations.
try:
    # Only the numeric conversion is guarded: a ValueError raised later by
    # the recommender must not be reported as bad user input.
    user_id = int(input("Enter the User ID for which you want recommendations: "))
except ValueError:
    print("Invalid input. Please enter a numeric User ID.")
else:
    if user_id not in user_item_matrix.index:
        print(f"User ID {user_id} does not exist in the dataset. Please try again.")
    else:
        # Pass only the arguments get_recommendations is known to accept;
        # extra keyword arguments would raise TypeError.
        recommendations = get_recommendations(user_id, top_n=10)

        # Display the recommendations
        print(f"\nTop recommended movies for you, {user_id}:")
        print(recommendations)