In [2]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Pick the compute device: use the GPU when PyTorch reports CUDA support,
# otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Define directories
# The project root can be overridden through the RESEARCH_ASSISTANT_DIR
# environment variable so the notebook also runs on machines where the
# project lives somewhere else; when the variable is unset, the original
# location is used.
DATA_DIR = Path(
    os.environ.get(
        'RESEARCH_ASSISTANT_DIR',
        '/home/abdellah-ennajari/Desktop/AI-Powered-Academic-Research-Assistant',
    )
)
# Folder holding the cleaned CSV that is read below and the artifacts saved at the end.
PROCESSED_DIR = DATA_DIR / 'Data/processed'

In [5]:
# Load cleaned data
# `paper_id` is read as text on purpose: identifiers such as "2004.03360"
# look numeric, and if pandas inferred a float dtype the trailing zero would
# be dropped (2004.0336), corrupting the identifier.
papers_df = pd.read_csv(
    PROCESSED_DIR / 'cleaned_papers.csv',
    dtype={'paper_id': str},
)


# ====================================================
# TF-IDF Vectorization
# ====================================================

In [6]:
# Build the TF-IDF vectorizer: English stop words are removed and the
# vocabulary is capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

In [7]:
# Fit and transform the paper titles
# Empty CSV fields are loaded by pandas as NaN, and the vectorizer raises on
# NaN documents. Replacing them with an empty string (instead of dropping the
# rows) keeps row i of the matrix aligned with row i of `papers_df`, which the
# positional lookup in the recommendation step relies on.
paper_titles = papers_df['title'].fillna('').astype(str)
tfidf_matrix = tfidf_vectorizer.fit_transform(paper_titles)


In [8]:
# ====================================================
# Recommendation Function (Using Sparse Matrices)
# ====================================================

In [9]:
def recommend_papers(query: str, top_n: int = 5) -> pd.DataFrame:
    """
    Recommend papers based on a query (paper title or keywords).

    The query is projected into the fitted TF-IDF space and compared against
    every row of `tfidf_matrix` with cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query, e.g. a paper title or a few keywords.
    top_n : int, default 5
        Maximum number of papers to return.

    Returns
    -------
    pandas.DataFrame
        Rows of `papers_df` ordered from most to least similar. Contains at
        most `top_n` rows; papers with zero similarity to the query are never
        returned, so the result may be shorter (or empty) when the query
        shares few or no terms with the fitted vocabulary.
    """
    # A slice of the form [-0:] selects the whole array, so a non-positive
    # request must be handled explicitly instead of returning every paper.
    if top_n <= 0:
        return papers_df.iloc[[]]

    # Transform the query into TF-IDF vector
    query_vector = tfidf_vectorizer.transform([query])

    # Compute cosine similarity between the query and all papers
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get the indices of the top N most similar papers (best match first)
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]

    # A zero score means the paper shares no weighted term with the query;
    # returning such rows would present arbitrary papers as recommendations.
    top_indices = top_indices[cosine_similarities[top_indices] > 0]

    # Return the top N recommended papers
    return papers_df.iloc[top_indices]

# ====================================================
# Example Usage
# ====================================================

In [10]:
# Run one sample keyword search and show the identifier and title of each hit
query = "machine learning in healthcare"
print(f"Recommendations for query: '{query}'")
recommendations = recommend_papers(query, top_n=5)
print(recommendations.loc[:, ['paper_id', 'title']])


Recommendations for query: 'machine learning in healthcare'
          paper_id                                              title
489050  2009.11087      Probabilistic Machine Learning for Healthcare
458467  1909.07370  Machine learning in healthcare - a system’s pe...
504751  2305.02474  MLHOps: Machine Learning for Healthcare Operat...
235553  2004.03360  Machine Learning-based Framework for Smart Hea...
408646  2206.14397      Fair Machine Learning in Healthcare: A Survey


# ====================================================
# Save the TF-IDF Matrix and Vectorizer
# ====================================================

In [11]:
# Persist the fitted artifacts so they can be reloaded later without refitting:
# the TF-IDF matrix first, then the vectorizer, both into PROCESSED_DIR.
for artifact, filename in (
    (tfidf_matrix, 'tfidf_matrix.pkl'),
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, PROCESSED_DIR / filename)

print("\nTF-IDF matrix and vectorizer saved to 'processed' directory.")



TF-IDF matrix and vectorizer saved to 'processed' directory.
