In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Make sure the NLTK corpora used below are available. Each probe exercises
# one corpus; if the probe raises LookupError the corpus is downloaded.
def _ensure_nltk_resource(resource_name, probe):
    """Call ``probe()``; on LookupError download ``resource_name`` via NLTK."""
    try:
        probe()
    except LookupError:
        nltk.download(resource_name)


_ensure_nltk_resource('stopwords', lambda: stopwords.words('english'))
_ensure_nltk_resource('wordnet', lambda: WordNetLemmatizer().lemmatize('running'))

# Shared lemmatizer instance and stop-word set used by the text preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean a raw description and return it as a space-joined token string.

    The text is lower-cased, every run of characters other than ASCII letters
    and whitespace is replaced by a single space, the result is split on
    whitespace, tokens found in the module-level ``stop_words`` set are
    dropped, and the remaining tokens are lemmatized with the module-level
    ``lemmatizer``.

    Args:
        text: The raw description. Anything that is not a ``str`` (for
            example a missing value) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ''
    # Substitute a space instead of deleting the characters outright, so that
    # punctuation-joined words remain separate tokens
    # ("fast-paced" -> "fast paced", not "fastpaced") and the pieces of a
    # contraction are checked against the stop-word set individually.
    # Only a-z is listed because the text has already been lower-cased.
    letters_only = re.sub(r'[^a-z\s]+', ' ', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

def get_content_based_recommendations_improved(item_id, tfidf_matrix, indices, data, top_n=5):
    """Recommend the items whose TF-IDF vectors are most similar to one item.

    Args:
        item_id: Identifier of the query item; looked up in ``indices``.
        tfidf_matrix: Matrix with one TF-IDF row per item. The value stored
            in ``indices`` is used directly as the row number.
        indices: Mapping (e.g. a ``pd.Series``) from item ID to row number.
        data: DataFrame with an ``'Item_ID'`` column, accessed positionally
            with the same row numbers as ``tfidf_matrix``.
        top_n: Maximum number of recommendations to return. Values of zero
            or below yield an empty list.

    Returns:
        A list of up to ``top_n`` item IDs ordered by descending cosine
        similarity (ties keep their original row order), never including the
        query item itself. If ``item_id`` is not present in ``indices``, a
        human-readable "not found" message string is returned instead.

    Raises:
        KeyError: If ``data`` has no ``'Item_ID'`` column. Only the ID lookup
            is translated into the "not found" message.
    """
    # Keep the try body minimal so that only a failed ID lookup is reported
    # as "not found"; other KeyErrors indicate a different problem.
    try:
        idx = indices[item_id]
    except KeyError:
        return f"Item ID '{item_id}' not found in the dataset."

    # A duplicated label makes the lookup return a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)[0]

    # Exclude the query row explicitly instead of assuming it sorts first:
    # with tied scores (identical descriptions, or an all-zero vector) the
    # query item is not guaranteed to be at position 0 of the ranking.
    candidates = [pos for pos in range(len(scores)) if pos != idx]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)  # stable sort

    best = candidates[:max(top_n, 0)]
    return data['Item_ID'].iloc[best].tolist()

# Load the dataset
data = pd.read_csv('item_descriptions.csv')

# Apply text preprocessing to the descriptions
data['Cleaned_Description'] = data['Description'].apply(preprocess_text)

# Initialize the TF-IDF vectorizer; no stop_words argument is passed because
# stop words are already removed by preprocess_text
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the cleaned descriptions
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Cleaned_Description'])

# Calculate the cosine similarity between items
# NOTE(review): the recommendation call below recomputes similarities from
# tfidf_matrix and does not read cosine_sim; kept in case other cells use it.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping of item ID to index, keeping the first row for any
# repeated Item_ID. Series.drop_duplicates() would compare the *values* (the
# row labels, which are already unique) and so would not remove repeated IDs;
# the labels have to be de-duplicated through the index instead.
indices = pd.Series(data.index, index=data['Item_ID'])
indices = indices[~indices.index.duplicated(keep='first')]

# Example: Get recommendations for Item I1
item_id_to_recommend = 'I1'
recommendations = get_content_based_recommendations_improved(item_id_to_recommend, tfidf_matrix, indices, data)
print(f"Recommendations for Item {item_id_to_recommend}: {recommendations}")



Recommendations for Item I1: ['I6', 'I2', 'I3', 'I4', 'I5']


In [5]:
data.Description[0]

'Exciting sci-fi adventure in space with alien encounters and thrilling action.'

In [6]:
data.Description[5]

'Fast-paced action thriller with car chases and intense fight sequences.'