In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import create_engine

In [2]:
# df = pd.read_csv(r"C:/Users/swapn/Desktop/for evaluation/merged_data.csv")

In [3]:
engine = create_engine('mysql+pymysql://root:sunny106133@localhost/project')

# Fetch data into Pandas DataFrames
products_df = pd.read_sql("SELECT * FROM products", con=engine)
users_df = pd.read_sql("SELECT * FROM users", con=engine)
reviews_df = pd.read_sql("SELECT * FROM product_reviews", con=engine)

missing_users = set(reviews_df['user_id'].unique()) - set(users_df['user_id'].unique())
missing_products = set(reviews_df['product_id'].unique()) - set(products_df['product_id'].unique())

merged_1 = pd.merge(reviews_df, products_df, how='outer', on='product_id')


merged_2 = pd.merge(merged_1, users_df, how='outer', on='user_id')
merged_df = merged_2


In [4]:
merged_df.fillna(value=np.nan, inplace=True)


# Now merged_df contains your merged data
df = merged_df
mask_all_nan_except_credentials = df.drop(columns=['user_id', 'user_name', 'password']).isna().all(axis=1)

# Create a mask for rows where rating and rating_count are NOT NaN.
mask_valid_ratings = ~(df['rating'].isna() | df['rating_count'].isna())

# Combine both conditions using OR (|)
final_mask = mask_all_nan_except_credentials | mask_valid_ratings

# Filter the dataframe using the combined mask
df = df[final_mask]


In [5]:
df['discounted_price'] = df['discounted_price'].astype(str).str.replace('₹', '').str.replace(',', '').astype(float)
df['actual_price'] = df['actual_price'].astype(str).str.replace('₹', '').str.replace(',', '').astype(float)
df['discount_percentage'] = df['discount_percentage'].astype(str).str.replace('%','').astype(float)/100
df['rating'] = df['rating'].astype(str)
count = df['rating'].str.contains('\|').sum()

df = df[df['rating'].apply(lambda x: '|' not in str(x))]
count = df['rating'].str.contains('\|').sum()


df['rating'] = df['rating'].astype(str).str.replace(',', '').astype(float)
df['rating_count'] = df['rating_count'].astype(str).str.replace(',', '').astype(float)

In [6]:
le = LabelEncoder()
df['user_id_encoded'] = le.fit_transform(df['user_id'])

def userIdEncoder(userID):
    uuid = userID
    encoded_value = le.transform([uuid])[0]
    return encoded_value

In [7]:
# Calculate the mean rating across all products
C = df['rating'].mean()

# Calculate the 90th percentile of the number of ratings
m = df['rating_count'].quantile(0.9)

# Filter out movies that have a rating count less than m
qualified_products = df[df['rating_count'] >= m]

# Compute the weighted rating for each qualified product
def weighted_rating(x, m=m, C=C):
    v = x['rating_count']
    R = x['rating']
    return (v / (v + m) * R) + (m / (v + m) * C)

# Apply the function to the DataFrame
qualified_products['weighted_rating'] = qualified_products.apply(weighted_rating, axis=1)

# Sort products based on score
qualified_products = qualified_products.sort_values('weighted_rating', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified_products['weighted_rating'] = qualified_products.apply(weighted_rating, axis=1)


In [8]:

def get_popular_products(N=5):
    # Get top N products from the sorted qualified_products DataFrame
    popular_recommendations = []
    for _, row in qualified_products.head(N).iterrows():
        model_response = {}
        model_response['product_id'] = row['product_id']
        model_response['product_name'] = row['product_name']
        model_response['img_link'] = row['img_link']
        model_response['actual_price'] = row['actual_price']
        model_response['discounted_price'] = row['discounted_price']
        model_response['discount_percentage'] = row['discount_percentage']
        popular_recommendations.append(model_response)
        
    return pd.DataFrame({'Product': popular_recommendations, 'Score': [row['weighted_rating'] for _, row in qualified_products.head(N).iterrows()]})


In [9]:

from sklearn.model_selection import LeavePOut

def leave_p_out_cv(df, p, user_column, item_column):
    lpo = LeavePOut(p)
    train_sets = []
    test_sets = []

    for user in df[user_column].unique():
        user_data = df[df[user_column] == user]
        
        # Ensure that the user has more than 'p' interactions
        if len(user_data) > p:
            for train_index, test_index in lpo.split(user_data):
                train = user_data.iloc[train_index]
                test = user_data.iloc[test_index]
                train_sets.append(train)
                test_sets.append(test)

    # Concatenate all the different train and test dataframes
    train_data = pd.concat(train_sets, ignore_index=True)
    test_data = pd.concat(test_sets, ignore_index=True)

    return train_data, test_data

# Example usage
p = 1
train, test = leave_p_out_cv(df, p, 'user_id', 'product_id')

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

def content_based_filtering(train):
    train['combined_text'] = train['about_product'].fillna('') + ' ' + train['review_title'].fillna('') + ' ' + train['review_content'].fillna('')
    index_mapping = {index: i for i, index in enumerate(train.index)}
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(train['combined_text'])
    user_profiles = {}


    for user_id_encoded in train['user_id_encoded'].unique():
        user_data = train[train['user_id_encoded'] == user_id_encoded]
        tfidf_indices = [index_mapping[idx] for idx in user_data.index.tolist()]

        # Get the corresponding tf-idf vectors and ratings for the user
        user_tfidf_vectors = tfidf_matrix[tfidf_indices]
        user_ratings = user_data['rating'].values

        # Compute the weighted sum of the tf-idf vectors
        weighted_user_vector = np.sum(user_tfidf_vectors.multiply(user_ratings[:, np.newaxis]), axis=0)
        weighted_user_vector = np.asarray(weighted_user_vector).reshape(1, -1)

        if np.linalg.norm(weighted_user_vector) != 0:
            weighted_user_vector_norm = weighted_user_vector / np.linalg.norm(weighted_user_vector)
            user_profiles[user_id_encoded] = weighted_user_vector_norm
        else:
            print(f"Zero norm detected for user_id_encoded: {user_id_encoded}")
    

    def recommend_products_with_profiles(user_id_encoded):
        user_vector = user_profiles.get(user_id_encoded, None)
        if user_vector is None:
            return qualified_products

        cosine_sim_user = cosine_similarity(user_vector, tfidf_matrix)
    
        # Getting products already interacted with by the user
        interacted_products = train.loc[train['user_id_encoded'] == user_id_encoded]['product_id'].tolist()

        # Sorting the similarity scores
        similarity_scores = sorted(list(enumerate(cosine_sim_user[0])), key=lambda x: x[1], reverse=True)

        all_top_products = []
        for idx, score in similarity_scores:
            if len(all_top_products) == 5:  # break if we already have 5 recommendations
                break
            product_id = train.iloc[idx]['product_id']
            if product_id not in interacted_products and product_id not in [prod['product_id'] for prod in all_top_products]:
                all_top_products.append(train.iloc[idx])

        recommendation_response = []
        for product in all_top_products:
            model_response = {}
            model_response['product_id'] = product['product_id']
            model_response['product_name'] = product['product_name']
            model_response['img_link'] = product['img_link']
            model_response['actual_price'] = product['actual_price']
            model_response['discounted_price'] = product['discounted_price']
            model_response['discount_percentage'] = product['discount_percentage']
            recommendation_response.append(model_response)

        valid_scores = [score for idx, score in similarity_scores if train.iloc[idx]['product_id'] in [prod['product_id'] for prod in all_top_products]][:5]

        results_df = pd.DataFrame({
            'Id_Encoded': [user_id_encoded] * len(recommendation_response),
            'recommended_product': recommendation_response,
            'score_recommendation': valid_scores
        })
        
        return results_df
    return recommend_products_with_profiles, tfidf_matrix

recommender_function, tfidf_matrix_out = content_based_filtering(train)




In [11]:
def prerequisites_collaborative():
    # Filter users who've rated at least 1 product
    x = train.groupby('user_id_encoded').count()['rating'] > 1
    users_rated = x[x].index
    filtered_df = train[train['user_id_encoded'].isin(users_rated)]

    # Consider all products that have been rated at least once
    y = filtered_df.groupby('product_id').count()['rating'] > 1
    high_rated_products = y[y].index
    final_rating = filtered_df[filtered_df['product_id'].isin(high_rated_products)]

    # Create a user-item matrix
    pt = final_rating.pivot_table(index='user_id_encoded', columns='product_id', values='rating')

    pt.fillna(0, inplace=True)
    return pt


pt = prerequisites_collaborative()


# OGG
from sklearn.metrics.pairwise import cosine_similarity
similarity_score = cosine_similarity(pt)



def get_recommendations(user_id_encoded):
    """Return a list of recommended products for a given user."""
    if user_id_encoded not in pt.index:
        print(f"Error: user_id_encoded {user_id_encoded} not found in index.")
        return qualified_products[['product_name', 'rating', 'rating_count', 'weighted_rating']]

    # Find similar users
    index = pt.index.get_loc(user_id_encoded)
    similar_users = sorted(list(enumerate(similarity_score[index])), key=lambda x: x[1], reverse=True)[1:6]

    # Get the items that these users have interacted with
    recommended_products = {}
    for i in similar_users:
        user_id = pt.index[i[0]]
        rated_products = pt.columns[(pt.loc[user_id] > 0)].tolist()
        for product in rated_products:
            if product not in recommended_products:
                recommended_products[product] = i[1]
            else:
                recommended_products[product] += i[1]

    # Filter out products with a score of 0 and format the top products
    top_products = sorted([(product, score) for product, score in recommended_products.items() if score > 0], key=lambda x: x[1], reverse=True)[:5]

    # Extracting product details
    recommendation_response = []
    scores = []  # New list to keep track of the scores for products that weren't skipped
    for product_id, score in top_products:
        product_data = train[train['product_id'] == product_id]
        if product_data.empty:
            print(f"Warning: No data found for product_id: {product_id}")
            continue
        
        product = product_data.iloc[0]
        model_response = {}
        model_response['product_id'] = product['product_id']
        model_response['product_name'] = product['product_name']
        model_response['img_link'] = product['img_link']
        model_response['actual_price'] = product['actual_price']
        model_response['discounted_price'] = product['discounted_price']
        model_response['discount_percentage'] = product['discount_percentage']
        recommendation_response.append(model_response)
        scores.append(score)  # Add the score for this product

    # Now use the adjusted lists to construct the DataFrame
    results_df = pd.DataFrame({
        'Id_Encoded': [user_id_encoded] * len(recommendation_response),
        'recommended_product': recommendation_response,
        'score_recommendation': scores
    })
    return results_df

In [12]:
from sklearn.metrics import mean_squared_error
predicted_scores = []
actual_ratings = []

for user_id_encoded in test['user_id_encoded'].unique():
    user_data = test[test['user_id_encoded'] == user_id_encoded]
    recommendations_df = get_recommendations(user_id_encoded)
    
    if user_id_encoded not in pt.index or 'recommended_product' not in recommendations_df.columns:
        continue

    for idx, row in user_data.iterrows():
        product_name = row['product_name']
        predicted_score = recommendations_df[recommendations_df['recommended_product'].apply(lambda x: x['product_name']) == product_name]['score_recommendation'].values
        predicted_scores.append(predicted_score[0] if len(predicted_score) > 0 else 0)
        actual_ratings.append(row['rating'])

rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_scores))
print("RMSE:", rmse)


Error: user_id_encoded 5301 not found in index.
Error: user_id_encoded 1835 not found in index.
Error: user_id_encoded 8340 not found in index.
Error: user_id_encoded 8855 not found in index.
Error: user_id_encoded 332 not found in index.
Error: user_id_encoded 1654 not found in index.
Error: user_id_encoded 4786 not found in index.
Error: user_id_encoded 8273 not found in index.
Error: user_id_encoded 4857 not found in index.
Error: user_id_encoded 1847 not found in index.
Error: user_id_encoded 5722 not found in index.
Error: user_id_encoded 5423 not found in index.
Error: user_id_encoded 3109 not found in index.
Error: user_id_encoded 4216 not found in index.
Error: user_id_encoded 4278 not found in index.
Error: user_id_encoded 4888 not found in index.
Error: user_id_encoded 7984 not found in index.
Error: user_id_encoded 1759 not found in index.
Error: user_id_encoded 4434 not found in index.
Error: user_id_encoded 628 not found in index.
Error: user_id_encoded 5847 not found in i

In [13]:
true_positives = 0
false_positives = 0
false_negatives = 0

for user_id_encoded in test['user_id_encoded'].unique():
    user_data = test[test['user_id_encoded'] == user_id_encoded]
    actual_products = set(user_data['product_name'])
    
    recommendations_df = get_recommendations(user_id_encoded)
    
    # Check if the user was found and if 'recommended_product' exists in the DataFrame
    if user_id_encoded not in pt.index or 'recommended_product' not in recommendations_df.columns:
        continue
    
    recommended_products = set([item['product_name'] for item in recommendations_df['recommended_product']])

    true_positives += len(recommended_products.intersection(actual_products))
    false_positives += len(recommended_products - actual_products)
    false_negatives += len(actual_products - recommended_products)

precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0

print("Precision:", precision)
print("Recall:", recall)



Error: user_id_encoded 5301 not found in index.
Error: user_id_encoded 1835 not found in index.
Error: user_id_encoded 8340 not found in index.
Error: user_id_encoded 8855 not found in index.
Error: user_id_encoded 332 not found in index.
Error: user_id_encoded 1654 not found in index.
Error: user_id_encoded 4786 not found in index.
Error: user_id_encoded 8273 not found in index.
Error: user_id_encoded 4857 not found in index.
Error: user_id_encoded 1847 not found in index.
Error: user_id_encoded 5722 not found in index.
Error: user_id_encoded 5423 not found in index.
Error: user_id_encoded 3109 not found in index.
Error: user_id_encoded 4216 not found in index.
Error: user_id_encoded 4278 not found in index.
Error: user_id_encoded 4888 not found in index.
Error: user_id_encoded 7984 not found in index.
Error: user_id_encoded 1759 not found in index.
Error: user_id_encoded 4434 not found in index.
Error: user_id_encoded 628 not found in index.
Error: user_id_encoded 5847 not found in i

In [14]:
def hybrid_recommendation_v2(user_id_encoded, alpha=0.5, N=50):
    # Get top N recommendations from both systems
    content_based_recommendations = recommender_function(user_id_encoded)[:N]
    collaborative_recommendations = get_recommendations(user_id_encoded)[:N]
    
    # Normalize scores for collaborative recommendations using Min-Max scaling
    max_collab_score = collaborative_recommendations['score_recommendation'].max()
    collaborative_recommendations['normalized_score'] = collaborative_recommendations['score_recommendation'] / max_collab_score

    # Create a dictionary for both content and collab recommendations for O(1) access
    content_dict = {row['recommended_product']['product_id']: row['score_recommendation'] for _, row in content_based_recommendations.iterrows()}
    collab_dict = {row['recommended_product']['product_id']: row['normalized_score'] for _, row in collaborative_recommendations.iterrows()}

    # Combine both recommendation lists
    combined_recommendations = list(set(content_dict.keys()) | set(collab_dict.keys()))

    # Compute combined score for each product in combined_recommendations
    combined_scores = {}
    for product_id in combined_recommendations:
        content_score = content_dict.get(product_id, 0)
        collab_score = collab_dict.get(product_id, 0)
        combined_scores[product_id] = alpha * content_score + (1 - alpha) * collab_score

    # Sort products based on combined scores
    sorted_recommendations = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:5]

    # Extract product details for the top 5 sorted recommendations
    final_recommendations = []
    final_scores = []
    for product_id, score in sorted_recommendations:
        product_data = train[train['product_id'] == product_id].iloc[0]
        model_response = {}
        model_response['product_id'] = product_data['product_id']
        model_response['product_name'] = product_data['product_name']
        model_response['img_link'] = product_data['img_link']
        model_response['actual_price'] = product_data['actual_price']
        model_response['discounted_price'] = product_data['discounted_price']
        model_response['discount_percentage'] = product_data['discount_percentage']
        final_recommendations.append(model_response)
        final_scores.append(score)

    return pd.DataFrame({'Product': final_recommendations, 'Score': final_scores})

In [15]:
true_positives = 0
false_positives = 0
false_negatives = 0
actual_scores = []  # to store the actual scores (ratings)
predicted_scores = []  # to store the predicted scores

for user_id_encoded in test['user_id_encoded'].unique():
    
    # Check if user_id_encoded exists in the matrix
    if user_id_encoded not in pt.index:
        continue

    user_data = test[test['user_id_encoded'] == user_id_encoded]
    
    actual_products = set(user_data['product_name'])
    
    if len(user_data) > 0:
        print(f"User {user_id_encoded} has {len(user_data)} interactions in the test_data.")
    else:
        print(f"User {user_id_encoded} has no interactions in the test_data.")
    # Get hybrid recommendations
    recommendations_df = hybrid_recommendation_v2(user_id_encoded)
    print(recommendations_df.head())

    


    recommended_products = set([item['product_name'] for item in recommendations_df['Product']])
    
    true_positives += len(recommended_products.intersection(actual_products))
    false_positives += len(recommended_products - actual_products)
    false_negatives += len(actual_products - recommended_products)
    
    # Assuming you have a column that denotes the actual rating/score given by the user
    for _, interaction in user_data.iterrows():
        actual_product_name = interaction['product_name']

        recommended_product_ids = [item['product_id'] for item in recommendations_df['Product']]
        
        # Check if the actual product id is in the recommended product ids
        if actual_product_name not in [item['product_name'] for item in recommendations_df['Product']]:
            print(f"Product name {actual_product_name} not found in recommendations.")
            continue

        predicted_score = recommendations_df[recommendations_df['Product'].apply(lambda x: x['product_name'] == actual_product_name)]['Score'].values[0]

        # Assuming 'score_interaction' column exists in test_data
        actual_score = interaction['rating']

        actual_scores.append(actual_score)
        predicted_scores.append(predicted_score)

print(f"Length of actual scores: {len(actual_scores)}")
print(f"Length of predicted scores: {len(predicted_scores)}")


precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0

rmse = np.sqrt(mean_squared_error(actual_scores, predicted_scores))

print("Precision for Hybrid System:", precision)
print("Recall for Hybrid System:", recall)
print("RMSE for Hybrid System:", rmse)


User 5 has 10 interactions in the test_data.
                                             Product     Score
0  {'product_id': 'B0765B3TH7', 'product_name': '...  0.500000
1  {'product_id': 'B077T3BG5L', 'product_name': '...  0.234315
2  {'product_id': 'B08ZJDWTJ1', 'product_name': '...  0.207107
3  {'product_id': 'B014SZO90Y', 'product_name': '...  0.125997
4  {'product_id': 'B08CS3BT4L', 'product_name': '...  0.124734
Product name Kodak 80 cm (32 Inches) HD Ready LED TV Kodak 32HDX900S (Black) not found in recommendations.
Product name Tata Sky Universal Remote not found in recommendations.
Product name iQOO Z6 Pro 5G by vivo (Legion Sky, 6GB RAM, 128GB Storage) | Snapdragon 778G 5G | 66W FlashCharge | 1300 nits Peak Brightness | HDR10+ not found in recommendations.
Product name Faber-Castell Connector Pen Set - Pack of 25 (Assorted) not found in recommendations.
Product name Dr Trust Electronic Kitchen Digital Scale Weighing Machine (Blue) not found in recommendations.
Product name D

In [16]:
print(test.columns)


def precision_at_k(r, k):
    """Return precision at k for a single user's recommendations.
    
    r: list of relevance values. E.g., [0, 1, 1, 0, 1]
    """
    assert k >= 1
    r = np.asarray(r)[:k]
    return np.mean(r)

def average_precision_at_k(r, k):
    """Return average precision at k for a single user's recommendations."""
    r = np.asarray(r)[:k]
    out = [precision_at_k(r, j + 1) for j, rel in enumerate(r) if rel]
    if not out:
        return 0
    return np.mean(out)

def mean_average_precision_at_k(rs, k):
    """Calculate MAP@K over all users (rs: list of lists of relevance values)."""
    return np.mean([average_precision_at_k(r, k) for r in rs])

def ndcg_at_k(r, k):
    """Compute NDCG@K for a single user's ranking."""
    r = np.asarray(r)[:k]
    dcg = np.sum(r / np.log2(np.arange(2, len(r) + 2)))
    idcg = np.sum(np.ones_like(r) / np.log2(np.arange(2, len(r) + 2)))
    return dcg / idcg

# For each user in your test set:
# 1. Get the top-K recommended items' ids.
# 2. Determine the relevance of these items based on true interactions/ratings.
# 3. Calculate MAP@K and NDCG@K

all_users_relevance = []  # List of lists. Each inner list is the relevance values for one user's recommendations.

for user in test:
    # Assuming you've generated top-K recommendations for this user:
    recommended_items = hybrid_recommendation_v2(user) # Replace with your hybrid recommender's function
    print(recommended_items.columns)
    true_relevant_items = test.get(user, [])
    
    # Generate binary relevance values. 1 if the item was truly relevant, 0 otherwise.
    relevance = [1 if item in true_relevant_items else 0 for item in recommended_items]
    all_users_relevance.append(relevance)

# Calculate MAP@K and NDCG@K
k = 10  # or any other value you're interested in
print("MAP@K:", mean_average_precision_at_k(all_users_relevance, k))
print("NDCG@K:", np.mean([ndcg_at_k(r, k) for r in all_users_relevance]))



Index(['review_id', 'user_id', 'product_id', 'review_title', 'review_content',
       'product_name', 'category', 'discounted_price', 'actual_price',
       'discount_percentage', 'rating', 'rating_count', 'about_product',
       'img_link', 'product_link', 'user_name', 'password', 'user_id_encoded'],
      dtype='object')
Error: user_id_encoded review_id not found in index.


KeyError: 'score_recommendation'