In [32]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [33]:
df = pd.read_csv(r"C:/Users/swapn/Desktop/testing/modified_dataset3.csv")

In [34]:
df['discounted_price'] = df['discounted_price'].astype(str).str.replace('₹', '').str.replace(',', '').astype(float)
df['actual_price'] = df['actual_price'].astype(str).str.replace('₹', '').str.replace(',', '').astype(float)
df['discount_percentage'] = df['discount_percentage'].astype(str).str.replace('%','').astype(float)/100

In [35]:
count = df['rating'].str.contains('\|').sum()
print(f"Total de linhas com '|' na coluna 'rating': {count}")
df = df[df['rating'].apply(lambda x: '|' not in str(x))]
count = df['rating'].str.contains('\|').sum()
print(f"Total de linhas com '|' na coluna 'rating': {count}")

Total de linhas com '|' na coluna 'rating': 8
Total de linhas com '|' na coluna 'rating': 0


In [36]:
df['rating'] = df['rating'].astype(str).str.replace(',', '').astype(float)
df['rating_count'] = df['rating_count'].astype(str).str.replace(',', '').astype(float)

le = LabelEncoder()
df['user_id_encoded'] = le.fit_transform(df['user_id'])

In [37]:
# Calculate the mean rating across all products
C = df['rating'].mean()

# Calculate the 90th percentile of the number of ratings
m = df['rating_count'].quantile(0.9)

# Filter out movies that have a rating count less than m
qualified_products = df[df['rating_count'] >= m]

# Compute the weighted rating for each qualified product
def weighted_rating(x, m=m, C=C):
    v = x['rating_count']
    R = x['rating']
    return (v / (v + m) * R) + (m / (v + m) * C)

# Apply the function to the DataFrame
qualified_products['weighted_rating'] = qualified_products.apply(weighted_rating, axis=1)

# Sort products based on score
qualified_products = qualified_products.sort_values('weighted_rating', ascending=False)

# Print the top recommendations
print(qualified_products[['product_name', 'rating', 'rating_count', 'weighted_rating']])

                                           product_name  rating  rating_count  \
9007  Swiffer Instant Electric Water Heater Faucet T...     4.8       53803.0   
9006  Swiffer Instant Electric Water Heater Faucet T...     4.8       53803.0   
9005  Swiffer Instant Electric Water Heater Faucet T...     4.8       53803.0   
6793  SanDisk Extreme SD UHS I 64GB Card for 4K Vide...     4.5      205052.0   
6789  SanDisk Extreme SD UHS I 64GB Card for 4K Vide...     4.5      205052.0   
...                                                 ...     ...           ...   
3239  PTron Tangent Lite Bluetooth 5.0 Earphones wit...     3.5       83996.0   
3238  PTron Tangent Lite Bluetooth 5.0 Earphones wit...     3.5       83996.0   
3237  PTron Tangent Lite Bluetooth 5.0 Earphones wit...     3.5       83996.0   
3236  PTron Tangent Lite Bluetooth 5.0 Earphones wit...     3.5       83996.0   
3234  PTron Tangent Lite Bluetooth 5.0 Earphones wit...     3.5       83996.0   

      weighted_rating  
900

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified_products['weighted_rating'] = qualified_products.apply(weighted_rating, axis=1)


In [38]:
# DATA SPLIT

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

def content_based_filtering(df):
    index_mapping = {index: i for i, index in enumerate(df.index)}

    def create_tfidf_matrix(df_inner):
        tfidf = TfidfVectorizer(stop_words='english')
        df_inner['about_product'] = df_inner['about_product'].fillna('')
        tfidf_matrix = tfidf.fit_transform(df_inner['about_product'])
        return tfidf_matrix, tfidf

    tfidf_matrix, tfidf = create_tfidf_matrix(df)

    user_profiles = {}

    for user_id_encoded in df['user_id_encoded'].unique():
        user_data = df[df['user_id_encoded'] == user_id_encoded]
        tfidf_indices = [index_mapping[idx] for idx in user_data.index.tolist()]
        user_vector = np.sum(tfidf_matrix[tfidf_indices], axis=0)
        user_vector = np.asarray(user_vector).reshape(1, -1)
        user_vector_norm = user_vector / np.linalg.norm(user_vector)
        user_profiles[user_id_encoded] = user_vector_norm

    def recommend_products_with_profiles(user_id_encoded, tfidf_matrix):
    # Fetch the user profile
        user_vector = user_profiles.get(user_id_encoded, None)
        
        if user_vector is None:
            return qualified_products[['product_name', 'rating', 'rating_count', 'weighted_rating']]  # Fallback to popular recommendations if no profile found
        
        # Calculate similarity between user profile and all products
        cosine_sim_user = cosine_similarity(user_vector, tfidf_matrix)
        cosine_sim_user = np.asarray(cosine_sim_user)

        # Products the user has already interacted with
        interacted_products = df.loc[df['user_id_encoded'] == user_id_encoded]['product_name'].tolist()
        
        similarity_scores = list(enumerate(cosine_sim_user[0]))
        
        # Filter out products the user has already interacted with
        similarity_scores = [item for item in similarity_scores if df.iloc[item[0]]['product_name'] not in interacted_products]
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

        recommended_products_set = set()  # Use a set to keep track of unique recommended products
        all_top_products = []
        for idx, score in similarity_scores:
            product_name = df.iloc[idx]['product_name']
            if product_name not in recommended_products_set:
                recommended_products_set.add(product_name)
                all_top_products.append(idx)
            if len(all_top_products) == 5:  # Stop once we have 5 unique recommendations
                break

        recommended_products_list = df.iloc[all_top_products]['product_name'].tolist()
        valid_scores = [score for idx, score in similarity_scores if idx in all_top_products][:5]

        results_df = pd.DataFrame({
            'Id Encoded': [user_id_encoded] * len(recommended_products_list),
            'recommended product': recommended_products_list,
            'score recommendation': valid_scores
        })

        return results_df

    return recommend_products_with_profiles, tfidf_matrix

# recommender_function, tfidf_matrix_out = content_based_filtering(df)
# recommendations = recommender_function(4223423424, tfidf_matrix_out)
# print(recommendations)


In [40]:
recommender_function, tfidf_matrix_out = content_based_filtering(df)
recommendations = recommender_function(4223423424, tfidf_matrix_out)
print(recommendations)

In [41]:

def prerequisites_collaborative():
    x = df.groupby('user_id_encoded').count()['rating'] > 3
    users_rated = x[x].index
    filtered_df = df[df['user_id_encoded'].isin(users_rated)]
    y = filtered_df.groupby('product_name').count()['rating'] > 1
    high_rated_products = y[y].index
    final_rating = filtered_df[filtered_df['product_name'].isin(high_rated_products)]
    
    # Create a user-item matrix
    pt = final_rating.pivot_table(index='user_id_encoded', columns='product_name', values='rating')
    pt.fillna(0, inplace=True)
    return pt

pt = prerequisites_collaborative()

from sklearn.metrics.pairwise import cosine_similarity
similarity_score = cosine_similarity(pt)


In [42]:
def get_recommendations(user_id_encoded):
    """Return a list of recommended products for a given user."""
    try:
        index = np.where(pt.index == user_id_encoded)[0][0]
        print(f"Index for user_id_encoded: {user_id_encoded}: {index}")
        
        # Find similar users
        similar_users = sorted(list(enumerate(similarity_score[index])), key=lambda x: x[1], reverse=True)[1:6]
        print(f"Similar users: {similar_users}")
        
        # Get the items that these users have interacted with
        recommended_products = []
        for i in similar_users:
            user_id = pt.index[i[0]]
            rated_products = pt.columns[(pt.loc[user_id] > 0)].tolist()
            recommended_products.extend(rated_products)
        
        # Remove duplicates and return
        recommended_products = list(set(recommended_products))
        print(f"Recommended products: {recommended_products}")
    except IndexError:
        print(f"Error: user_id_encoded {user_id_encoded} not found in index.")
        recommended_products = qualified_products[['product_name', 'rating', 'rating_count', 'weighted_rating']]

    return recommended_products

In [43]:
def hybrid_recommendation(user_id_encoded, alpha=0.6):
    """
    alpha: Determines the weightage to be given to content-based vs collaborative filtering.
    An alpha of 0.5 means both will have equal weightage.
    """
    # 1. Get recommendations from Content-based Recommendation System
    recommender_function, tfidf_matrix_out = content_based_filtering(df)
    content_based_recommendations = recommender_function(user_id_encoded, tfidf_matrix_out)
    
    # 2. Get recommendations from Collaborative Filtering System (user-based CF)
    collaborative_recommendations = get_recommendations(user_id_encoded)[:6]
    
    # 3. Combine the scores (if possible, else can just concatenate lists and de-duplicate)
    
    # Let's say the content-based system gave scores in the 'score recommendation' column.
    # Multiply the scores with alpha for content-based recommendations
    content_based_recommendations['score recommendation'] *= alpha
    
    # For the collaborative filtering system, we can assign scores based on ranks.
    collaborative_scores = [(1 - idx * 0.1) * (1 - alpha) for idx in range(len(collaborative_recommendations))]
    
    # Concatenate the lists and scores
    combined_products = content_based_recommendations['recommended product'].tolist() + collaborative_recommendations
    combined_scores = content_based_recommendations['score recommendation'].tolist() + collaborative_scores
    
    # Deduplicate
    seen = set()
    final_recommendations = []
    final_scores = []
    for prod, score in zip(combined_products, combined_scores):
        if prod not in seen:
            final_recommendations.append(prod)
            final_scores.append(score)
            seen.add(prod)
    
    return pd.DataFrame({'Product': final_recommendations, 'Score': final_scores})

# Use
recommendations = hybrid_recommendation(1048)
print(recommendations)



Error: user_id_encoded 1048 not found in index.


ValueError: Unable to coerce to Series, length must be 4: given 5

In [None]:
def get_actual_ratings(user_id, df):
    """
    Fetches the actual ratings given by a user.

    Parameters:
    - user_id: The encoded user ID for which you want to fetch actual ratings.
    - df: The DataFrame containing the user ratings data.

    Returns:
    - A dictionary where keys are product names and values are the corresponding ratings.
    """
    # Filter the dataframe to get rows where user ID matches
    user_ratings = df[df['user_id_encoded'] == user_id]
    
    # Convert the product names and ratings to dictionary
    ratings_dict = dict(zip(user_ratings['product_name'], user_ratings['rating']))

    return ratings_dict




In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def compute_rmse(actual_ratings, predicted_scores):
    # Ensure both lists are of the same length
    if len(actual_ratings) != len(predicted_scores):
        raise ValueError("The lists of actual ratings and predicted scores must have the same length")
    
    rmse = sqrt(mean_squared_error(actual_ratings, predicted_scores))
    return rmse


def get_actual_ratings(user_id, df):
    """
    Fetches the actual ratings given by a user.

    Parameters:
    - user_id: The encoded user ID for which you want to fetch actual ratings.
    - df: The DataFrame containing the user ratings data.

    Returns:
    - A dictionary where keys are product names and values are the corresponding ratings.
    """
    # Filter the dataframe to get rows where user ID matches
    user_ratings = df[df['user_id_encoded'] == user_id]
    
    # Convert the product names and ratings to dictionary
    ratings_dict = dict(zip(user_ratings['product_name'], user_ratings['rating']))

    return ratings_dict

# For demonstration:
user_id = 462

# Fetch recommended products for the user
recommendations_df = hybrid_recommendation(user_id)

# Fetch actual ratings for the user
actual_ratings_dict = get_actual_ratings(user_id, df)

# Extract actual ratings for the recommended products
actual_ratings = [actual_ratings_dict.get(prod, 0) for prod in recommendations_df['Product'].tolist()]

# Now you can use the compute_rmse function as shown before
predicted_scores = recommendations_df['Score'].tolist()
rmse = compute_rmse(actual_ratings, predicted_scores)
print(f"RMSE for user {user_id}: {rmse}")

Index for user_id_encoded: 462: 314
Similar users: [(491, 0.28453810178851835), (450, 0.2694451124826193), (173, 0.25959425744026654), (49, 0.24454656773381714), (75, 0.236200508765741)]
Recommended products: ['Duracell CR2016 3V Lithium Coin Battery, 5 pcs, 2016 Coin Button Cell Battery, DL2016', 'Bajaj New Shakti Neo 10L Vertical Storage Water Heater (Geyser 10 Litres) 4 Star BEE Rated Heater For Water Heating with Titanium Armour, Swirl Flow Technology, Glasslined Tank(White), 1 Yr Warranty', 'Camel Artist Acrylic Color Box - 9ml Tubes, 12 Shades', 'Redmi 80 cm (32 inches) Android 11 Series HD Ready Smart LED TV | L32M6-RA/L32M7-RA (Black)', 'Mi 100 cm (40 inches) Horizon Edition Full HD Android LED TV 4A | L40M6-EI (Black)', 'boAt Airdopes 121v2 in-Ear True Wireless Earbuds with Upto 14 Hours Playback, 8MM Drivers, Battery Indicators, Lightweight Earbuds & Multifunction Controls (Active Black, with Mic)', 'boAt Bassheads 152 in Ear Wired Earphones with Mic(Active Black)', 'Firestic

In [None]:
# def evaluate_over_users(user_ids, df):
#     rmses = []
#     for user_id in user_ids:
#         recommendations_df = hybrid_recommendation(user_id)
#         actual_ratings_dict = get_actual_ratings(user_id, df)

#         common_products = set(recommendations_df['Product']) & set(actual_ratings_dict.keys())
#         filtered_recommendations_df = recommendations_df[recommendations_df['Product'].isin(common_products)]

#         if len(filtered_recommendations_df) == 0:
#             continue  # skip this user if no common products

#         actual_ratings = [actual_ratings_dict[prod] for prod in filtered_recommendations_df['Product'].tolist()]
#         predicted_scores = filtered_recommendations_df['Score'].tolist()
        
#         rmse = compute_rmse(actual_ratings, predicted_scores)
#         rmses.append(rmse)
    
#     return np.mean(rmses)  # Return the average RMSE over all users

# # Use the function
# user_ids_sample = [1048, 1050, 1065]  # You can choose a representative sample of user_ids
# average_rmse = evaluate_over_users(user_ids_sample, df)
# print(f"RMSE: {average_rmse}")




In [None]:
user_id = 1048

# Assuming a function `get_actual_ratings` returns the actual ratings for a user for the items in your recommendation list
actual_ratings = get_actual_ratings(user_id,df)

# Fetch recommended products for the user
recommendations_df = hybrid_recommendation(user_id)

# Fetch actual ratings for the user
actual_ratings_dict = get_actual_ratings(user_id, df)

# Filter the recommendations to include only products that are also present in the actual ratings
common_products = set(recommendations_df['Product']) & set(actual_ratings_dict.keys())

filtered_recommendations_df = recommendations_df[recommendations_df['Product'].isin(common_products)]

# Extract ratings for the common products
actual_ratings = [actual_ratings_dict[prod] for prod in filtered_recommendations_df['Product'].tolist()]
predicted_scores = filtered_recommendations_df['Score'].tolist()

# Ensure lengths match
assert len(actual_ratings) == len(predicted_scores), "Length mismatch between actual and predicted ratings!"

# Now you can use the compute_rmse function
rmse = compute_rmse(actual_ratings, predicted_scores)
print(f"RMSE for user {user_id}: {rmse}")

Index for user_id_encoded: 1048: 716
Similar users: [(337, 0.24753688574416852), (548, 0.20907512365103711), (810, 0.19754591932991789), (397, 0.19214858184417052), (21, 0.17960530202677488)]
Recommended products: ['Flix (Beetel) Usb To Type C Pvc Data Sync And 2A 480Mbps Data Sync, Tough Fast Charging Long Cable For Usb Type C Devices, Charging Adapter (White, 1 Meter) - Xcd-C12', 'Aqua d pure Active Copper 12-L RO+UV Water Filter Purifier for Home, Kitchen Fully Automatic UF+TDS Controller', 'Sure From Aquaguard Delight NXT RO+UV+UF+Taste Adjuster(MTDS),6L water purifier,8 stages purification,Suitable for borewell,tanker,municipal water(Black) from Eureka Forbes', 'GIZGA Club-laptop Neoprene Reversible for 15.6-inches Laptop Sleeve - Black-Red', 'Myvn 30W Warp/20W Dash Charging Usb Type C Charger Cable Compatible For Cellular Phones Oneplus 8T 8 8Pro 7 Pro / 7T / 7T Pro Nord And Oneplus 3 / 3T / 5 / 5T / 6 / 6T / 7', 'ZEBRONICS Zeb-Astra 20 Wireless BT v5.0 Portable Speaker with 10W 