In [42]:
from flask import Flask, jsonify
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
old_df = pd.read_csv('comment.csv')


In [44]:
selected_columns = ['user_id', 'product_id', 'rating']
df = old_df[selected_columns]
df = df.drop_duplicates(['user_id', 'product_id', 'rating'])
df['rating'] = df['rating'].astype(float)  # Convert rating column to float
df = df.head(5000)
df


Unnamed: 0,user_id,product_id,rating
0,1,1,5.0
1,1,2,4.0
2,1,4,2.0
3,1,5,2.0
4,2,1,5.0
5,2,3,4.0
6,2,4,2.0
7,2,5,0.0
8,3,1,2.0
9,3,3,1.0


In [64]:
# Reset user_id and product_id to sequential numbers starting from 1
df_copy = df
df['user_id'] = df['user_id'].astype('category').cat.codes + 1
df['product_id'] = df['product_id'].astype('category').cat.codes + 1
df = df.sort_values('user_id')
df_sort_product=df.sort_values('product_id')

df_copy

Unnamed: 0,user_id,product_id,rating
0,1,1,5.0
1,1,2,4.0
2,1,4,2.0
3,1,5,2.0
4,2,1,5.0
5,2,3,4.0
6,2,4,2.0
7,2,5,0.0
9,3,3,1.0
8,3,1,2.0


In [62]:
user_id_to_codes = dict(zip(df['user_id'], df))
input_user_id = 7

# Retrieve the corresponding cat.code
if input_user_id in user_id_to_codes:
    cat_code = user_id_to_codes[input_user_id]
    print(f"The cat.code for user_id '{input_user_id}' is {cat_code}.")
else:
    print(f"No cat.code found for user_id '{input_user_id}'.")

No cat.code found for user_id '7'.


In [46]:
df_sort_product

Unnamed: 0,user_id,product_id,rating
0,1,1,5.0
15,5,1,1.0
12,4,1,0.0
4,2,1,5.0
8,3,1,2.0
13,4,2,0.0
17,6,2,2.0
1,1,2,4.0
18,6,3,1.0
9,3,3,1.0


In [47]:
pivot_df = df.pivot(index='user_id', columns='product_id', values='rating')
# Reset the column names
pivot_df.columns.name = None
# Reset the index name
pivot_df.index.name = None
# Calculate the average rating for each product
pivot_df

Unnamed: 0,1,2,3,4,5
1,5.0,4.0,,2.0,2.0
2,5.0,,4.0,2.0,0.0
3,2.0,,1.0,3.0,4.0
4,0.0,0.0,,4.0,
5,1.0,,,4.0,
6,,2.0,1.0,,
7,,,1.0,4.0,5.0


In [48]:
product_avg = pivot_df.mean(axis=0)

product_avg

1    2.600000
2    2.000000
3    1.750000
4    3.166667
5    2.750000
dtype: float64

In [49]:
# Fill NaN values with 0
pivot_df = pivot_df.fillna(0)
# Calculate the user-product matrix by subtracting the product average from each rating
matrix_avg = pivot_df.sub(product_avg, axis=1)

matrix_avg

Unnamed: 0,1,2,3,4,5
1,2.4,2.0,-1.75,-1.166667,-0.75
2,2.4,-2.0,2.25,-1.166667,-2.75
3,-0.6,-2.0,-0.75,-0.166667,1.25
4,-2.6,-2.0,-1.75,0.833333,-2.75
5,-1.6,-2.0,-1.75,0.833333,-2.75
6,-2.6,0.0,-0.75,-3.166667,-2.75
7,-2.6,-2.0,-0.75,0.833333,2.25


In [50]:
# Calculate the cosine similarity matrix
similarity = cosine_similarity(matrix_avg)
# Get the row of the similarity matrix corresponding to the user
similarity

array([[ 1.        ,  0.06659196, -0.49700027, -0.33735437, -0.22708244,
         0.04327504, -0.73013592],
       [ 0.06659196,  1.        , -0.19062101,  0.01802711,  0.13650928,
         0.13688771, -0.55054096],
       [-0.49700027, -0.19062101,  1.        ,  0.27486829,  0.24984902,
        -0.06180209,  0.83400894],
       [-0.33735437,  0.01802711,  0.27486829,  1.        ,  0.98040493,
         0.55412567,  0.33881161],
       [-0.22708244,  0.13650928,  0.24984902,  0.98040493,  1.        ,
         0.49258686,  0.22772388],
       [ 0.04327504,  0.13688771, -0.06180209,  0.55412567,  0.49258686,
         1.        , -0.07290676],
       [-0.73013592, -0.55054096,  0.83400894,  0.33881161,  0.22772388,
        -0.07290676,  1.        ]])

In [51]:
similarity.shape


(7, 7)

In [52]:
user_similarity = similarity[user_id-1]
user_similarity

array([-0.73013592, -0.55054096,  0.83400894,  0.33881161,  0.22772388,
       -0.07290676,  1.        ])

In [53]:
similar_users = user_similarity.argsort()[:-2-1:-1]
test_similar_users =  np.sort(user_similarity)
test_similar_users



array([-0.73013592, -0.55054096, -0.07290676,  0.22772388,  0.33881161,
        0.83400894,  1.        ])

In [54]:
test_similar_users_arg=user_similarity.argsort()
test_similar_users_arg

array([0, 1, 5, 4, 3, 2, 6], dtype=int64)

In [56]:
similar_users

array([6, 2], dtype=int64)

In [57]:
unrated_products = pivot_df.loc[user_id][pivot_df.loc[user_id] == 0].index
unrated_products


Int64Index([1, 2], dtype='int64')

In [58]:
product_predictions = {}
for product in unrated_products:
    product_ratings = matrix_avg[product].values
    # Only consider ratings from similar users
    similar_ratings = product_ratings[similar_users]
    # Ignore ratings of 0 (unrated products)
    similar_ratings = similar_ratings[similar_ratings != 0]
    if len(similar_ratings) > 0:
        # Calculate the predicted rating as the weighted average of similar ratings
        weights = user_similarity[similar_users][similar_ratings.nonzero()]
        prediction = (similar_ratings * weights).sum() / weights.sum()
        product_predictions[product] = prediction
recommended_products = sorted(product_predictions, key=product_predictions.get, reverse=True)[:4]

    

In [59]:
recommended_products

[1, 2]