In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

In [2]:
df = pd.read_excel('train_data_2023.xlsx')

In [3]:
# 이상치 제거
pivot = df.pivot_table(index = 'Customer_ID', values = 'Order_Date', aggfunc = 'count' )

In [4]:
pivot[pivot['Order_Date'] < 10].index

Int64Index([100018, 100021, 100042, 100051, 100068, 100080, 100097, 100137,
            100144, 100155,
            ...
            999899, 999905, 999910, 999924, 999925, 999935, 999952, 999987,
            999994, 999998],
           dtype='int64', name='Customer_ID', length=104098)

In [5]:
df0 = df[df['Customer_ID'].isin(pivot[pivot['Order_Date'] < 10].index)]

In [8]:
df0['rating'] = df0.groupby(['Customer_ID', 'Product_No'])['Order_Date'].transform('count')
df0 = df0.drop_duplicates(subset=['Customer_ID', 'Product_No'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0['rating'] = df0.groupby(['Customer_ID', 'Product_No'])['Order_Date'].transform('count')


In [9]:
reader = Reader(rating_scale=(df0['rating'].min(), df0['rating'].max()))
data = Dataset.load_from_df(df0[['Customer_ID', 'Product_No', 'rating']], reader)

In [10]:
algo = SVD()

In [11]:
# 교차 검증
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4911  0.4868  0.4930  0.5021  0.5128  0.4972  0.0093  
MAE (testset)     0.2949  0.2936  0.2943  0.2978  0.2990  0.2959  0.0021  
Fit time          1.82    1.87    1.96    1.91    1.86    1.88    0.05    
Test time         0.19    0.19    0.35    0.26    0.27    0.25    0.06    


{'test_rmse': array([0.49106741, 0.48679781, 0.49303738, 0.50211304, 0.51278667]),
 'test_mae': array([0.29489472, 0.29360985, 0.29433701, 0.29775518, 0.29904802]),
 'fit_time': (1.819817304611206,
  1.8682701587677002,
  1.955850601196289,
  1.9113245010375977,
  1.864725112915039),
 'test_time': (0.19312167167663574,
  0.1865217685699463,
  0.3456716537475586,
  0.26001858711242676,
  0.2707836627960205)}

In [12]:
# 전체 데이터 학습
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x214a2f5fa60>

In [13]:
# 고객별 제품 추천
recommendations = []
for customer_id in df0['Customer_ID'].unique():
    customer_ratings = df0[df0['Customer_ID'] == customer_id]
    recommended_products = []
    for product in df0['Product_No'].unique():
        if product not in customer_ratings['Product_No']:
            prediction = algo.predict(customer_id, product)
            recommended_products.append((product, prediction[3]))
    recommended_products.sort(key=lambda x: x[1], reverse=True)
    recommendations.append({'Customer_ID': customer_id, 'Recommended_Products': [product for product, _ in recommended_products[:1]]})
recommendations_df = pd.DataFrame(recommendations)

In [14]:
print(recommendations_df)

        Customer_ID Recommended_Products
0            881247              [10712]
1            319285              [10715]
2            956846              [10715]
3            260350              [10712]
4            938614              [10712]
...             ...                  ...
104093       954101              [10712]
104094       365038              [10712]
104095       939046              [10715]
104096       227381              [10715]
104097       352930              [10715]

[104098 rows x 2 columns]
