# 1. 필요 라이브러리 import 및 사용 데이터 load

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 모든 column을 확인하도록 설정
pd.set_option('display.max_columns', None)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_parquet("/content/drive/MyDrive/cp2_phase2_data/fashion_logs.parquet")
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201009 entries, 0 to 1254584
Data columns (total 23 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   created_at          1201009 non-null  category
 1   session_id          1201009 non-null  int32   
 2   payment_method      1201009 non-null  category
 3   promo_amount        1201009 non-null  int32   
 4   promo_code          1201009 non-null  category
 5   shipment_fee        1201009 non-null  int32   
 6   total_amount        1201009 non-null  int32   
 7   product_id          1201009 non-null  int32   
 8   productDisplayName  1201009 non-null  category
 9   quantity            1201009 non-null  int32   
 10  item_price          1201009 non-null  int32   
 11  product_gender      1201009 non-null  category
 12  baseColor           1201009 non-null  category
 13  season              1201009 non-null  category
 14  year                1201009 non-null  int32   
 15

In [4]:
data.head()

Unnamed: 0,created_at,session_id,payment_method,promo_amount,promo_code,shipment_fee,total_amount,product_id,productDisplayName,quantity,item_price,product_gender,baseColor,season,year,usage,Category,customer_id,customer_gender,birthdate,device_type,home_location,first_join_date
0,2018-07-29T15:22:01,194913,Debit Card,1415,WEEKENDSERU,10000,199832,54728,Vans Men Black Shoes,1,191247,Men,Black,Summer,2012,Casual,Footwear Shoes Casual Shoes,5868,F,2000-08-20,Android,Jakarta Raya,2018-07-28
1,2018-07-30T12:40:22,155874,Credit Card,0,No,10000,155526,16193,Puma Men Knitted Vest Green Sweater,1,145526,Men,Green,Fall,2011,Casual,Apparel Topwear Sweaters,4774,F,1996-03-15,Android,Sulawesi Selatan,2018-07-28
2,2018-09-15T11:51:17,489413,OVO,0,No,10000,550696,53686,Kiara Women Purple & Yellow Handbag,4,135174,Women,Purple,Summer,2012,Casual,Accessories Bags Handbags,4774,F,1996-03-15,Android,Sulawesi Selatan,2018-07-28
3,2018-11-01T11:23:48,627839,Credit Card,0,No,0,271012,20228,Wrangler Women Cable Red Sweater,1,271012,Women,Red,Fall,2011,Casual,Apparel Topwear Sweaters,4774,F,1996-03-15,Android,Sulawesi Selatan,2018-07-28
4,2018-12-18T11:20:30,742846,Credit Card,0,No,0,198753,55220,Lakme Absolute Forever Silk Chestnut Lip Liner 03,1,198753,Women,Brown,Spring,2017,Casual,Personal Care Lips Lip Liner,4774,F,1996-03-15,Android,Sulawesi Selatan,2018-07-28


# 2. 베이스라인 모델 구축
- 가장 인기있는 상위 10개 추천

## 1. 학습 데이터와 테스트 데이터 분리

In [5]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=2)

In [6]:
print("학습 데이터의 수 :", train_data.shape[0])
print("테스트 데이터의 수 :", test_data.shape[0])

학습 데이터의 수 : 960807
테스트 데이터의 수 : 240202


## 2. 빈도가 높은 상위 아이템 10개 추출

In [9]:
# 빈도가 높은 상위 20개 아이템 추출
popular_item = train_data['product_id'].value_counts()
top_20 = popular_item.head(20)
top_20

3933     44
40615    44
35425    43
37848    43
12787    42
55022    42
26988    42
54932    42
19558    40
40020    40
12836    40
24135    39
4391     39
21753    39
19223    39
5675     39
5501     39
58871    39
2116     38
56040    38
Name: product_id, dtype: int64

In [10]:
# 데이터 프레임으로 변경
recomm_data = pd.DataFrame({'product_id' : top_20.index ,
              'product_amount' : top_20.values})
recomm_data

Unnamed: 0,product_id,product_amount
0,3933,44
1,40615,44
2,35425,43
3,37848,43
4,12787,42
5,55022,42
6,26988,42
7,54932,42
8,19558,40
9,40020,40


# 3. 성능 평가

In [11]:
# precision@k 계산 함수
def precision_k(test_data, recommend_data) : 
    recommend_list = recommend_data['product_id'].to_list()
    common_product = test_data[test_data['product_id'].isin(recommend_list)]

    return len(common_product) / len(recommend_list)

# recall@k 계산 함수 
def recall_k(test_data, recommend_data) : 
    recommend_list = recommend_data['product_id'].to_list()
    common_product = test_data[test_data['product_id'].isin(recommend_list)]

    return len(common_product) / len(test_data)

In [12]:
# 고객 id가 29496인 샘플 데이터로 테스트
sample_data = test_data[test_data['customer_id'] == 29496]

print(precision_k(sample_data, recomm_data))
print(recall_k(sample_data, recomm_data))

0.0
0.0


In [13]:
# 모든 고객 id에 대해서 테스트
customer_list = test_data['customer_id'].unique()
precision_total = 0
recall_total = 0

for customer_id in customer_list : 
    sample_data = test_data[test_data['customer_id'] == customer_id]
    precision_total += precision_k(sample_data, recomm_data)
    recall_total += recall_k(sample_data, recomm_data)

print("precision :", precision_total/len(customer_list))
print("reall :", recall_total/len(customer_list))

precision : 0.00013472549680026923
reall : 0.00036301053382020325
