## order=해당 카테고리 주문 수 / 특정 유저 총 주문 수 (특정 유저의 카테고리별 주문율 파악 가능)

## 기존 data에 order 데이터 합치기

In [9]:
import pandas

from sklearn.metrics.pairwise import cosine_similarity

# Load the per-(user, category) order counts; the columns used below are
# 'orderFrequency' (orders in that category) and 'total' (the user's total).
data = pandas.read_csv('file/user_order_data.csv')

# Ratio of category orders to the user's total orders, kept as a one-column
# frame named 'order' so it can be merged back on the row index.
orderFrequency = data['orderFrequency'].div(data['total']).rename('order').to_frame()

# Append the ratio column to the original rows (index-to-index join).
data = data.merge(orderFrequency, left_index=True, right_index=True)

print(data)

     userId  categoryId  orderFrequency  total     order
0         1           1              10    215  0.046512
1         1           2              21    215  0.097674
2         1           3               5    215  0.023256
3         1           4              16    215  0.074419
4         1           5               7    215  0.032558
..      ...         ...             ...    ...       ...
367      31           8              30    266  0.112782
368      31           9              30    266  0.112782
369      31          10               9    266  0.033835
370      31          11              10    266  0.037594
371      31          12              49    266  0.184211

[372 rows x 5 columns]


## index=userId, columns=categoryId, values=order 값으로 채워지는 행렬로 치환


In [11]:
# Reshape the long table into a user x category matrix of order ratios:
# one row per userId, one column per categoryId.
# pivot() leaves NaN wherever a (userId, categoryId) pair has no row in the
# input; cosine_similarity rejects NaN input, so treat a missing pair as
# "no orders in that category" (ratio 0). When every pair is present this
# is a no-op.
pivot_data = data.pivot(index='userId', columns='categoryId', values='order').fillna(0)

print(pivot_data)

categoryId        1         2         3         4         5         6   \
userId                                                                   
1           0.046512  0.097674  0.023256  0.074419  0.032558  0.065116   
2           0.055556  0.134259  0.138889  0.069444  0.087963  0.148148   
3           0.055814  0.069767  0.074419  0.106977  0.097674  0.134884   
4           0.079365  0.074074  0.095238  0.089947  0.063492  0.047619   
5           0.010067  0.003356  0.093960  0.060403  0.114094  0.137584   
6           0.090452  0.080402  0.065327  0.040201  0.120603  0.115578   
7           0.074205  0.176678  0.038869  0.106007  0.045936  0.067138   
8           0.150160  0.028754  0.137380  0.006390  0.028754  0.092652   
9           0.026455  0.052910  0.153439  0.026455  0.063492  0.084656   
10          0.141361  0.104712  0.094241  0.031414  0.010471  0.041885   
11          0.011152  0.144981  0.092937  0.022305  0.026022  0.059480   
12          0.046832  0.057851  0.1101

## 생성한 pivot_data로 유사도 계산

In [13]:
# Pairwise cosine similarity between the rows of pivot_data (one row per
# user); with a single input the matrix is compared against itself, giving
# a square users x users array.
user_similarity = cosine_similarity(X=pivot_data)

print("user_similarity\n\n", user_similarity)

user_similarity

 [[1.         0.76009722 0.80254962 0.90028441 0.76239155 0.78714053
  0.81792375 0.80147549 0.77276619 0.80645525 0.81094501 0.83429242
  0.85868736 0.74254692 0.85024619 0.79058199 0.73647113 0.82079547
  0.70717049 0.86378334 0.87413959 0.71495951 0.86830757 0.78207984
  0.76870444 0.82945369 0.73961565 0.75128084 0.77643764 0.83054189
  0.809753  ]
 [0.76009722 1.         0.87789445 0.76343676 0.79946059 0.8237541
  0.8048533  0.72758607 0.76099484 0.74867374 0.77633691 0.8931051
  0.91195801 0.87322528 0.78324602 0.82112766 0.84371694 0.80490676
  0.7716812  0.80905685 0.72658    0.88964971 0.7717934  0.74895785
  0.70756541 0.77602794 0.79396856 0.65890157 0.74516553 0.90993337
  0.72234949]
 [0.80254962 0.87789445 1.         0.71880842 0.7686912  0.66449513
  0.67320309 0.72999415 0.67835664 0.63504029 0.64561835 0.92819953
  0.8698934  0.81295495 0.7508622  0.64440729 0.80508438 0.69464173
  0.67456362 0.78074412 0.64761712 0.72733344 0.71976431 0.71028961
  0.

## 계산한 user_similarity 행렬을 

## index=pivot_data.index(=userId), columns=pivot_data.index(=userId), values=유사도값

## 행렬로 치환

In [14]:
# Label both axes of the similarity array with the userIds from pivot_data
# so that similarities can be looked up by user id instead of by position.
user_ids = pivot_data.index
user_similarity_df = pandas.DataFrame(
    user_similarity,
    index=user_ids,
    columns=user_ids,
)

print("result\n", user_similarity_df)

result
 userId        1         2         3         4         5         6         7   \
userId                                                                         
1       1.000000  0.760097  0.802550  0.900284  0.762392  0.787141  0.817924   
2       0.760097  1.000000  0.877894  0.763437  0.799461  0.823754  0.804853   
3       0.802550  0.877894  1.000000  0.718808  0.768691  0.664495  0.673203   
4       0.900284  0.763437  0.718808  1.000000  0.683819  0.814598  0.813160   
5       0.762392  0.799461  0.768691  0.683819  1.000000  0.785574  0.682288   
6       0.787141  0.823754  0.664495  0.814598  0.785574  1.000000  0.757849   
7       0.817924  0.804853  0.673203  0.813160  0.682288  0.757849  1.000000   
8       0.801475  0.727586  0.729994  0.799915  0.699126  0.728874  0.576352   
9       0.772766  0.760995  0.678357  0.811381  0.703597  0.777721  0.672971   
10      0.806455  0.748674  0.635040  0.750437  0.685011  0.848838  0.730176   
11      0.810945  0.776337  0.64

## 생성한 user_similarity_df 행렬 -> user_similarity_df[x] 로 x번 유저의 유사도값 가져오기 가능

## 후에 함수 파라미터로 인덱스값 받을 예정

In [15]:
print('-------------------------------------------------------------------------------')

# Column 1 holds every user's similarity to user 1. Sort it from most to
# least similar and show the first ten entries (user 1 itself is included,
# with similarity 1.0).
similarity_to_user = user_similarity_df[1]
ranked = similarity_to_user.sort_values(ascending=False)
print(ranked.head(10))

-------------------------------------------------------------------------------
userId
1     1.000000
4     0.900284
21    0.874140
23    0.868308
20    0.863783
13    0.858687
15    0.850246
12    0.834292
30    0.830542
26    0.829454
Name: 1, dtype: float64
