# 1.Sparse 한 matrix를 채우기

In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans

from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [38]:
#rating_data = pd.read_csv("data/train.csv")
rating_data = pd.read_csv("data/train.csv")
rating_data=rating_data.sort_values(by = 'user_id', ascending = True)
rating_data=rating_data.reset_index(drop=True, inplace = False)
print('Row_n :',rating_data.shape[0],'Col_n :',rating_data.shape[1])
print()
print(rating_data.head())

Row_n : 1254441 Col_n : 3

   user_id  item_id  rating
0        0    15825     5.0
1        0    41241     5.0
2        0    29131     5.0
3        1    14309     5.0
4        1    52670     5.0


In [39]:
rating_data = rating_data[200048:300046]
data = rating_data.pivot_table(values='rating', index='item_id', columns='user_id')
data

user_id,30718,30719,30720,30721,30722,30723,30724,30725,30726,30727,...,46091,46092,46093,46094,46095,46096,46097,46098,46099,46100
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62994,,,,,,,,,,,...,,,,,,,,,,
62995,,,,,,,,,,,...,,,,,,,,,,
62996,,,,,,,,,,,...,,,,,,,,,,
62998,,,,,,,,,,,...,,,,,,,,,,


# 후우..

In [40]:
print('item_num :',data.shape[0])
print('user_num :',data.shape[1])

item_num : 34653
user_num : 15383


# Now let's matrix factorization

In [7]:
from sklearn.metrics import mean_squared_error
from tqdm import trange

In [14]:
class SGD:

    def __init__(self, sparse_matrix, K, lr, beta, n_epochs):
        """
        Arguments
        - sparse_matrix : user-item rating matrix
        - K (int)       : number of latent dimensions
        - lr (float) : learning rate
        - beta (float)  : regularization parameter
        - n_epochs (int) : Num of Iteration
        """
        # convert ndArray
        self.sparse_matrix = sparse_matrix.fillna(0).to_numpy()
        self.item_n, self.user_n = sparse_matrix.shape
        self.K = K
        self.lr = lr
        self.beta = beta
        self.n_epochs = n_epochs

    def train(self):
        # Initialize user and item latent feature matrice
        self.I = np.random.normal(scale=1./self.K, size=(self.item_n, self.K)) # scale = std
        self.U = np.random.normal(scale=1./self.K, size=(self.user_n, self.K))

        # Init biases
        self.item_bias = np.zeros(self.item_n)
        self.user_bias = np.zeros(self.user_n)
        self.total_mean = np.mean(self.sparse_matrix[np.where(self.sparse_matrix != 0)])

        # Create training Samples
        idx, jdx = self.sparse_matrix.nonzero()
        samples = list(zip(idx, jdx))

        training_log = []
        progress = trange(self.n_epochs, desc="train-rmse: nan")
        for idx in progress:
            np.random.shuffle(samples)

            for i, u in samples:
                # get error
                y = self.sparse_matrix[i, u]
                pred = self.predict(i, u)
                error = y - pred
                # update bias
                self.item_bias[i] += self.lr * (error - self.beta * self.item_bias[i])
                self.user_bias[u] += self.lr * (error - self.beta * self.user_bias[u])
                # update latent factors
                I_i = self.I[i,:][:]
                self.I[i, :] += self.lr * (error * self.U[u,:] - self.beta * self.I[i,:])
                self.U[u, :] += self.lr * (error * I_i - self.beta * self.U[u,:])

            rmse = self.evaluate()
            progress.set_description("train-rmse: %0.6f" % rmse)
            progress.refresh()
            training_log.append((idx, rmse))

        self.pred_matrix =  self.get_pred_matrix()

    def predict(self, i, u):
        """
        :param i: item index
        :param u: user index
        :return: predicted rating
        """
        return (
            self.total_mean
            + self.item_bias[i]
            + self.user_bias[u]
            + self.U[u,:].dot(self.I[i,:].T)
        )

    def get_pred_matrix(self):
        return (
            self.total_mean
            + self.item_bias[:,np.newaxis]
            + self.user_bias[np.newaxis:,]
            + self.I.dot(self.U.T)
        )

    def evaluate(self):
        idx, jdx = self.sparse_matrix.nonzero()
        pred_matrix = self.get_pred_matrix()
        ys, preds = [], []
        for i, j in zip(idx, jdx):
            ys.append(self.sparse_matrix[i, j])
            preds.append(pred_matrix[i, j])

        error = mean_squared_error(ys, preds)
        return np.sqrt(error)

    def test_evaluate(self, test_set):
        pred_matrix = self.get_pred_matrix()
        ys, preds = [], []
        for i, j, rating in test_set:
            ys.append(rating)
            preds.append(pred_matrix[i, j])

        error = mean_squared_error(ys, preds)
        return np.sqrt(error)

In [41]:
# Create an instance of SGD with parameters
K = 3
lr = 0.01
beta = 0.02
n_epochs = 100

sgd_model = SGD(data, K, lr, beta, n_epochs)

# Train the model
sgd_model.train()

# Evaluate the model on the training set
train_rmse = sgd_model.evaluate()
print("Root Mean Squared Error (RMSE) on training set:", train_rmse)

train-rmse: 0.351484: 100%|██████████| 100/100 [20:17<00:00, 12.18s/it]


Root Mean Squared Error (RMSE) on training set: 0.3514838018988879


In [42]:
# 채워진 matrix
mat=sgd_model.get_pred_matrix()
print(mat.shape) # item x user

# item과 matrix사이의 변경
mat=np.transpose(mat)
print(mat.shape) # user x item

(34653, 15383)
(15383, 34653)


In [43]:
mat=pd.DataFrame(mat)
mat=mat.rename_axis('item', axis = 1).rename_axis('user', axis = 0)
mat.index = data.columns
mat.columns = data.index
mat

item_id,1,2,3,5,7,8,14,16,17,18,...,62981,62982,62985,62989,62993,62994,62995,62996,62998,62999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30718,4.147405,4.745171,4.380739,3.820746,5.141764,5.377504,5.209317,2.259370,4.325188,4.150536,...,4.497711,4.546698,3.451289,1.993382,4.461280,4.634247,3.780770,3.780208,3.561718,4.511195
30719,5.315077,4.930237,5.386291,3.393387,4.948165,5.229846,5.104580,4.248225,4.814328,4.658821,...,4.764362,5.186408,3.121766,3.907931,4.966874,5.324160,4.732625,3.852967,3.796458,4.945417
30720,4.934953,2.862026,3.806036,4.569187,4.274771,4.261881,4.118327,4.405470,4.240181,4.369240,...,4.164586,4.218017,3.837030,4.581575,3.622493,4.653464,4.403265,2.907407,4.091753,4.459626
30721,4.719573,4.664814,5.119753,3.273607,4.211328,4.392067,4.196293,3.917672,4.253488,4.466203,...,3.825859,3.778164,3.313007,4.022864,4.251144,4.655346,3.876217,3.336566,2.881872,4.246405
30722,4.744892,3.660858,4.125032,2.796942,4.673342,5.008505,4.964131,3.354307,4.244136,3.678803,...,4.705708,5.595726,1.999945,2.572625,4.409640,4.843218,4.540622,3.192284,3.926036,4.582269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46096,5.649649,3.224417,4.864207,3.411154,3.615075,3.689249,3.562072,5.926650,4.333212,4.570071,...,3.860295,4.185215,3.061378,6.082296,3.888011,4.930279,4.759536,2.704564,3.500803,4.405480
46097,3.384096,5.041668,4.423413,3.095005,4.456952,4.695750,4.458071,1.498268,3.727589,3.824193,...,3.538419,3.210248,3.212256,1.503667,3.992946,3.944798,2.746952,3.403418,2.292120,3.742184
46098,3.155931,3.837880,3.849891,2.405146,3.288688,3.456030,3.224584,1.971111,3.040811,3.303542,...,2.627574,2.328995,2.527483,2.163379,3.079415,3.332018,2.344108,2.373533,1.603369,3.015436
46099,4.972628,5.053197,5.200920,2.741314,5.066282,5.470199,5.379920,3.433929,4.654148,4.212955,...,4.874600,5.573329,2.389382,2.725841,5.087802,5.204790,4.573411,3.856075,3.671309,4.853225


In [44]:
# 5를 초과하는 값은 5로 대체
mat = mat.clip(upper=5)
mat

item_id,1,2,3,5,7,8,14,16,17,18,...,62981,62982,62985,62989,62993,62994,62995,62996,62998,62999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30718,4.147405,4.745171,4.380739,3.820746,5.000000,5.000000,5.000000,2.259370,4.325188,4.150536,...,4.497711,4.546698,3.451289,1.993382,4.461280,4.634247,3.780770,3.780208,3.561718,4.511195
30719,5.000000,4.930237,5.000000,3.393387,4.948165,5.000000,5.000000,4.248225,4.814328,4.658821,...,4.764362,5.000000,3.121766,3.907931,4.966874,5.000000,4.732625,3.852967,3.796458,4.945417
30720,4.934953,2.862026,3.806036,4.569187,4.274771,4.261881,4.118327,4.405470,4.240181,4.369240,...,4.164586,4.218017,3.837030,4.581575,3.622493,4.653464,4.403265,2.907407,4.091753,4.459626
30721,4.719573,4.664814,5.000000,3.273607,4.211328,4.392067,4.196293,3.917672,4.253488,4.466203,...,3.825859,3.778164,3.313007,4.022864,4.251144,4.655346,3.876217,3.336566,2.881872,4.246405
30722,4.744892,3.660858,4.125032,2.796942,4.673342,5.000000,4.964131,3.354307,4.244136,3.678803,...,4.705708,5.000000,1.999945,2.572625,4.409640,4.843218,4.540622,3.192284,3.926036,4.582269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46096,5.000000,3.224417,4.864207,3.411154,3.615075,3.689249,3.562072,5.000000,4.333212,4.570071,...,3.860295,4.185215,3.061378,5.000000,3.888011,4.930279,4.759536,2.704564,3.500803,4.405480
46097,3.384096,5.000000,4.423413,3.095005,4.456952,4.695750,4.458071,1.498268,3.727589,3.824193,...,3.538419,3.210248,3.212256,1.503667,3.992946,3.944798,2.746952,3.403418,2.292120,3.742184
46098,3.155931,3.837880,3.849891,2.405146,3.288688,3.456030,3.224584,1.971111,3.040811,3.303542,...,2.627574,2.328995,2.527483,2.163379,3.079415,3.332018,2.344108,2.373533,1.603369,3.015436
46099,4.972628,5.000000,5.000000,2.741314,5.000000,5.000000,5.000000,3.433929,4.654148,4.212955,...,4.874600,5.000000,2.389382,2.725841,5.000000,5.000000,4.573411,3.856075,3.671309,4.853225


In [37]:
#mat=pd.read_csv('mat3.csv', index_col = 'user_id')

In [86]:
# matrix 미리 저장하기
#mat.to_csv('mat3.csv', index = True)

# 2.PCA

In [19]:
text = np.load('data/text.npy')

scaler = StandardScaler()
standardized_data = scaler.fit_transform(text)

pca = PCA(n_components = 129)
reduced_data=pca.fit_transform(standardized_data)

# 3.K-Means

In [20]:
seed_value = 42
kmeans = KMeans(n_clusters = 9, random_state=seed_value)
kmeans.fit(reduced_data)

In [21]:
kmeans.predict(reduced_data)

array([7, 4, 7, ..., 8, 7, 1])

In [22]:
# item 별로 category 지정
df = pd.DataFrame({
    'item_num': np.arange(0, 63001),
    'category': kmeans.predict(reduced_data)
})

# df는 전체 데이터를 9개로 범주화 한 자료
df

Unnamed: 0,item_num,category
0,0,7
1,1,4
2,2,7
3,3,6
4,4,6
...,...,...
62996,62996,7
62997,62997,0
62998,62998,8
62999,62999,7


In [28]:
# mat에서 첫 번째 user가 구매한 아이템들 
# 중에서 3점 이상의 item을 filter
a=mat.iloc[0][mat.iloc[0]>=3].index
a

Int64Index([    3,     4,     5,     7,    10,    11,    12,    14,    15,
               18,
            ...
            62977, 62979, 62981, 62982, 62984, 62989, 62991, 62994, 62998,
            62999],
           dtype='int64', name='item_id', length=33891)

In [29]:
(df[df.item_num.isin(a)].category.value_counts().sort_index()/len(df[df.item_num.isin(a)]))

0    0.158833
1    0.103449
2    0.164586
3    0.133782
4    0.084329
5    0.076451
6    0.089286
7    0.081142
8    0.108141
Name: category, dtype: float64

In [30]:
df[df.item_num.isin(a)]

Unnamed: 0,item_num,category
3,3,5
4,4,5
5,5,7
7,7,6
10,10,4
...,...,...
62989,62989,4
62991,62991,4
62994,62994,6
62998,62998,4


# Final Algorithm

In [45]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

list1 = []
error_occurred = False

for i in range(0,15383):
    try:
        if len(mat.iloc[i][mat.iloc[i] >= 3].index) > 0 :
        # i번째 데이터, 평점이 3점 이상되는 item들
            a = mat.iloc[i][mat.iloc[i] >= 3].index

        else :
            a = mat.iloc[i][mat.iloc[i] >= 2.5].index

        # df는 전체 데이터를 9개로 범주화 한 자료
        # 해당 a 데이터에 해당하는 범주를 알아본다.
        df1 = df[df.item_num.isin(a)]

        ##-------------------------------------------------------------------------------
        # 비율대로 추출했을 때 무조건 50개로 맞춰주는 함수 (48,49 같은 수 나오지 않게)

        # Given ratios
        ratios = df1.category.value_counts().sort_index() / len(df1)


        # Multiply by 50 and round to integers
        rounded_samples = np.ceil(ratios * 50).astype(int)

        # Calculate the adjustment needed to make the sum 50
        adjustment = 50 - np.sum(rounded_samples)

        # Adjust the rounded samples to make the sum exactly 50
        if adjustment > 0:
            # If adjustment is positive, add 1 to the ratios with the largest fractional part
            while adjustment > 0:
                idx = np.argmax(ratios - np.floor(ratios))
                idx = ratios.index[idx]
                rounded_samples[idx] += 1
                adjustment -= 1
                ratios[idx] = 0.0  # Set the adjusted ratio to 0 to prevent it from being selected again
        else:
            # If adjustment is negative, subtract 1 from the ratios with the smallest fractional part
            while adjustment < 0:
                idx = np.argmin(ratios - np.ceil(ratios))
                idx = ratios.index[idx]
                rounded_samples[idx] -= 1

                adjustment += 1

                ratios[idx] = 1.0

                # Set the adjusted ratio to 1 to prevent it from being selected again
        ##-------------------------------------------------------------------------------

        list2 = []

        # j번 째 범주
        for j in range(9):
            # 0 ~ 8까지 9개의 범주중에서 원하는 범주 j의 값을 뽑아낸다.
            idx = df1.item_num[df1.category == j]

            # this is really important without this NaN happens
            if len(idx) == 0:
                # Skip this iteration if 'idx' is empty
                continue


            # j범주 벡터 평균
            mean_vec = np.mean(reduced_data[idx], axis=0) # (129,)

            mean_vec = mean_vec.reshape(1, -1)  # (1, 129)

            # 전체 데이터에서 j범주의 벡터값
            # reduced_data는 63001개의 item들의 129차원 벡터값
            category_j = reduced_data[df.item_num[df.category == j]]  # (num, 129)

            # 코사인 유사도
            cosine_similarities = cosine_similarity(category_j, mean_vec)

            # 코사인 유사도가 가장 큰 값 n개를 추출 (n개는 비율에 따라 다르다.)
            x = np.argsort(cosine_similarities.flatten())[::-1][:rounded_samples[j]]
            # 결과 값은 아이템 index이다.

            # j : 0 ~ 8 범주의 추천된 item_index
            list2.append(x)

        # rating score에 따라서 내림차순으로 정렬하기
        data_array = np.concatenate(list2)
        user_i = mat.iloc[i]
        user_i_dict = user_i.to_dict()
        sorted_data_array = sorted(data_array, key=lambda x: user_i_dict.get(x, 0), reverse=True)

        list1.append(sorted_data_array)

    except Exception as e:
        print(f"Error occurred at i = {i}")
        print(e)
        error_occurred = True
        break

if not error_occurred:
    print("No error found in the loop.")


No error found in the loop.


___

# 왜 2167번 user에서 NA값이 발생하는 건가?

In [125]:
# i = 2167
a = mat.iloc[2167][mat.iloc[2167] >= 3].index
# df는 전체 데이터를 9개로 범주화 한 자료
# 해당 a 데이터에 해당하는 범주를 알아본다.
df1 = df[df.item_num.isin(a)]

In [120]:
# 비율대로 추출했을 때 무조건 50개로 맞춰주는 함수 (48,49 같은 수 나오지 않게)

# Given ratios
ratios = df1.category.value_counts().sort_index() / len(df1)

# Multiply by 50 and round to integers
rounded_samples = np.round(ratios * 50).astype(int)

# Calculate the adjustment needed to make the sum 50
adjustment = 50 - np.sum(rounded_samples)

# Adjust the rounded samples to make the sum exactly 50
if adjustment > 0:
    # If adjustment is positive, add 1 to the ratios with the largest fractional part
    while adjustment > 0:
                idx = np.argmax(ratios - np.floor(ratios))
                rounded_samples[idx] += 1
                adjustment -= 1
                ratios[idx] = 0.0  # Set the adjusted ratio to 0 to prevent it from being selected again
else:
    # If adjustment is negative, subtract 1 from the ratios with the smallest fractional part
    while adjustment < 0:
        idx = np.argmin(ratios - np.ceil(ratios))
        rounded_samples[idx] -= 1
        adjustment += 1
        ratios[idx] = 1.0  # Set the adjusted ratio to 1 to prevent it from being selected again

In [133]:
rounded_samples

7    25
8    25
Name: category, dtype: int32

In [146]:
list2 = []

        # j번 째 범주
for j in range(9):
    # 0 ~ 8까지 9개의 범주중에서 원하는 범주 j의 값을 뽑아낸다.
    idx = df1.item_num[df1.category == j]

    # j범주 벡터 평균
    mean_vec = np.mean(reduced_data[idx], axis=0)  # (129,)
    mean_vec = mean_vec.reshape(1, -1)  # (1, 129)

    # 전체 데이터에서 j범주의 벡터값
    # reduced_data는 63001개의 item들의 129차원 벡터값
    category_j = reduced_data[df.item_num[df.category == j]]  # (num, 129)

    # 코사인 유사도
    cosine_similarities = cosine_similarity(category_j, mean_vec)

    # 코사인 유사도가 가장 큰 값 n개를 추출 (n개는 비율에 따라 다르다.)
    x = np.argsort(cosine_similarities.flatten())[::-1][:rounded_samples[j]]
    # 결과 값은 아이템 index이다.

    # j : 0 ~ 8 범주의 추천된 item_index
    list2.append(x)

In [155]:
np.mean(reduced_data[df1.item_num[df1.category == 0]], axis = 0).reshape(1,-1)

array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]],
      dtype=float32)

In [156]:
# 0범주에 해당하는 값들
reduced_data[df.item_num[df.category == 0]]

array([[ 2.6334474 ,  7.7918153 , -3.516437  , ..., -0.91045177,
        -0.70561254,  1.3215618 ],
       [ 0.34296054,  6.7369466 , -0.23475124, ...,  1.2395616 ,
         0.13039902, -0.38900805],
       [ 6.081248  ,  2.5320532 ,  0.9740667 , ...,  0.32466555,
         0.32457116,  0.78496784],
       ...,
       [-0.54207206,  9.502971  , -0.35757032, ..., -1.1682967 ,
         0.12579405, -0.4558694 ],
       [ 2.0056179 ,  9.783293  , -3.0049877 , ..., -0.2023496 ,
         0.8129351 ,  0.88811517],
       [ 6.3553977 ,  3.5443609 , -1.3126819 , ...,  1.3850862 ,
         1.2894734 , -0.8252573 ]], dtype=float32)

In [None]:
# 해당 카테고리의 값이 비율에 전혀 없으면 평균을 낼 수 있는 값이 없다.

In [157]:
list2 = []

# j번 째 범주
for j in range(9):
    # 0 ~ 8까지 9개의 범주중에서 원하는 범주 j의 값을 뽑아낸다.
    idx = df1.item_num[df1.category == j]

    if len(idx) == 0:
        # Skip this iteration if 'idx' is empty
        continue

    # j범주 벡터 평균
    mean_vec = np.mean(reduced_data[idx], axis=0)  # (129,)
    mean_vec = mean_vec.reshape(1, -1)  # (1, 129)

    # 전체 데이터에서 j범주의 벡터값
    # reduced_data는 63001개의 item들의 129차원 벡터값
    category_j = reduced_data[df.item_num[df.category == j]]  # (num, 129)

    # 코사인 유사도
    cosine_similarities = cosine_similarity(category_j, mean_vec)

    # 코사인 유사도가 가장 큰 값 n개를 추출 (n개는 비율에 따라 다르다.)
    x = np.argsort(cosine_similarities.flatten())[::-1][:rounded_samples[j]]
    # 결과 값은 아이템 index이다.

    # j : 0 ~ 8 범주의 추천된 item_index
    list2.append(x)

In [160]:
len(np.concatenate(list2))

50

## yeah~!! Problem Solved! 
## After all the loop is done lets check list1

In [74]:
# 15272개의 데이터 전부 있음 확인 완료!
len(list1)

15446

In [76]:
#list1

# list1을 csv로 저장하기

In [46]:
import csv
with open("df3.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(list1)

In [47]:
data1=pd.read_csv('df3.csv', header=None)
data1.columns = range(1,51)
data1=data1.rename_axis('user_id', axis = 0)
data1=data1.rename_axis('item_id', axis = 1)
data1

item_id,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4711,684,511,3741,4946,465,3650,1177,3985,5115,...,2089,5647,3167,1789,9480,9448,50,6933,3659,2084
1,4711,684,511,3741,4946,1177,5218,465,3650,3985,...,2089,5647,3167,1789,9480,9448,50,6933,3659,9252
2,5218,4346,1453,4397,4946,684,465,4597,6446,3950,...,5647,3167,1789,9480,9448,50,6933,9252,3659,4245
3,3650,465,4946,4397,5218,4156,4711,3985,3741,3950,...,5647,3167,1789,9480,9448,50,6933,3659,9252,2084
4,684,511,4946,3650,1177,465,4711,2819,5599,819,...,2089,5647,3167,1789,9480,9448,50,6933,3659,2084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15378,4597,5218,4397,5599,4946,4346,4156,3950,2187,1453,...,5647,3167,1789,9480,9448,50,6933,9252,3659,4245
15379,3741,465,4711,511,684,3985,4946,3650,73,1177,...,2089,5647,3167,1789,9480,9448,50,6933,3659,2084
15380,4393,465,3741,3650,3985,4946,3950,684,4156,511,...,5647,3167,1789,9480,9448,50,6933,3659,2084,4245
15381,4711,684,511,3741,4946,1177,465,3650,73,2819,...,2089,5647,3167,1789,9480,9448,50,6933,3659,2084
