In [20]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split

from typing import Dict, Text, Tuple

In [21]:
# 필요한 데이터 준비
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from collections import defaultdict

# synthesize data
NUM_USERS = 10_000
NUM_ITEMS = 779
user_id = np.arange(start = 0, stop = NUM_USERS)
item_id = np.arange(start = 0, stop = NUM_ITEMS)
np.random.seed(42)

user_item_dict = defaultdict(list)

for id in user_id:
    
    # random the number of item generation
    # for each user, random 3 to 5 items to be rated.
    num_rand_item = np.random.randint(low = 3, high = 5)

    # random from the item_id
    rand_items = np.random.choice(item_id, size = num_rand_item, replace = False)

    # random rating for each itme_id
    rand_rating = np.random.randint(low = 1, high = 10, size = num_rand_item)

    # collect the user-item paris.
    for uid, iid,rating in zip([id] * num_rand_item, rand_items, rand_rating):
        user_item_dict['user_id'].append(uid)
        user_item_dict['item_id'].append(iid)
        user_item_dict['rating'].append(rating)

# prepare dataframe
ratings = pd.DataFrame(user_item_dict)
print("Rating Dataframe")
ratings[['user_id','item_id']] = ratings[['user_id','item_id']].astype(str)
display(ratings.head())

Rating Dataframe


Unnamed: 0,user_id,item_id,rating
0,0,595,3
1,0,587,9
2,0,543,7
3,1,441,6
4,1,307,3


In [30]:
# Items 데이터
item_dict = defaultdict(list)
#items = pd.read_csv("animals_crawling.csv")

for iid in item_id:
    item_dict['item_id'].append(iid)
# prepare dataframe
item_ids = pd.DataFrame(item_dict)
items = pd.read_csv("animals_crawling.csv")
items = pd.concat([item_ids, items], axis=1)
print("\nItem Dataframe")
items = items.astype(str)
display(items.head())


Item Dataframe


Unnamed: 0.1,item_id,Unnamed: 0,title,link
0,0,0,"상어, 기린, 호랑이, 고릴라, 낙타는 영어로?ㅣ영어 배우기ㅣ위키와 동물언어_영어ㅣ...",https://youtube.com/watch?v=HRd98hTZb-U
1,1,1,절대로 부활하면 안 되는 멸종 동물!,https://youtube.com/watch?v=c20uGI5Mmvs
2,2,2,심쿵 아기동물 성장기 #OfftheFence #KBS #동물의왕국 (KBS1 202...,https://youtube.com/watch?v=p_wTEHPGAGI
3,3,3,본격 귀여움 참기 챌린지! 역대급 심장 폭행범 ‘꼬물이들.zip’ I TV동물농장 ...,https://youtube.com/watch?v=c9cYIGqdcvA
4,4,4,귀엽고 신기한 동물들을 만나봤어요! 동물편 모음 40분 자연 학습 체험,https://youtube.com/watch?v=VotU3AUcMuk


In [31]:
# 데이터프레임을 딥러닝 모델에 사용하기 위해 dataset 형태로 바꿈
def df_to_ds(df):

    # convert pd.DataFrame to tf.data.Dataset
    ds = tf.data.Dataset.from_tensor_slices(
        (dict(df[['user_id','item_id']]), df['rating']))
    
    # convert Tuple[Dict[Text, tf.Tensor], tf.Tensor] to Dict[Text, tf.Tensor]
    ds = ds.map(lambda x, y: {
    'user_id' : x['user_id'],
    'item_id' : x['item_id'],
    'rating' : y
    })

    return ds.batch(256)

In [32]:
class RankingModel(keras.Model):

    def __init__(self, user_id, item_id, embedding_size):
        super().__init__()
        
        # user model
        input = keras.Input(shape=(), dtype=tf.string)
        x = keras.layers.StringLookup(
            vocabulary = user_id, mask_token = None
            )(input)
        output = keras.layers.Embedding(
            input_dim = len(user_id) + 1,
            output_dim = embedding_size,
            name = 'embedding'
        )(x)
        self.user_model = keras.Model(inputs = input,
                                      outputs = output,
                                      name = 'user_model')

        # item model
        input = keras.Input(shape=(), dtype=tf.string)
        x = keras.layers.StringLookup(
            vocabulary = item_id, mask_token = None
            )(input)
        output = keras.layers.Embedding(
            input_dim = len(item_id) + 1,
            output_dim = embedding_size,
            name = 'embedding'
        )(x)
        self.item_model = keras.Model(inputs = input,
                                  outputs = output,
                                  name = 'item_model')

        # rating model
        user_input = keras.Input(shape=(embedding_size,), name='user_emb')
        item_input = keras.Input(shape=(embedding_size,), name='item_emb')
        x = keras.layers.Concatenate(axis=1)([user_input, item_input])
        x = keras.layers.Dense(256, activation = 'relu')(x)
        x = keras.layers.Dense(64, activation = 'relu')(x)
        output = keras.layers.Dense(1)(x)
        
        self.rating_model = keras.Model(
            inputs = {
                'user_id' : user_input,
                'item_id' : item_input
            },
            outputs = output,
            name = 'rating_model'
        )

    def call(self, inputs: Dict[Text, tf.Tensor]) -> tf.Tensor:

        user_emb = self.user_model(inputs['user_id'])
        item_emb = self.item_model(inputs['item_id'])

        prediction = self.rating_model({
            'user_id' : user_emb,
            'item_id' : item_emb
        })
        
        return prediction

In [33]:
class GMFModel(tfrs.models.Model):

    def __init__(self, user_id, item_id, embedding_size):
        super().__init__()
        self.ranking_model = RankingModel(user_id, item_id, embedding_size)
        self.task = tfrs.tasks.Ranking(
            loss = keras.losses.MeanSquaredError(),
            metrics = [keras.metrics.RootMeanSquaredError()]
        )
    
    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        
        return self.ranking_model(
            {
             'user_id' : features['user_id'], 
             'item_id' : features['item_id']
            })

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

        return self.task(labels = features.pop('rating'),
                         predictions = self.ranking_model(features))



In [55]:
def top_k_items(item_id, top_k, corr_mat, map_name):
    
    # sort correlation value ascendingly and select top_k item_id
    top_items = corr_mat[item_id,:].argsort()[-top_k:][::-1] 
    top_items = [map_name[e] for e in top_items] 

    return top_items

# preprocessing
rated_items = items.loc[items['item_id'].isin(ratings['item_id'])].copy()

# create item-genre matrix
item_mat = rated_items['item_id'].copy()
item_mat = item_mat.set_index(item_id)

# 코사인 유사도 구하기 compute similarity matix
corr_mat = cosine_similarity(item_mat)

# get top-k similar items
ind2name = {ind:name for ind,name in enumerate(item_genre_mat.index)}
name2ind = {v:k for k,v in ind2name.items()}
similar_items = top_k_items(name2ind['99'],
                            top_k = 10,
                            corr_mat = corr_mat,
                            map_name = ind2name)

ValueError: Expected 2D array, got 1D array instead:
array=[  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.
  14.  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.  27.
  28.  29.  30.  31.  32.  33.  34.  35.  36.  37.  38.  39.  40.  41.
  42.  43.  44.  45.  46.  47.  48.  49.  50.  51.  52.  53.  54.  55.
  56.  57.  58.  59.  60.  61.  62.  63.  64.  65.  66.  67.  68.  69.
  70.  71.  72.  73.  74.  75.  76.  77.  78.  79.  80.  81.  82.  83.
  84.  85.  86.  87.  88.  89.  90.  91.  92.  93.  94.  95.  96.  97.
  98.  99. 100. 101. 102. 103. 104. 105. 106. 107. 108. 109. 110. 111.
 112. 113. 114. 115. 116. 117. 118. 119. 120. 121. 122. 123. 124. 125.
 126. 127. 128. 129. 130. 131. 132. 133. 134. 135. 136. 137. 138. 139.
 140. 141. 142. 143. 144. 145. 146. 147. 148. 149. 150. 151. 152. 153.
 154. 155. 156. 157. 158. 159. 160. 161. 162. 163. 164. 165. 166. 167.
 168. 169. 170. 171. 172. 173. 174. 175. 176. 177. 178. 179. 180. 181.
 182. 183. 184. 185. 186. 187. 188. 189. 190. 191. 192. 193. 194. 195.
 196. 197. 198. 199. 200. 201. 202. 203. 204. 205. 206. 207. 208. 209.
 210. 211. 212. 213. 214. 215. 216. 217. 218. 219. 220. 221. 222. 223.
 224. 225. 226. 227. 228. 229. 230. 231. 232. 233. 234. 235. 236. 237.
 238. 239. 240. 241. 242. 243. 244. 245. 246. 247. 248. 249. 250. 251.
 252. 253. 254. 255. 256. 257. 258. 259. 260. 261. 262. 263. 264. 265.
 266. 267. 268. 269. 270. 271. 272. 273. 274. 275. 276. 277. 278. 279.
 280. 281. 282. 283. 284. 285. 286. 287. 288. 289. 290. 291. 292. 293.
 294. 295. 296. 297. 298. 299. 300. 301. 302. 303. 304. 305. 306. 307.
 308. 309. 310. 311. 312. 313. 314. 315. 316. 317. 318. 319. 320. 321.
 322. 323. 324. 325. 326. 327. 328. 329. 330. 331. 332. 333. 334. 335.
 336. 337. 338. 339. 340. 341. 342. 343. 344. 345. 346. 347. 348. 349.
 350. 351. 352. 353. 354. 355. 356. 357. 358. 359. 360. 361. 362. 363.
 364. 365. 366. 367. 368. 369. 370. 371. 372. 373. 374. 375. 376. 377.
 378. 379. 380. 381. 382. 383. 384. 385. 386. 387. 388. 389. 390. 391.
 392. 393. 394. 395. 396. 397. 398. 399. 400. 401. 402. 403. 404. 405.
 406. 407. 408. 409. 410. 411. 412. 413. 414. 415. 416. 417. 418. 419.
 420. 421. 422. 423. 424. 425. 426. 427. 428. 429. 430. 431. 432. 433.
 434. 435. 436. 437. 438. 439. 440. 441. 442. 443. 444. 445. 446. 447.
 448. 449. 450. 451. 452. 453. 454. 455. 456. 457. 458. 459. 460. 461.
 462. 463. 464. 465. 466. 467. 468. 469. 470. 471. 472. 473. 474. 475.
 476. 477. 478. 479. 480. 481. 482. 483. 484. 485. 486. 487. 488. 489.
 490. 491. 492. 493. 494. 495. 496. 497. 498. 499. 500. 501. 502. 503.
 504. 505. 506. 507. 508. 509. 510. 511. 512. 513. 514. 515. 516. 517.
 518. 519. 520. 521. 522. 523. 524. 525. 526. 527. 528. 529. 530. 531.
 532. 533. 534. 535. 536. 537. 538. 539. 540. 541. 542. 543. 544. 545.
 546. 547. 548. 549. 550. 551. 552. 553. 554. 555. 556. 557. 558. 559.
 560. 561. 562. 563. 564. 565. 566. 567. 568. 569. 570. 571. 572. 573.
 574. 575. 576. 577. 578. 579. 580. 581. 582. 583. 584. 585. 586. 587.
 588. 589. 590. 591. 592. 593. 594. 595. 596. 597. 598. 599. 600. 601.
 602. 603. 604. 605. 606. 607. 608. 609. 610. 611. 612. 613. 614. 615.
 616. 617. 618. 619. 620. 621. 622. 623. 624. 625. 626. 627. 628. 629.
 630. 631. 632. 633. 634. 635. 636. 637. 638. 639. 640. 641. 642. 643.
 644. 645. 646. 647. 648. 649. 650. 651. 652. 653. 654. 655. 656. 657.
 658. 659. 660. 661. 662. 663. 664. 665. 666. 667. 668. 669. 670. 671.
 672. 673. 674. 675. 676. 677. 678. 679. 680. 681. 682. 683. 684. 685.
 686. 687. 688. 689. 690. 691. 692. 693. 694. 695. 696. 697. 698. 699.
 700. 701. 702. 703. 704. 705. 706. 707. 708. 709. 710. 711. 712. 713.
 714. 715. 716. 717. 718. 719. 720. 721. 722. 723. 724. 725. 726. 727.
 728. 729. 730. 731. 732. 733. 734. 735. 736. 737. 738. 739. 740. 741.
 742. 743. 744. 745. 746. 747. 748. 749. 750. 751. 752. 753. 754. 755.
 756. 757. 758. 759. 760. 761. 762. 763. 764. 765. 766. 767. 768. 769.
 770. 771. 772. 773. 774. 775. 776. 777. 778.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [47]:
from sklearn.metrics.pairwise import cosine_similarity

# preprocess
train, test = train_test_split(ratings, train_size = .8, random_state=42)
train, test = df_to_ds(train), df_to_ds(test)

# # init model
embedding_size = 64
model = GMFModel(user_id.astype(str),
                 item_id.astype(str),
                 embedding_size)
model.compile(
    optimizer = keras.optimizers.Adagrad(learning_rate = .01)
)

# # fitting the model
model.fit(train, epochs=3, verbose=0)

# evaluate with the test data
result = model.evaluate(test, return_dict=True, verbose=0)
print("\nEvaluation on the test set:")
display(result)

# extract item embedding
item_emb = model.ranking_model.item_model.layers[-1].get_weights()[0]


item_corr_mat = cosine_similarity(item_emb)


print("\nThe top-k similar video to item_id 99")
similar_items = top_k_items(name2ind['99'],
                            top_k = 10,
                            corr_mat = item_corr_mat,
                            map_name = ind2name)

display(items.loc[items[item_id].isin(similar_items)])

del item_corr_mat
gc.collect();


Evaluation on the test set:


{'root_mean_squared_error': 2.573801279067993,
 'loss': 7.641561508178711,
 'regularization_loss': 0,
 'total_loss': 7.641561508178711}


The top-k similar video to item_id 99


KeyError: "None of [Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,\n            ...\n            769, 770, 771, 772, 773, 774, 775, 776, 777, 778],\n           dtype='int64', length=779)] are in the [columns]"