In [1]:
# Install/upgrade gdown (pip's stdout is discarded), then download the dataset
# archive from Google Drive by file id; the recorded run saved it as
# Bigdata_hw2_datasets.zip in the working directory.
!pip install --upgrade --no-cache-dir gdown >/dev/null
!gdown 1kTJmHH3qEyivtSJvnVUMYpWnZcQl5XaV

Downloading...
From: https://drive.google.com/uc?id=1kTJmHH3qEyivtSJvnVUMYpWnZcQl5XaV
To: /content/Bigdata_hw2_datasets.zip
  0% 0.00/6.28M [00:00<?, ?B/s] 75% 4.72M/6.28M [00:00<00:00, 35.0MB/s]100% 6.28M/6.28M [00:00<00:00, 44.3MB/s]


In [2]:
# Extract the archive. -o overwrites existing files without prompting, so
# re-running this cell (e.g. Restart & Run All) does not stop at unzip's
# interactive "replace?" question.
!unzip -o Bigdata_hw2_datasets.zip

Archive:  Bigdata_hw2_datasets.zip
   creating: Bigdata_hw2_datasets/
   creating: Bigdata_hw2_datasets/q1/
  inflating: Bigdata_hw2_datasets/q1/stream_data_dgim.txt  
   creating: Bigdata_hw2_datasets/q2/
  inflating: Bigdata_hw2_datasets/q2/games.csv  
  inflating: Bigdata_hw2_datasets/q2/ratings.csv  
   creating: Bigdata_hw2_datasets/q3/
  inflating: Bigdata_hw2_datasets/q3/c1.txt  
  inflating: Bigdata_hw2_datasets/q3/c2.txt  
  inflating: Bigdata_hw2_datasets/q3/data.txt  


In [3]:
# PySpark supplies the SparkContext used by the cells below.
# NOTE(review): the version is not pinned; the recorded run resolved to pyspark 3.2.1.
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 37 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 43.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=fe62223a38a6aa7a207945c4ebc940d0581a764f5ca0fe0a0f2139f15085489f
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [4]:
import csv
import math

from pyspark import SparkContext
from tabulate import tabulate

# Input files read with sc.textFile(): the ratings rows and the game catalogue.
RATINGS_FILE = 'Bigdata_hw2_datasets/q2/ratings.csv'
ITEMS_FILE = 'Bigdata_hw2_datasets/q2/games.csv'
# How many top-ranked predictions icf()/ucf() keep.
N = 5
# User ids for which recommendations are printed in the cells below.
U1 = 5461
U2 = 10140

In [23]:
def map_items(l):
    """Turn one ratings line into an ``(item_id, [(user_id, rating)])`` pair.

    The line is split on commas and its first three fields are read as
    integers in the order item id, user id, rating; extra fields are ignored.
    The value is a one-element list so that the values of equal keys can be
    concatenated by a reducer.

    Lines that cannot be parsed (a header row, a short line, a non-integer
    field) map to the placeholder ``(0, [(0, 0)])``.  All three fields are
    converted before anything is returned, so a partly valid line never
    yields a real item id paired with the placeholder value.
    """
    fields = l.split(',')
    try:
        # Unpacking fewer than three fields also raises ValueError.
        item_id, user_id, rating = (int(f) for f in fields[:3])
    except ValueError:
        return 0, [(0, 0)]
    return item_id, [(user_id, rating)]


def reduce_items(a, b):
    """Combine two ``(id, rating)`` lists into one list sorted by id.

    Both inputs are left untouched; a new list is returned.  The sort is
    stable, so entries with the same id keep their relative order with the
    entries of ``a`` ahead of those of ``b``.  Empty lists are accepted.

    Comparing only the first element of each list and concatenating (the
    previous approach) does not keep the result ordered once either side
    holds more than one entry, e.g. ``[(1, _), (5, _)]`` and ``[(3, _)]``.
    """
    return sorted(a + b, key=lambda pair: pair[0])


def map_users(l):
    """Turn one ratings line into a ``(user_id, [(item_id, rating)])`` pair.

    The line is split on commas and its first three fields are read as
    integers in the order item id, user id, rating; extra fields are ignored.
    The value is a one-element list so that the values of equal keys can be
    concatenated by a reducer.

    Lines that cannot be parsed (a header row, a short line, a non-integer
    field) map to the placeholder ``(0, [(0, 0)])``.  All three fields are
    converted before anything is returned, so a partly valid line never
    yields a real user id paired with the placeholder value.
    """
    fields = l.split(',')
    try:
        # Unpacking fewer than three fields also raises ValueError.
        item_id, user_id, rating = (int(f) for f in fields[:3])
    except ValueError:
        return 0, [(0, 0)]
    return user_id, [(item_id, rating)]


def map_itdescs(l):
    """Turn one catalogue line into an ``(item_id, title)`` pair.

    The line is parsed as a CSV record, so a quoted title may itself contain
    commas.  The first field is the integer item id and the second field,
    stripped of surrounding whitespace, is the title.

    A line whose id cannot be read (a header row, an empty line) maps to the
    placeholder ``(0, '')``.  A line with a valid id but no second field maps
    to ``(item_id, '')``.
    """
    try:
        fields = next(csv.reader([l]))
        item_id = int(fields[0])
    except (csv.Error, StopIteration, ValueError, IndexError):
        # Unparseable record, empty record, or non-integer id.
        return 0, ''
    title = fields[1].strip() if len(fields) > 1 else ''
    return item_id, title


def print_table(rows):
    """Render recommendation rows as a boxed table on stdout.

    ``rows`` is a list of (game id, rate, title) entries; a fixed header row
    is placed in front of it before the table is formatted.
    """
    column_titles = ['Game ID', 'Rate', 'Title']
    rendered = tabulate([column_titles] + rows, headers='firstrow', tablefmt='fancy_grid')
    print(rendered)


def cosine_sim(u, v, id):
    """Cosine similarity of two sparse rating vectors, plus one rating from ``v``.

    Args:
        u: iterable of ``(key, rating)`` pairs.
        v: iterable of ``(key, rating)`` pairs.
        id: key whose rating is looked up in ``v``.

    Returns:
        ``(similarity, rate)`` where ``similarity`` is the dot product of the
        two vectors over their shared keys divided by the product of their
        Euclidean norms (``0`` when either norm is zero), and ``rate`` is the
        rating stored in ``v`` under ``id``.

    Raises:
        KeyError: if ``id`` is not a key of ``v``.
    """
    ud = dict(u)
    vd = dict(v)

    # Keys missing from v contribute nothing to the dot product.
    dot = sum(r * vd.get(k, 0) for k, r in ud.items())

    norm_u = math.sqrt(sum(r * r for r in ud.values()))
    norm_v = math.sqrt(sum(r * r for r in vd.values()))

    # An empty or all-zero vector has no direction: report similarity 0
    # explicitly instead of hiding the division error behind a bare except.
    denom = norm_u * norm_v
    s = dot / denom if denom else 0

    return s, vd[id]


def icf(uid, users, items, itdescs, sc, n=None):
    """Item-item collaborative filtering.

    For every item the user has not rated, predict a rate as the
    similarity-weighted average of the rates the user gave to the items
    they did rate, where the similarity between two items is the cosine
    similarity of their user-rating lists.

    Args:
        uid: id of the user to recommend for.
        users: pair RDD ``user_id -> [(item_id, rating), ...]``.
        items: pair RDD ``item_id -> [(user_id, rating), ...]``.
        itdescs: pair RDD ``item_id -> title``.
        sc: SparkContext used to parallelize the user's own ratings.
        n: number of recommendations to return; defaults to the
            module-level ``N`` when omitted.

    Returns:
        List of ``(item_id, predicted_rate, title)`` tuples, highest
        predicted rate first and ties ordered by ascending item id.

    Raises:
        IndexError: if ``users`` has no entry for ``uid`` or ``itdescs`` has
            no entry for a recommended item.
    """
    top_n = N if n is None else n

    # Split the items into those the user has rated and those they have not.
    user_ratings = users.lookup(uid)[0]
    rated_keys = sc.parallelize(user_ratings)
    not_rated_rdd = items.subtractByKey(rated_keys)
    rated = items.subtractByKey(not_rated_rdd).collect()
    not_rated = not_rated_rdd.collect()

    predictions = []
    for item_id, item_users in not_rated:
        # One (similarity to a rated item, rate uid gave that item) per rated item.
        sims = [cosine_sim(item_users, rated_users, uid) for _, rated_users in rated]

        # Weighted average; with no positive similarity there is nothing to
        # average, so the prediction is 0.
        weight = sum(s for s, _ in sims)
        rate = sum(s * r for s, r in sims) / weight if weight else 0
        predictions.append((item_id, rate))

    # Highest rate first, ties broken by ascending item id.
    predictions.sort(key=lambda p: (-p[1], p[0]))

    # Attach the title to each of the top entries.
    return [
        (item_id, rate, itdescs.lookup(item_id)[0])
        for item_id, rate in predictions[:top_n]
    ]


def ucf(uid, users, items, itdescs, sc, n=None):
    """User-user collaborative filtering.

    For every item the user has not rated, predict a rate as the
    similarity-weighted average of the rates given to that item by the
    users who did rate it, where the similarity between two users is the
    cosine similarity of their item-rating lists.

    Args:
        uid: id of the user to recommend for.
        users: pair RDD ``user_id -> [(item_id, rating), ...]``.
        items: pair RDD ``item_id -> [(user_id, rating), ...]``.
        itdescs: pair RDD ``item_id -> title``.
        sc: SparkContext used to parallelize the user's own ratings.
        n: number of recommendations to return; defaults to the
            module-level ``N`` when omitted.

    Returns:
        List of ``(item_id, predicted_rate, title)`` tuples, highest
        predicted rate first and ties ordered by ascending item id.

    Raises:
        IndexError: if ``users`` has no entry for ``uid`` or ``itdescs`` has
            no entry for a recommended item.
        KeyError: if an item lists a rater that has no entry in ``users``.
    """
    top_n = N if n is None else n

    # Items the user has not rated yet.
    user_ratings = users.lookup(uid)[0]
    rated_keys = sc.parallelize(user_ratings)
    not_rated = items.subtractByKey(rated_keys).collect()

    # Bring every user's ratings to the driver so the loop below does plain
    # dict lookups (kept under a new name instead of rebinding ``users``).
    ratings_by_user = users.collectAsMap()

    predictions = []
    for item_id, raters in not_rated:
        # One (similarity to a rater, rate that rater gave the item) per rater.
        sims = [
            cosine_sim(user_ratings, ratings_by_user[rater_id], item_id)
            for rater_id, _ in raters
        ]

        # Weighted average; with no positive similarity there is nothing to
        # average, so the prediction is 0.
        weight = sum(s for s, _ in sims)
        rate = sum(s * r for s, r in sims) / weight if weight else 0
        predictions.append((item_id, rate))

    # Highest rate first, ties broken by ascending item id.
    predictions.sort(key=lambda p: (-p[1], p[0]))

    # Attach the title to each of the top entries.
    return [
        (item_id, rate, itdescs.lookup(item_id)[0])
        for item_id, rate in predictions[:top_n]
    ]

In [24]:
# Build the keyed RDDs consumed by the recommenders.

sc = SparkContext.getOrCreate()

ratings = sc.textFile(RATINGS_FILE)

# item_id -> [(user_id, rating), ...]
items = ratings.map(map_items).reduceByKey(reduce_items).sortByKey()

# user_id -> [(item_id, rating), ...]
users = ratings.map(map_users).reduceByKey(reduce_items).sortByKey()

# item_id -> title
itdescs = sc.textFile(ITEMS_FILE).map(map_itdescs).sortByKey()

In [25]:
# Item-item recommendations for user U1, rendered as a table.
print('Item-item collaborative filtering for user:', U1)
print_table(icf(U1, users, items, itdescs, sc))

Item-item collaborative filtering for user: 5461
╒═══════════╤════════╤═══════════════════════════╕
│   Game ID │   Rate │ Title                     │
╞═══════════╪════════╪═══════════════════════════╡
│      5442 │      5 │ Worms 3D                  │
├───────────┼────────┼───────────────────────────┤
│      6247 │      5 │ Sword of Mana             │
├───────────┼────────┼───────────────────────────┤
│      9127 │      5 │ Zombie Vikings            │
├───────────┼────────┼───────────────────────────┤
│      9243 │      5 │ Bounty Hounds             │
├───────────┼────────┼───────────────────────────┤
│      9712 │      5 │ Spider-Man: Friend or Foe │
╘═══════════╧════════╧═══════════════════════════╛


In [26]:
# User-user recommendations for user U1, rendered as a table.
print('User-user collaborative filtering for user:', U1)
print_table(ucf(U1, users, items, itdescs, sc))

User-user collaborative filtering for user: 5461
╒═══════════╤════════╤═══════════════════════════════════╕
│   Game ID │   Rate │ Title                             │
╞═══════════╪════════╪═══════════════════════════════════╡
│      2100 │      5 │ Bloodstained: Curse of the Moon 2 │
├───────────┼────────┼───────────────────────────────────┤
│      2743 │      5 │ Trials Fusion                     │
├───────────┼────────┼───────────────────────────────────┤
│      4170 │      5 │ Kirby 64: The Crystal Shards      │
├───────────┼────────┼───────────────────────────────────┤
│      4761 │      5 │ Wraith: The Oblivion - Afterlife  │
├───────────┼────────┼───────────────────────────────────┤
│      6062 │      5 │ Supercross                        │
╘═══════════╧════════╧═══════════════════════════════════╛


In [27]:
# Item-item recommendations for user U2, rendered as a table.
print('Item-item collaborative filtering for user:', U2)
print_table(icf(U2, users, items, itdescs, sc))

Item-item collaborative filtering for user: 10140
╒═══════════╤═════════╤══════════════════╕
│   Game ID │    Rate │ Title            │
╞═══════════╪═════════╪══════════════════╡
│      7803 │ 5       │ Frantics         │
├───────────┼─────────┼──────────────────┤
│      8917 │ 5       │ Super Motherload │
├───────────┼─────────┼──────────────────┤
│      9993 │ 5       │ Sega Smash Pack  │
├───────────┼─────────┼──────────────────┤
│      1935 │ 4.5549  │ MLB 07: The Show │
├───────────┼─────────┼──────────────────┤
│      9243 │ 4.51725 │ Bounty Hounds    │
╘═══════════╧═════════╧══════════════════╛


In [28]:
# User-user recommendations for user U2, rendered as a table.
print('User-user collaborative filtering for user:', U2)
print_table(ucf(U2, users, items, itdescs, sc))

User-user collaborative filtering for user: 10140
╒═══════════╤════════╤══════════════════════════════════╕
│   Game ID │   Rate │ Title                            │
╞═══════════╪════════╪══════════════════════════════════╡
│      1828 │      5 │ Helldivers                       │
├───────────┼────────┼──────────────────────────────────┤
│      4191 │      5 │ The Godfather: Blackhand Edition │
├───────────┼────────┼──────────────────────────────────┤
│      6789 │      5 │ Metal Slug 4 & 5                 │
├───────────┼────────┼──────────────────────────────────┤
│      6971 │      5 │ The Sims Online                  │
├───────────┼────────┼──────────────────────────────────┤
│      7134 │      5 │ Duke Nukem 3D: Megaton Edition   │
╘═══════════╧════════╧══════════════════════════════════╛
