In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import MinHashLSH

import json
import pandas as pd

In [4]:
with open("all.json", "r") as content:
    all = json.load(content)
u2irmap = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in all.items()}

In [5]:
i2umap = {}

In [6]:
for uid, uimap in u2irmap.items():
    for iid, rating in uimap.items():
        if iid not in i2umap:
            i2umap[iid] = set()
        i2umap[iid].add(uid)

In [8]:
max(list(i2umap))


1699

In [21]:
test_raw = pd.read_csv('test.dat', header=None,
            names=["uid", "iid"], sep='\t', engine='python')
test_raw

Unnamed: 0,uid,iid
0,158,951
1,521,1202
2,98,1556
3,292,1583
4,68,1064
...,...,...
2149,537,1414
2150,618,1448
2151,154,1519
2152,154,1429


In [22]:
test = test_raw.values.tolist()

# Prediction

In [9]:
# expect to have:
# maxiid
# u2irmap (user-to-item-ratings map), and
# i2umap (item-to-users map)
def predict_rating(itemid, userhistory, num_neighbors=10, num_hash_tables=20, max_iid=1699):
    if itemid not in i2umap or len(userhistory) == 0: # no rating has given for this item, can't predict
        return None
    userDs = [(int(uid), Vectors.sparse(max_iid+1, u2irmap[uid])) for uid in u2irmap if uid in i2umap[itemid]]
    dfUsers = spark.createDataFrame(userDs, ["uid", "features"])
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=num_hash_tables)
    model = mh.fit(dfUsers)
    key = Vectors.sparse(max_iid+1, userhistory) # item_history expect to be a dict {<item>: rating}
    rows = model.approxNearestNeighbors(dfUsers, key, num_neighbors)
    ratings = []
    for r in rows.collect():
        ratings.append(u2irmap[r["uid"]][itemid])
    return int(round(sum(ratings) / len(ratings))) if ratings else None

In [10]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/14 00:57:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/14 00:57:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/12/14 00:57:26 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [34]:
import time
start_time = time.time()
ratings = []
for index, (uid, iid) in enumerate(test):
    print(f"Predicting entry {index}/{len(test)}")
    ratings.append(predict_rating(iid, u2irmap[uid]))
print("--- %s seconds ---" % (time.time() - start_time))

Predicting entry 0/2154
Predicting entry 1/2154
Predicting entry 2/2154
Predicting entry 3/2154
Predicting entry 4/2154
Predicting entry 5/2154
Predicting entry 6/2154
Predicting entry 7/2154
Predicting entry 8/2154
Predicting entry 9/2154
Predicting entry 10/2154
Predicting entry 11/2154
Predicting entry 12/2154
Predicting entry 13/2154
Predicting entry 14/2154
Predicting entry 15/2154
Predicting entry 16/2154
Predicting entry 17/2154
Predicting entry 18/2154
Predicting entry 19/2154
Predicting entry 20/2154
Predicting entry 21/2154
Predicting entry 22/2154
Predicting entry 23/2154
Predicting entry 24/2154
Predicting entry 25/2154
Predicting entry 26/2154
Predicting entry 27/2154
Predicting entry 28/2154
Predicting entry 29/2154
Predicting entry 30/2154
Predicting entry 31/2154
Predicting entry 32/2154
Predicting entry 33/2154
Predicting entry 34/2154
Predicting entry 35/2154
Predicting entry 36/2154
Predicting entry 37/2154
Predicting entry 38/2154
Predicting entry 39/2154
Predicting

Predicting entry 320/2154
Predicting entry 321/2154
Predicting entry 322/2154
Predicting entry 323/2154
Predicting entry 324/2154
Predicting entry 325/2154
Predicting entry 326/2154
Predicting entry 327/2154
Predicting entry 328/2154
Predicting entry 329/2154
Predicting entry 330/2154
Predicting entry 331/2154
Predicting entry 332/2154
Predicting entry 333/2154
Predicting entry 334/2154
Predicting entry 335/2154
Predicting entry 336/2154
Predicting entry 337/2154
Predicting entry 338/2154
Predicting entry 339/2154
Predicting entry 340/2154
Predicting entry 341/2154
Predicting entry 342/2154
Predicting entry 343/2154
Predicting entry 344/2154
Predicting entry 345/2154
Predicting entry 346/2154
Predicting entry 347/2154
Predicting entry 348/2154
Predicting entry 349/2154
Predicting entry 350/2154
Predicting entry 351/2154
Predicting entry 352/2154
Predicting entry 353/2154
Predicting entry 354/2154
Predicting entry 355/2154
Predicting entry 356/2154
Predicting entry 357/2154
Predicting e

Predicting entry 637/2154
Predicting entry 638/2154
Predicting entry 639/2154
Predicting entry 640/2154
Predicting entry 641/2154
Predicting entry 642/2154
Predicting entry 643/2154
Predicting entry 644/2154
Predicting entry 645/2154
Predicting entry 646/2154
Predicting entry 647/2154
Predicting entry 648/2154
Predicting entry 649/2154
Predicting entry 650/2154
Predicting entry 651/2154
Predicting entry 652/2154
Predicting entry 653/2154
Predicting entry 654/2154
Predicting entry 655/2154
Predicting entry 656/2154
Predicting entry 657/2154
Predicting entry 658/2154
Predicting entry 659/2154
Predicting entry 660/2154
Predicting entry 661/2154
Predicting entry 662/2154
Predicting entry 663/2154
Predicting entry 664/2154
Predicting entry 665/2154
Predicting entry 666/2154
Predicting entry 667/2154
Predicting entry 668/2154
Predicting entry 669/2154
Predicting entry 670/2154
Predicting entry 671/2154
Predicting entry 672/2154
Predicting entry 673/2154
Predicting entry 674/2154
Predicting e

Predicting entry 953/2154
Predicting entry 954/2154
Predicting entry 955/2154
Predicting entry 956/2154
Predicting entry 957/2154
Predicting entry 958/2154
Predicting entry 959/2154
Predicting entry 960/2154
Predicting entry 961/2154
Predicting entry 962/2154
Predicting entry 963/2154
Predicting entry 964/2154
Predicting entry 965/2154
Predicting entry 966/2154
Predicting entry 967/2154
Predicting entry 968/2154
Predicting entry 969/2154
Predicting entry 970/2154
Predicting entry 971/2154
Predicting entry 972/2154
Predicting entry 973/2154
Predicting entry 974/2154
Predicting entry 975/2154
Predicting entry 976/2154
Predicting entry 977/2154
Predicting entry 978/2154
Predicting entry 979/2154
Predicting entry 980/2154
Predicting entry 981/2154
Predicting entry 982/2154
Predicting entry 983/2154
Predicting entry 984/2154
Predicting entry 985/2154
Predicting entry 986/2154
Predicting entry 987/2154
Predicting entry 988/2154
Predicting entry 989/2154
Predicting entry 990/2154
Predicting e

Predicting entry 1261/2154
Predicting entry 1262/2154
Predicting entry 1263/2154
Predicting entry 1264/2154
Predicting entry 1265/2154
Predicting entry 1266/2154
Predicting entry 1267/2154
Predicting entry 1268/2154
Predicting entry 1269/2154
Predicting entry 1270/2154
Predicting entry 1271/2154
Predicting entry 1272/2154
Predicting entry 1273/2154
Predicting entry 1274/2154
Predicting entry 1275/2154
Predicting entry 1276/2154
Predicting entry 1277/2154
Predicting entry 1278/2154
Predicting entry 1279/2154
Predicting entry 1280/2154
Predicting entry 1281/2154
Predicting entry 1282/2154
Predicting entry 1283/2154
Predicting entry 1284/2154
Predicting entry 1285/2154
Predicting entry 1286/2154
Predicting entry 1287/2154
Predicting entry 1288/2154
Predicting entry 1289/2154
Predicting entry 1290/2154
Predicting entry 1291/2154
Predicting entry 1292/2154
Predicting entry 1293/2154
Predicting entry 1294/2154
Predicting entry 1295/2154
Predicting entry 1296/2154
Predicting entry 1297/2154
P

Predicting entry 1565/2154
Predicting entry 1566/2154
Predicting entry 1567/2154
Predicting entry 1568/2154
Predicting entry 1569/2154
Predicting entry 1570/2154
Predicting entry 1571/2154
Predicting entry 1572/2154
Predicting entry 1573/2154
Predicting entry 1574/2154
Predicting entry 1575/2154
Predicting entry 1576/2154
Predicting entry 1577/2154
Predicting entry 1578/2154
Predicting entry 1579/2154
Predicting entry 1580/2154
Predicting entry 1581/2154
Predicting entry 1582/2154
Predicting entry 1583/2154
Predicting entry 1584/2154
Predicting entry 1585/2154
Predicting entry 1586/2154
Predicting entry 1587/2154
Predicting entry 1588/2154
Predicting entry 1589/2154
Predicting entry 1590/2154
Predicting entry 1591/2154
Predicting entry 1592/2154
Predicting entry 1593/2154
Predicting entry 1594/2154
Predicting entry 1595/2154
Predicting entry 1596/2154
Predicting entry 1597/2154
Predicting entry 1598/2154
Predicting entry 1599/2154
Predicting entry 1600/2154
Predicting entry 1601/2154
P

Predicting entry 1869/2154
Predicting entry 1870/2154
Predicting entry 1871/2154
Predicting entry 1872/2154
Predicting entry 1873/2154
Predicting entry 1874/2154
Predicting entry 1875/2154
Predicting entry 1876/2154
Predicting entry 1877/2154
Predicting entry 1878/2154
Predicting entry 1879/2154
Predicting entry 1880/2154
Predicting entry 1881/2154
Predicting entry 1882/2154
Predicting entry 1883/2154
Predicting entry 1884/2154
Predicting entry 1885/2154
Predicting entry 1886/2154
Predicting entry 1887/2154
Predicting entry 1888/2154
Predicting entry 1889/2154
Predicting entry 1890/2154
Predicting entry 1891/2154
Predicting entry 1892/2154
Predicting entry 1893/2154
Predicting entry 1894/2154
Predicting entry 1895/2154
Predicting entry 1896/2154
Predicting entry 1897/2154
Predicting entry 1898/2154
Predicting entry 1899/2154
Predicting entry 1900/2154
Predicting entry 1901/2154
Predicting entry 1902/2154
Predicting entry 1903/2154
Predicting entry 1904/2154
Predicting entry 1905/2154
P

In [1]:
with open("test_lsh.dat" ,'w') as f:
    for r in ratings:
        if r:
            f.write(f"{r}\n")
        else:
            f.write(f"{3}\n")

SyntaxError: EOL while scanning string literal (4171013142.py, line 6)

# Validation

In [11]:
with open("train_set.json", "r") as content:
    train_set = json.load(content)
train_set = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in train_set.items()}
u2irmap = train_set

In [12]:
with open("valid_set.json", "r") as content:
    valid_set = json.load(content)
valid_set = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in valid_set.items()}
vu2irmap = valid_set

In [13]:
i2umap = {}
for uid, uimap in u2irmap.items():
    for iid, rating in uimap.items():
        if iid not in i2umap:
            i2umap[iid] = set()
        i2umap[iid].add(uid)

In [14]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local) created by __init__ at /var/folders/4r/z4c5m78x2tqgbkj8hx9jx2cm0000gn/T/ipykernel_51794/168358632.py:3 

In [15]:
# expect to have:
# maxiid
# u2irmap (user-to-item-ratings map), and
# i2umap (item-to-users map)
def predict_rating(itemid, userhistory, num_neighbors=10, num_hash_tables=20, max_iid=1699):
    if itemid not in i2umap or len(userhistory) == 0: # no rating has given for this item, can't predict
        return None
    userDs = [(int(uid), Vectors.sparse(max_iid+1, u2irmap[uid])) for uid in u2irmap if uid in i2umap[itemid]]
    dfUsers = spark.createDataFrame(userDs, ["uid", "features"])
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=num_hash_tables)
    model = mh.fit(dfUsers)
    key = Vectors.sparse(max_iid+1, userhistory) # item_history expect to be a dict {<item>: rating}
    rows = model.approxNearestNeighbors(dfUsers, key, num_neighbors)
    ratings = []
    for r in rows.collect():
        ratings.append(u2irmap[r["uid"]][itemid])
    return int(round(sum(ratings) / len(ratings))) if ratings else None

## Running Validation Set and get RMSE

In [16]:
def get_user_history_without_item(user_history, itemid):
    user_history_without_item = user_history.copy()
    user_history_without_item.pop(itemid)
    return user_history_without_item

In [17]:
import time
from math import sqrt
def get_user_history_without_item(user_history, itemid):
    user_history_without_item = user_history.copy()
    user_history_without_item.pop(itemid)
    return user_history_without_item
rmse_total = []
start_time = time.time()
for index, uid in enumerate(list(vu2irmap.keys())):
    print(f"Processing user {uid}, {index+1}/{len(vu2irmap)}")
    user_history = vu2irmap[uid]
    rmse_user = []
    for iid in user_history:
        expected = user_history[iid]
        uh = get_user_history_without_item(user_history, iid)
        predicted = predict_rating(iid, uh)
        if predicted:
            rmse_user.append((expected - predicted)**2)
        else:
            print(f"No predicted for user: {uid}, item: {iid}")
    if rmse_user:
        rmse_total.append(sqrt(sum(rmse_user) / len(rmse_user)))
    else:
        print(f"no rmse_user for user {uid}")
print("--- %s seconds ---" % (time.time() - start_time))

Processing user 890, 1/85


                                                                                

Processing user 719, 2/85
Processing user 594, 3/85
Processing user 698, 4/85
Processing user 468, 5/85
Processing user 326, 6/85
Processing user 338, 7/85
No predicted for user: 338, item: 48
No predicted for user: 338, item: 358
Processing user 472, 8/85
No predicted for user: 472, item: 82
Processing user 547, 9/85
Processing user 956, 10/85
Processing user 604, 11/85
Processing user 740, 12/85
Processing user 663, 13/85
Processing user 159, 14/85
Processing user 102, 15/85
Processing user 181, 16/85
Processing user 388, 17/85
Processing user 64, 18/85
Processing user 581, 19/85
Processing user 692, 20/85
No predicted for user: 692, item: 185
Processing user 931, 21/85
No predicted for user: 931, item: 556
Processing user 249, 22/85
No predicted for user: 249, item: 39
Processing user 126, 23/85
Processing user 495, 24/85
Processing user 250, 25/85
Processing user 382, 26/85
Processing user 122, 27/85
Processing user 681, 28/85
Processing user 642, 29/85
No predicted for user: 642, 

In [18]:
map(lambda x: x ** 2, rmse_total)
rmse = sqrt(sum(rmse_total)/len(rmse_total))

In [19]:
print(rmse)

1.0216001565331043
22/12/14 01:49:19 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 120438 ms exceeds timeout 120000 ms
22/12/14 01:49:19 WARN SparkContext: Killing executors is not supported by current scheduler.
