In [18]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Data Examination

In [6]:
train_raw = pd.read_csv('train.dat', header=None,
            names=["uid", "iid", "rating", "ts"], sep='\t', engine='python')
train_raw

Unnamed: 0,uid,iid,rating,ts
0,905,470,1,889325071
1,697,1518,5,879835275
2,855,1687,5,875638677
3,950,1447,5,877420720
4,806,1170,4,879889337
...,...,...,...,...
85719,205,1136,1,884142487
85720,708,1497,4,881473612
85721,167,1036,3,875492395
85722,508,1528,3,880337585


In [7]:
train = set(train_raw["uid"].values.tolist())

In [260]:
max_iid = train_raw.iid.max() # 1699

# Data Transformation

In [10]:
max_iid = 1699 # result from train_raw
u2irmap = {}
i2umap = {}

In [11]:
for _, r in train_raw.iterrows():
    uid, iid = int(r.uid), int(r.iid)
    if uid not in u2irmap:
        u2irmap[uid] = {}
    u2irmap[uid][iid] = int(r.rating)
    if iid not in i2umap:
        i2umap[iid] = set()
    i2umap[iid].add(uid)

In [12]:
print(len(u2irmap)) # 943 unique users
print(len(i2umap)) # 1659 unique items

943
1659


In [312]:
# Saving
with open("all.json", "w") as outfile:
    json.dump(u2irmap, outfile)
# Loading  
with open("all.json", "r") as content:
    all = json.load(content)
u2irmap = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in all.items()}

# Train-Validation Split

In [310]:
from sklearn.model_selection import train_test_split

s = pd.Series(u2irmap)
train_set, valid_set  = [i.to_dict() for i in train_test_split(s, test_size=0.1)]
train_set = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in train_set.items()}
valid_set = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in valid_set.items()}

In [327]:
with open("train_set.json", "w") as outfile:
    json.dump(train_set, outfile)
with open("valid_set.json", "w") as outfile:
    json.dump(valid_set, outfile)

In [196]:
test_raw = pd.read_csv('test.dat', header=None,
            names=["uid", "iid"], sep='\t', engine='python')
test_raw

Unnamed: 0,uid,iid
0,158,951
1,521,1202
2,98,1556
3,292,1583
4,68,1064
...,...,...
2149,537,1414
2150,618,1448
2151,154,1519
2152,154,1429


# Running Model

In [16]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import MinHashLSH

In [19]:
with open("train_set.json", "r") as content:
    train_set = json.load(content)
train_set = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in train_set.items()}
u2irmap = train_set
#test_set = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in test_set.items()}

In [20]:
max_iid = 1699 # result from train_raw
i2umap = {}

In [21]:
for uid, uimap in u2irmap.items():
    for iid, rating in uimap.items():
        if iid not in i2umap:
            i2umap[iid] = set()
        i2umap[iid].add(uid)

In [22]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/13 23:44:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/13 23:44:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Prediction Function

In [14]:
# expect to have:
# u2irmap (user-to-item-ratings map), and
# i2umap (item-to-users map)
def predict_rating(itemid, userhistory, num_neighbors=10, num_hash_tables=20):
    if itemid not in i2umap or len(userhistory) == 0: # no rating has given for this item, can't predict
        return None
    userDs = [(int(uid), Vectors.sparse(max_iid+1, u2irmap[uid])) for uid in u2irmap if uid in i2umap[itemid]]
    dfUsers = spark.createDataFrame(userDs, ["uid", "features"])
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=num_hash_tables)
    model = mh.fit(dfUsers)
    key = Vectors.sparse(max_iid+1, userhistory) # item_history expect to be a dict {<item>: rating}
    rows = model.approxNearestNeighbors(dfUsers, key, num_neighbors)
    ratings = []
    for r in rows.collect():
        ratings.append(u2irmap[r["uid"]][itemid])
    return int(round(sum(ratings) / len(ratings))) if ratings else None

## Running Validation Set and get RMSE

In [23]:
with open("valid_set.json", "r") as content:
    valid_set = json.load(content)
valid_set = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in valid_set.items()}
vu2irmap = valid_set

In [24]:
def get_user_history_without_item(user_history, itemid):
    user_history_without_item = user_history.copy()
    user_history_without_item.pop(itemid)
    return user_history_without_item

In [25]:
import time
from math import sqrt
def get_user_history_without_item(user_history, itemid):
    user_history_without_item = user_history.copy()
    user_history_without_item.pop(itemid)
    return user_history_without_item
rmse_total = []
start_time = time.time()
for index, uid in enumerate(list(vu2irmap.keys())):
    print(f"Processing user {uid}, {index+1}/{len(vu2irmap)}")
    user_history = vu2irmap[uid]
    rmse_user = []
    for iid in user_history:
        expected = user_history[iid]
        uh = get_user_history_without_item(user_history, iid)
        predicted = predict_rating(iid, uh)
        if predicted:
            rmse_user.append((expected - predicted)**2)
        else:
            print(f"No predicted for user: {uid}, item: {iid}")
    if rmse_user:
        rmse_total.append(sqrt(sum(rmse_user) / len(rmse_user)))
    else:
        print(f"no rmse_user for user {uid}")
print("--- %s seconds ---" % (time.time() - start_time))

Processing user 890, 1/85


                                                                                

Processing user 719, 2/85
Processing user 594, 3/85
Processing user 698, 4/85
Processing user 468, 5/85
Processing user 326, 6/85
Processing user 338, 7/85
No predicted for user: 338, item: 48
No predicted for user: 338, item: 358
Processing user 472, 8/85
No predicted for user: 472, item: 82
Processing user 547, 9/85
Processing user 956, 10/85
Processing user 604, 11/85
Processing user 740, 12/85
Processing user 663, 13/85
Processing user 159, 14/85
Processing user 102, 15/85
Processing user 181, 16/85
Processing user 388, 17/85
Processing user 64, 18/85
Processing user 581, 19/85
Processing user 692, 20/85
No predicted for user: 692, item: 185
Processing user 931, 21/85
No predicted for user: 931, item: 556
Processing user 249, 22/85
No predicted for user: 249, item: 39
Processing user 126, 23/85
Processing user 495, 24/85
Processing user 250, 25/85
Processing user 382, 26/85
Processing user 122, 27/85
Processing user 681, 28/85
Processing user 642, 29/85
No predicted for user: 642, 

In [13]:
map(lambda x: x ** 2, rmse_total)
rmse = sqrt(sum(rmse_total)/len(rmse_total))

NameError: name 'rmse_total' is not defined

In [175]:
rmse

1.0311541929735746

# LSH Build Time

In [26]:
# Loading  
with open("all.json", "r") as content:
    all = json.load(content)
u2irmap = {int(k): {int(mk): int(mv) for mk, mv in v.items()} for k, v in all.items()}

In [29]:
num_hash_tables=20
userDs = [(int(uid), Vectors.sparse(max_iid+1, u2irmap[uid])) for uid in u2irmap]
dfUsers = spark.createDataFrame(userDs, ["uid", "features"])
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=num_hash_tables)
start_time = time.time()
model = mh.fit(dfUsers)
print("--- %s seconds ---" % (time.time() - start_time))

22/12/14 01:40:21 WARN TaskSetManager: Stage 14629 contains a task of very large size (1027 KiB). The maximum recommended task size is 1000 KiB.
--- 0.039320945739746094 seconds ---
