In [4]:
def svd_with_x(train, k):
    utilMat = train.to_numpy()
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    item_means = np.mean(masked_arr, axis=0)
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (utilMat.shape[0], 1))
    utilMat = utilMat - x
    U, s, V = np.linalg.svd(utilMat, full_matrices=False)
    s = np.diag(s)
    s = s[0:k, 0:k]
    U = U[:, 0:k]
    V = V[0:k, :]
    return U.data @ s @ V.data + x

def svd_without_x(train, k):
    utilMat = train.to_numpy()
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    item_means = np.mean(masked_arr, axis=0)
    utilMat = masked_arr.filled(item_means)
    U, s, V = np.linalg.svd(utilMat, full_matrices=False)
    s = np.diag(s)
    s = s[0:k, 0:k]
    U = U[:, 0:k]
    V = V[0:k, :]
    return U.data @ s @ V.data

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

train = pd.read_csv('ua.base', sep="\t", header=None)
train.columns = ['user_id', 'item_id', 'rating', 'timestamp']
train = train.drop(['timestamp'], axis=1)
train.user_id = train.user_id - 1
train.item_id = train.item_id - 1


test = pd.read_csv('ua.test', sep="\t", header=None)
test.columns = ['user_id', 'item_id', 'rating', 'timestamp']
test = test.drop(['timestamp'], axis=1)
test.user_id = test.user_id - 1
test.item_id = test.item_id - 1

In [2]:
def create_utility_matrix(data):
    n_users = train.user_id.max() + 1
    n_items = train.item_id.max() + 1
    ret = np.full(shape=(n_items, n_users), fill_value=np.nan)
    for u, i, r in data.to_numpy():
        ret[i][u] = r
    return pd.DataFrame(ret)

def rmse(y_true, y_pred):
    return np.sqrt(((y_pred - y_true) **2).mean())

In [5]:
no_of_features = range(1, 101)
utilMat = create_utility_matrix(train)
for f in no_of_features: 
    svdout = svd_with_x(utilMat, k=f)
    pred = [] 
    for u, i, r in train.to_numpy():
        pred.append(svdout[i, u])
    print(f, rmse(train['rating'], pred))

1 0.9827101638809858
2 0.9642641320346889
3 0.953800545865901
4 0.9451313444382188
5 0.9367068718007894
6 0.9281553126862141
7 0.9207731517917208
8 0.9135570518897865
9 0.9071242295928691
10 0.9009574339572411
11 0.8950676088580872
12 0.8892578752841643
13 0.8833827839544615
14 0.8778578837174742
15 0.8720778134493544
16 0.8666058298476599
17 0.8613299593189946
18 0.85619035404349
19 0.8510712479052189
20 0.8460253704208994
21 0.8409467692125421
22 0.8360454973544971
23 0.8312773631925247
24 0.826528328729885
25 0.8219792413505543
26 0.8171334831673368
27 0.8125841534535405
28 0.8080364072857641
29 0.8034446336309945
30 0.79895304335931
31 0.7946132518241829
32 0.7902498926579155
33 0.7858282189230463
34 0.7816593343066228
35 0.7773963312287432
36 0.7733713929316728
37 0.7692365435182758
38 0.7651013985842512
39 0.7610043644635044
40 0.7569639286689106
41 0.7528960923666287
42 0.7489307814642903
43 0.7449911511427286
44 0.7410567022393076
45 0.7372272938809824
46 0.7334626052151729
47 

In [6]:
no_of_features = range(1, 101)
utilMat = create_utility_matrix(test)
for f in no_of_features: 
    svdout = svd_with_x(utilMat, k=f)
    pred = [] 
    for u, i, r in test.to_numpy():
        pred.append(svdout[i, u])
    print(f, rmse(test['rating'], pred))

1 0.9680606213226651
2 0.9596695946717688
3 0.9495047212276849
4 0.9411762397345931
5 0.9340912913938685
6 0.9264957899567279
7 0.9193687430743771
8 0.9126047553900907
9 0.9072221029939234
10 0.9017554006229846
11 0.8963438979594661
12 0.8912804150350816
13 0.8852940849568489
14 0.8806138493155553
15 0.8757372879973704
16 0.8700941797883669
17 0.8653296356129533
18 0.8605294872506768
19 0.8565527188797193
20 0.8526596324184234
21 0.8485906971981718
22 0.8439870414716995
23 0.8391708124383592
24 0.8343541563409143
25 0.8290703271208564
26 0.8244851330456929
27 0.8202542061540294
28 0.8154064500243878
29 0.8108428650402846
30 0.8066411846194175
31 0.8027553307521126
32 0.7981941722255291
33 0.7939903409747054
34 0.7896843219530223
35 0.7853243447732816
36 0.7810867841150144
37 0.7772261769041798
38 0.773539191655241
39 0.7693639297371929
40 0.7653612109758106
41 0.7614408208091958
42 0.7577253079284041
43 0.7539348586307241
44 0.7501646608870256
45 0.7469568704400428
46 0.743461039965024

In [7]:
no_of_features = range(1, 101)
utilMat = create_utility_matrix(train)
for f in no_of_features: 
    svdout = svd_without_x(utilMat, k=f)
    pred = [] 
    for u, i, r in train.to_numpy():
        pred.append(svdout[i, u])
    print(f, rmse(train['rating'], pred))

1 1.0097997746512346
2 0.9805461590070965
3 0.9620571168690643
4 0.9517375737232584
5 0.943160000002186
6 0.9345398717690686
7 0.9266295639596663
8 0.9192908882406476
9 0.9123100745064596
10 0.905886618563938
11 0.8997012221475544
12 0.8939087609206241
13 0.8880884999418532
14 0.8823341950388544
15 0.8768365762136665
16 0.8710724462388852
17 0.865582156681882
18 0.8603241081811898
19 0.8551671215599982
20 0.8500712640962502
21 0.8450306265223034
22 0.8400950089168536
23 0.8353277548510184
24 0.8303671722021186
25 0.8256144115863627
26 0.8210703292217754
27 0.8162343615919264
28 0.8116904925834325
29 0.8071430221393007
30 0.8025493988785649
31 0.7980828976741924
32 0.7937690935688986
33 0.7893692955329201
34 0.7849610692562641
35 0.7807866505444675
36 0.776547999302913
37 0.7724613087505801
38 0.7684091521318724
39 0.7643021892657762
40 0.7601930683211857
41 0.756145855127271
42 0.7520563378269551
43 0.7481814242956527
44 0.7442377624772547
45 0.7402480395380162
46 0.7364221289104939
47

In [8]:
no_of_features = range(1, 101)
utilMat = create_utility_matrix(test)
for f in no_of_features: 
    svdout = svd_without_x(utilMat, k=f)
    pred = [] 
    for u, i, r in test.to_numpy():
        pred.append(svdout[i, u])
    print(f, rmse(test['rating'], pred))

1 0.9759548553952386
2 0.9648189247322172
3 0.9563125318980672
4 0.9487840079002973
5 0.9391248552808066
6 0.9321655514604382
7 0.9245897781058413
8 0.917608553474668
9 0.9105877015409007
10 0.9051770581936206
11 0.8994992880729384
12 0.894796928179952
13 0.8890929911769907
14 0.8835860286523524
15 0.8784804345486944
16 0.8735991950052692
17 0.8682834453188601
18 0.8634146103442852
19 0.8586725697091626
20 0.8547866913061718
21 0.8509013876086463
22 0.8468702345465098
23 0.8423055153359335
24 0.8374527514656562
25 0.8326006518592202
26 0.827591119322494
27 0.8226776127974251
28 0.8185613905006526
29 0.8138496697251648
30 0.8096111458201097
31 0.8051919452423202
32 0.8011201249601765
33 0.7966254400501862
34 0.7924318964867689
35 0.7880323581897923
36 0.7838199637610527
37 0.7795643756800956
38 0.7757379444735526
39 0.7719410184005072
40 0.767738479413144
41 0.7639859239739173
42 0.7601889819246751
43 0.7563794651517156
44 0.7526096718809111
45 0.7488406938189579
46 0.7455168848689533
4