In [1]:
def svd_with_x(train, k):
    """Rank-``k`` SVD reconstruction of a utility matrix with column-mean centering.

    Missing entries (NaN) are imputed with the mean of the observed values in
    their column, the column means are subtracted, the centered matrix is
    approximated by its ``k`` largest singular triplets, and the column means
    are added back.

    All statistics are taken along ``axis=0``, i.e. per *column* of ``train``;
    whether a column stands for a user or an item depends on how the caller
    laid the matrix out.

    Parameters
    ----------
    train : pandas.DataFrame or 2-D array-like
        Utility matrix with NaN marking unobserved entries.
    k : int
        Number of singular values/vectors to keep. Values larger than
        ``min(train.shape)`` simply keep every component.

    Returns
    -------
    numpy.ndarray
        Dense reconstruction with the same shape as ``train`` (a plain
        ndarray, never a masked array).

    Raises
    ------
    ValueError
        If ``train`` contains no observed (non-NaN) entry at all.
    """
    ratings = np.asarray(train, dtype=float)
    observed = ~np.isnan(ratings)
    counts = observed.sum(axis=0)
    if not counts.any():
        raise ValueError("train contains no observed (non-NaN) entries")

    totals = np.where(observed, ratings, 0.0).sum(axis=0)
    # A column without any observation has no mean of its own; fall back to
    # the mean of all observed entries so the imputed matrix stays finite.
    overall_mean = totals.sum() / counts.sum()
    col_means = np.where(counts > 0, totals / np.maximum(counts, 1), overall_mean)

    # Imputing with the column mean and then subtracting that same mean leaves
    # exactly 0 in every unobserved cell.
    centered = np.where(observed, ratings - col_means, 0.0)

    # np.linalg.svd returns the right singular vectors already transposed.
    U, s, Vt = np.linalg.svd(centered, full_matrices=False)
    # Scaling the kept columns of U by the singular values is equivalent to
    # multiplying by diag(s[:k]) without materialising the diagonal matrix.
    low_rank = (U[:, :k] * s[:k]) @ Vt[:k, :]
    return low_rank + col_means

def svd_without_x(train, k):
    """Rank-``k`` SVD reconstruction of a utility matrix without mean centering.

    Missing entries (NaN) are imputed with the mean of the observed values in
    their column, and the imputed matrix is approximated by its ``k`` largest
    singular triplets. Unlike ``svd_with_x`` the column means are *not*
    removed before the decomposition.

    All statistics are taken along ``axis=0``, i.e. per *column* of ``train``;
    whether a column stands for a user or an item depends on how the caller
    laid the matrix out.

    Parameters
    ----------
    train : pandas.DataFrame or 2-D array-like
        Utility matrix with NaN marking unobserved entries.
    k : int
        Number of singular values/vectors to keep. Values larger than
        ``min(train.shape)`` simply keep every component.

    Returns
    -------
    numpy.ndarray
        Dense rank-``k`` approximation with the same shape as ``train``.

    Raises
    ------
    ValueError
        If ``train`` contains no observed (non-NaN) entry at all.
    """
    ratings = np.asarray(train, dtype=float)
    observed = ~np.isnan(ratings)
    counts = observed.sum(axis=0)
    if not counts.any():
        raise ValueError("train contains no observed (non-NaN) entries")

    totals = np.where(observed, ratings, 0.0).sum(axis=0)
    # A column without any observation has no mean of its own; fall back to
    # the mean of all observed entries so the imputed matrix stays finite.
    overall_mean = totals.sum() / counts.sum()
    col_means = np.where(counts > 0, totals / np.maximum(counts, 1), overall_mean)
    imputed = np.where(observed, ratings, col_means)

    # np.linalg.svd returns the right singular vectors already transposed.
    # The factors are plain ndarrays here, so they are multiplied directly
    # (``ndarray.data`` is a raw memoryview, not an array).
    U, s, Vt = np.linalg.svd(imputed, full_matrices=False)
    # Scaling the kept columns of U by the singular values is equivalent to
    # multiplying by diag(s[:k]) without materialising the diagonal matrix.
    return (U[:, :k] * s[:k]) @ Vt[:k, :]

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

RATING_COLUMNS = ['user_id', 'item_id', 'rating', 'timestamp']


def load_ratings(path):
    """Read a tab-separated, header-less ratings file into a DataFrame.

    The file is expected to hold four columns in the order given by
    ``RATING_COLUMNS``. The timestamp column is dropped and both id columns
    are shifted down by one so they can be used directly as 0-based array
    indices (``create_utility_matrix`` indexes an array with them).

    Parameters
    ----------
    path : str or path-like
        Location of the ratings file.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id`` and ``rating``.
    """
    ratings = pd.read_csv(path, sep="\t", header=None)
    ratings.columns = RATING_COLUMNS
    ratings = ratings.drop(['timestamp'], axis=1)
    return ratings.assign(
        user_id=ratings.user_id - 1,
        item_id=ratings.item_id - 1,
    )


# Both splits go through the same loader so they cannot drift apart.
train = load_ratings('ua.base')
test = load_ratings('ua.test')

In [3]:
def create_utility_matrix(data, n_users=None, n_items=None):
    """Build a dense item-by-user rating matrix from a long ratings table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide integer, 0-based ``user_id`` and ``item_id`` columns and
        a ``rating`` column.
    n_users, n_items : int, optional
        Number of columns / rows of the result. Each defaults to the largest
        id found in ``data`` plus one. Pass them explicitly when the matrix
        has to line up with one built from another table (for example to give
        a held-out split the same shape as the training matrix).

    Returns
    -------
    pandas.DataFrame
        Shape ``(n_items, n_users)``; cell ``[i, u]`` holds the rating user
        ``u`` gave item ``i`` and NaN where no rating exists.
    """
    # Read the id columns on their own so they keep their integer dtype even
    # when ratings are floats (a combined to_numpy() would upcast everything).
    users = data['user_id'].to_numpy(dtype=np.intp)
    items = data['item_id'].to_numpy(dtype=np.intp)
    if n_users is None:
        n_users = int(users.max()) + 1 if len(users) else 0
    if n_items is None:
        n_items = int(items.max()) + 1 if len(items) else 0

    ret = np.full(shape=(n_items, n_users), fill_value=np.nan)
    # Single vectorized scatter instead of one Python-level assignment per row.
    ret[items, users] = data['rating'].to_numpy()
    return pd.DataFrame(ret)

def rmse(y_true, y_pred):
    """Root-mean-square error between two equally long value sequences.

    Both arguments are converted to float arrays first, so lists, tuples,
    NumPy arrays and pandas Series are all accepted and are always compared
    position by position (Series are never re-aligned by index label). NaN
    values propagate into the result.

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_pred - y_true) ** 2))``.

    Raises
    ------
    ValueError
        If both inputs are non-scalar and their shapes differ.
    """
    truth = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Scalars may still broadcast (e.g. a constant baseline prediction), but
    # two arrays of different shape would otherwise broadcast silently.
    if truth.ndim and predicted.ndim and truth.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{truth.shape} and {predicted.shape}"
        )
    return np.sqrt(np.mean((predicted - truth) ** 2))

In [4]:
# Training-set RMSE of the mean-centered truncated SVD for ranks 1..100.
# NOTE: svd_with_x runs a full decomposition on every call, so this loop
# performs one SVD per rank.
no_of_features = range(1, 101)
utilMat = create_utility_matrix(train)
# The utility matrix is indexed [item, user]; the lookup indices do not
# depend on the rank, so they are built once.
item_idx = train['item_id'].to_numpy()
user_idx = train['user_id'].to_numpy()
for f in no_of_features:
    svdout = svd_with_x(utilMat, k=f)
    # One vectorized gather replaces the per-rating Python loop; np.asarray
    # yields a plain ndarray whether or not svdout is a masked array.
    pred = np.asarray(svdout[item_idx, user_idx])
    print(f, rmse(train['rating'], pred))

1 0.9827101638809858
2 0.9642641320346889
3 0.953800545865901
4 0.9451313444382188
5 0.9367068718007894
6 0.9281553126862141
7 0.9207731517917208
8 0.9135570518897865
9 0.9071242295928691
10 0.9009574339572411
11 0.8950676088580872
12 0.8892578752841643
13 0.8833827839544615
14 0.8778578837174742
15 0.8720778134493544
16 0.8666058298476599
17 0.8613299593189946
18 0.85619035404349
19 0.8510712479052189
20 0.8460253704208994
21 0.8409467692125421
22 0.8360454973544971
23 0.8312773631925247
24 0.826528328729885
25 0.8219792413505543
26 0.8171334831673368
27 0.8125841534535405
28 0.8080364072857641
29 0.8034446336309945
30 0.79895304335931
31 0.7946132518241829
32 0.7902498926579155
33 0.7858282189230463
34 0.7816593343066228
35 0.7773963312287432
36 0.7733713929316728
37 0.7692365435182758
38 0.7651013985842512
39 0.7610043644635044
40 0.7569639286689106
41 0.7528960923666287
42 0.7489307814642903
43 0.7449911511427286
44 0.7410567022393076
45 0.7372272938809824
46 0.7334626052151729
47 

In [5]:
# Held-out RMSE of the mean-centered truncated SVD for ranks 1..100.
# NOTE: svd_with_x runs a full decomposition on every call, so this loop
# performs one SVD per rank.
no_of_features = range(1, 101)
utilMat = create_utility_matrix(train)
# The utility matrix is indexed [item, user]; the lookup indices do not
# depend on the rank, so they are built once.
item_idx = test['item_id'].to_numpy()
user_idx = test['user_id'].to_numpy()
for f in no_of_features:
    svdout = svd_with_x(utilMat, k=f)
    # One vectorized gather replaces the per-rating Python loop; np.asarray
    # yields a plain ndarray whether or not svdout is a masked array.
    pred = np.asarray(svdout[item_idx, user_idx])
    print(f, rmse(test['rating'], pred))

1 1.0236276195066973
2 1.0167867701152202
3 1.015758274216431
4 1.014559265003363
5 1.0113762120572047
6 1.008450433439035
7 1.0046728894841286
8 1.0039932435727523
9 1.0036202344843173
10 1.0032262970454622
11 1.00342165145686
12 1.0032947217627932
13 1.0026181410187653
14 1.0020391023998796
15 1.001482991521815
16 1.0012389152368897
17 1.0012939091758037
18 1.001486542931706
19 1.0021880153665472
20 1.002230730269796
21 1.002425233351614
22 1.0028006254702369
23 1.0034730479672613
24 1.0033036476392623
25 1.003989274281926
26 1.0037654883465161
27 1.0038647504368126
28 1.0042004430968048
29 1.0046192800093223
30 1.0046087997689788
31 1.0047684430719155
32 1.0047126783147409
33 1.0056119471025078
34 1.0057354417892572
35 1.0060623502861994
36 1.007093642823849
37 1.0069052713239843
38 1.00732300542876
39 1.007346043461391
40 1.007232199414634
41 1.0072320552028555
42 1.0070767961919187
43 1.0070300526127802
44 1.0072442073319592
45 1.0068852333327316
46 1.0074631172782305
47 1.0078231

In [6]:
# Training-set RMSE of the uncentered truncated SVD for ranks 1..100.
# NOTE: svd_without_x runs a full decomposition on every call, so this loop
# performs one SVD per rank.
no_of_features = range(1, 101)
utilMat = create_utility_matrix(train)
# The utility matrix is indexed [item, user]; the lookup indices do not
# depend on the rank, so they are built once.
item_idx = train['item_id'].to_numpy()
user_idx = train['user_id'].to_numpy()
for f in no_of_features:
    svdout = svd_without_x(utilMat, k=f)
    # One vectorized gather replaces the per-rating Python loop.
    pred = np.asarray(svdout[item_idx, user_idx])
    print(f, rmse(train['rating'], pred))

1 1.0097997746512346
2 0.9805461590070965
3 0.9620571168690643
4 0.9517375737232584
5 0.943160000002186
6 0.9345398717690686
7 0.9266295639596663
8 0.9192908882406476
9 0.9123100745064596
10 0.905886618563938
11 0.8997012221475544
12 0.8939087609206241
13 0.8880884999418532
14 0.8823341950388544
15 0.8768365762136665
16 0.8710724462388852
17 0.865582156681882
18 0.8603241081811898
19 0.8551671215599982
20 0.8500712640962502
21 0.8450306265223034
22 0.8400950089168536
23 0.8353277548510184
24 0.8303671722021186
25 0.8256144115863627
26 0.8210703292217754
27 0.8162343615919264
28 0.8116904925834325
29 0.8071430221393007
30 0.8025493988785649
31 0.7980828976741924
32 0.7937690935688986
33 0.7893692955329201
34 0.7849610692562641
35 0.7807866505444675
36 0.776547999302913
37 0.7724613087505801
38 0.7684091521318724
39 0.7643021892657762
40 0.7601930683211857
41 0.756145855127271
42 0.7520563378269551
43 0.7481814242956527
44 0.7442377624772547
45 0.7402480395380162
46 0.7364221289104939
47

In [7]:
# Held-out RMSE of the uncentered truncated SVD for ranks 1..100.
# NOTE: svd_without_x runs a full decomposition on every call, so this loop
# performs one SVD per rank.
no_of_features = range(1, 101)
utilMat = create_utility_matrix(train)
# The utility matrix is indexed [item, user]; the lookup indices do not
# depend on the rank, so they are built once.
item_idx = test['item_id'].to_numpy()
user_idx = test['user_id'].to_numpy()
for f in no_of_features:
    svdout = svd_without_x(utilMat, k=f)
    # One vectorized gather replaces the per-rating Python loop.
    pred = np.asarray(svdout[item_idx, user_idx])
    print(f, rmse(test['rating'], pred))

1 1.024777906785842
2 1.0202984087770544
3 1.0134236424532117
4 1.0124247420047818
5 1.0112195571766207
6 1.0078247474442796
7 1.0060910779881054
8 1.00356299198587
9 1.0028926471758393
10 1.0026264434615428
11 1.002197621350039
12 1.0024972912504415
13 1.0023464712565682
14 1.0019857830376824
15 1.0014207405805948
16 1.0009839883973888
17 1.0009999854587888
18 1.0010420762457661
19 1.001181985829426
20 1.0019580951831777
21 1.0021087255311376
22 1.0021281841379557
23 1.0028071560637495
24 1.002999177056129
25 1.0028368230318347
26 1.0035055094116583
27 1.003247912809257
28 1.0033874530423104
29 1.0037211794717293
30 1.00404152738291
31 1.0042202729622276
32 1.0042689396512314
33 1.0042057954476256
34 1.0051388826995509
35 1.005231483259268
36 1.0055015956097635
37 1.0067497649718482
38 1.0064276014443307
39 1.006781598916967
40 1.0068681162919564
41 1.0065431090983792
42 1.0064669814214373
43 1.006737734599504
44 1.006619397968308
45 1.006625379239925
46 1.0062803850263462
47 1.006935