In [1]:
import pandas as pd
import random
import csv
import gzip
import urllib.request
import tarfile
from collections import defaultdict
import scipy.optimize
import numpy as np
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

In [5]:
def readCSV(path):
    """Yield ``(user_id, recipe_id, rating)`` triples from a gzipped CSV file.

    The first row is treated as the header. Every following row is zipped
    against that header, so the column order in the file does not matter as
    long as ``user_id``, ``recipe_id`` and ``rating`` are present.

    Args:
        path: Location of the gzip-compressed CSV file.

    Yields:
        Tuples of three strings, exactly as read from the file (no numeric
        conversion is performed).

    Raises:
        KeyError: If a row lacks one of the three required columns.
    """
    # The ``with`` block closes the gzip handle once the generator is
    # exhausted or garbage-collected, instead of leaking it.
    with gzip.open(path, 'rt') as f:
        c = csv.reader(f)
        header = next(c, None)
        if header is None:
            # Empty file: produce nothing rather than letting StopIteration
            # escape the generator (which Python turns into RuntimeError).
            return
        for l in c:
            d = dict(zip(header, l))
            yield d['user_id'], d['recipe_id'], d['rating']

In [6]:
# Download the assignment archive and extract it into the current working
# directory.
# extract gz files

url = 'http://cseweb.ucsd.edu/classes/fa21/cse258-b/files/assignment1.tar.gz'

# Context managers close both the HTTP response and the tar stream even if
# extraction fails part-way through.
with urllib.request.urlopen(url) as ftpstream:
    # mode "r|gz" reads the archive as a forward-only stream, so the response
    # object can be consumed directly without saving the tarball to disk first.
    with tarfile.open(fileobj=ftpstream, mode="r|gz") as thetarfile:
        # NOTE(review): extractall() trusts the member paths inside the
        # archive; only run this against a source you trust.
        thetarfile.extractall()

In [68]:
# Materialize every (user_id, recipe_id, rating) triple from the training file
# into a list, then preview the first two rows.
data = [row for row in readCSV("assignment1/trainInteractions.csv.gz")]
data[:2]

[('88348277', '03969194', '5'), ('86699739', '27096427', '4')]

In [70]:
# Wrap the raw triples in a DataFrame with named columns.
# Note: this rebinds `data`, so the cell cannot be re-run without re-running
# the cell that loads the raw triples first.
data = pd.DataFrame(
    data,
    columns=['user_id', 'recipe_id', 'rating'],
)
# Preview the first two rows.
data.iloc[:2]

Unnamed: 0,user_id,recipe_id,rating
0,88348277,3969194,5
1,86699739,27096427,4


In [71]:
# Declare the rating bounds (0 to 5) and convert the DataFrame into a Surprise
# Dataset. `data` is rebound from a DataFrame to a Dataset here.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df=data, reader=reader)

In [72]:
# Hold out 20% of the interactions for validation. The seed is fixed so the
# split, and therefore the validation error computed from it, is the same on
# every run.
trainset, testset = train_test_split(data, test_size=.2, random_state=0)


In [100]:
# Fit a 1-factor SVD on the training split and report its mean squared error
# on the held-out split. random_state pins the factor initialisation so the
# printed metric is reproducible across runs.
model = SVD(n_factors=1, n_epochs=10, reg_all=0.001, random_state=0)

model.fit(trainset)
predictions = model.test(testset)

# Sum of squared errors between the true rating (r_ui) and the estimate (est).
sse = sum((p.r_ui - p.est) ** 2 for p in predictions)

# Dividing by the number of predictions turns the sum into the MSE.
print(sse / len(predictions))

0.8258559268252678


In [101]:
# Refit on all available interactions (no hold-out), using the same
# hyperparameters as the validation run.
fulldata = data.build_full_trainset()
model = SVD(
    n_factors=1,
    n_epochs=10,
    reg_all=0.001,
)
model.fit(fulldata)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x28b6ad100>

In [102]:
# Write one rating estimate per user-recipe pair listed in the stub file.
# Both files are opened with context managers so they are flushed and closed
# even if a line fails to parse or a prediction raises.
with open("predictions_Rated.txt", 'w') as predictions, open("stub_Rated.txt") as stub:
    for l in stub:
        if l.startswith("user_id"):
            # Header row: copy it through unchanged.
            predictions.write(l)
            continue
        # Each remaining line is split on '-' into a user id and a recipe id.
        u, i = l.strip().split('-')
        predictions.write(u + '-' + i + ',' + str(model.predict(u, i).est) + '\n')