In [1]:
#Imports
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse
#import jovian

In [2]:
# Creates a database, clears if exists and fills with training data
import sqlite3
import codecs
import random

# Connect to database (using sqlite3 lib built into python)
conn = sqlite3.connect( 'comp3208_recommender2.db')

# Read in training data file
readHandle = codecs.open( 'comp3208-train.csv', 'r', 'utf-8', errors = 'replace' )
listLines = readHandle.readlines()
readHandle.close()
    
# Creates a table for training data, deletes all data
c = conn.cursor()
c.execute( 'CREATE TABLE IF NOT EXISTS training_table (UserID INT, ItemID INT, Rating FLOAT, Prediction1 FLOAT, Prediction2 FLOAT)' )
conn.commit()
c.execute( 'DELETE FROM training_table' )
conn.commit()
# Inserts data from csv file into training data table (userid, itemid, rating, timestamp - currently not included)
for strLine in listLines :
    if len(strLine.strip()) > 0 :
        listParts = strLine.strip().split(',')
        allFloats = True
        for part in listParts:
            try:
                float(part)
            except ValueError:
                print("Value error", repr(part))
                allFloats = False
        if allFloats == True:
            if len(listParts) == 4 :
                # Insert training set into table with an initial predicted rating of 0
                c.execute( 'INSERT INTO training_table VALUES (?,?,?,?,?)', (listParts[0], listParts[1], listParts[2], random.random() * 5, random.random() * 5) )
            else :
                raise Exception( 'failed to parse csv : ' + repr(listParts) )
        else:
            allFloats = True
conn.commit()
    
# Creates an index of the user ID and item ID
c.execute( 'CREATE INDEX IF NOT EXISTS training_table_index on training_table (UserID, ItemID)' )
conn.commit()

Value error 'rating'


In [3]:
# Inserts testing data into a table in database

# Reads in data
readHandle = codecs.open( 'comp3208-test.csv', 'r', 'utf-8', errors = 'replace' )
listLines = readHandle.readlines()
readHandle.close()

# Create and clear table for testing data
c2 = conn.cursor()
c2.execute( 'CREATE TABLE IF NOT EXISTS testing_table (UserID INT, ItemID INT, Rating FLOAT, PredRating FLOAT)' )
conn.commit()
c2.execute( 'DELETE FROM testing_table' )
conn.commit()
# Inserts data from csv file into testing data table (userid, itemid, timestamp - currently not used)
for strLine in listLines :
    if len(strLine.strip()) > 0 :
        listParts = strLine.strip().split(',')
        if len(listParts) == 3 :
            # Insert training set into table with a rating and predicted rating of 0
            c2.execute( 'INSERT INTO testing_table VALUES (?,?,?,?)', (listParts[0], listParts[1], 0, 0) )
        else :
            raise Exception( 'failed to parse csv : ' + repr(listParts) )
conn.commit()

In [4]:
# Get all user ID', item ID's and ratings from training table in database, store in pandas dataframe
c.execute('SELECT UserID, ItemID, Rating FROM training_table')
items = []
[items.append([x[0], x[1], x[2]]) for x in c.fetchall()]
ratings_df = pd.DataFrame(items, columns=["UserID", "ItemID", "Rating"])
ratings_df.head()

Unnamed: 0,UserID,ItemID,Rating
0,1,12793,4.0
1,1,31216,4.0
2,1,38744,3.5
3,1,113,1.5
4,1,33000,3.0


In [5]:
# Get all user ID', item ID's and ratings from testing table in database, store in pandas dataframe
c.execute('SELECT UserID, ItemID, Rating FROM testing_table')
items = []
[items.append([x[0], x[1]]) for x in c.fetchall()]
test_df = pd.DataFrame(items, columns=["UserID", "ItemID"])
test_df.head()

Unnamed: 0,UserID,ItemID
0,1,18953
1,1,47984
2,1,42356
3,1,14254
4,1,27049


In [31]:
# Output count of each rating 
Counter(ratings_df.Rating)

Counter({4.0: 7024679,
         3.5: 3234206,
         1.5: 419265,
         3.0: 5240313,
         4.5: 2254357,
         2.5: 1305118,
         5.0: 3867596,
         2.0: 1758103,
         1.0: 841828,
         0.5: 420307})

In [32]:
# Show number of ratings made by each user
Counter(ratings_df.groupby(["UserID"]).count()["ItemID"])

Counter({99: 651,
         14: 15492,
         509: 37,
         75: 749,
         74: 771,
         38: 1914,
         25: 2751,
         806: 7,
         15: 13185,
         19: 5470,
         106: 535,
         97: 679,
         333: 82,
         1: 5726,
         122: 413,
         6: 5327,
         17: 6194,
         44: 1470,
         18: 5036,
         62: 949,
         29: 2387,
         79: 713,
         16: 8058,
         36: 1771,
         10: 6102,
         665: 21,
         114: 474,
         316: 81,
         69: 840,
         3: 3606,
         32: 2029,
         5: 7402,
         261: 109,
         133: 355,
         52: 1349,
         24: 2948,
         135: 343,
         9: 4929,
         169: 219,
         233: 146,
         7: 4068,
         42: 1477,
         104: 538,
         1345: 2,
         23: 3093,
         240: 121,
         571: 26,
         134: 356,
         43: 1443,
         241: 148,
         11: 4298,
         81: 657,
         65: 924,
         26: 2

In [33]:
# Show average number of ratings per user
np.mean(ratings_df.groupby(["UserID"]).count()["ItemID"])

93.18173528892031

In [6]:
# FOR TESTING: Split into training and testing
train_df, test_df = train_test_split(ratings_df, test_size=0.2)

# Reset index values of train and test set
train_df = train_df.reset_index()[["UserID", "ItemID", "Rating"]]
test_df = test_df.reset_index()[["UserID", "ItemID", "Rating"]]

In [34]:
# Reset indexes of training and testing data
train_df = ratings_df.reset_index()[["UserID", "ItemID", "Rating"]]
test_df = test_df.reset_index()[["UserID", "ItemID"]]

In [7]:
# Encode columns to have continuous IDs
def encode_column(column):
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [8]:
# Encode the whole dataframe to have continuous IDs
def encode_df(rate_df):
    movie_ids, rate_df["ItemID"], num_movies = encode_column(rate_df["ItemID"])
    user_ids, rate_df["UserID"], num_users = encode_column(rate_df["UserID"])
    return rate_df, num_users, num_movies, user_ids, movie_ids

In [9]:
# Output number of users and movies after encoding
rate_df, num_users, num_movies, user_ids, movie_ids = encode_df(train_df)
print("Number of users:", num_users)
print("Number of movies:", num_movies)
rate_df.head()

Number of users: 281631
Number of movies: 51045


Unnamed: 0,UserID,ItemID,Rating
0,0,0,4.0
1,1,1,3.0
2,2,2,3.0
3,3,3,3.0
4,4,4,5.0


In [10]:
# Creates a random matrix with shape (n,K)
def create_embeddings(n, K):
    return 11*np.random.random((n,K)) / K

In [11]:
# Creates a sparse utilitiy matrix
def create_sparse_matrix(df, rows, cols, column_name="Rating"):
    return sparse.csc_matrix((df[column_name].values,(df["UserID"].values, df["ItemID"].values)), shape=(rows, cols))

In [12]:
rate_df, num_users, num_movies, user_ids, movie_ids = encode_df(train_df)
Y = create_sparse_matrix(rate_df, num_users, num_movies)

In [13]:
# Makes the prediction using elementwise multiplication to avoid making a dense matrix
def predict(df, emb_user, emb_movie):
    df["Prediction"] = np.sum(np.multiply(emb_movie[df["ItemID"]],emb_user[df["UserID"]]), axis=1)
    #new_pred = []
    #for pred in df["Prediction"]:
    #    if pred > 5: 
    #        new_pred.append(5)
    #    else:
    #        new_pred.append(pred)
    #df["Prediction"] = new_pred
    #df["Prediction"] = [round(item*2)/2 for item in df["Prediction"]]
    return df

In [15]:
lmbda = 0.0003

In [16]:
# Calculates the cost (MSE)
def cost(df, emb_user, emb_movie):
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_movie.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_movie), emb_user.shape[0], emb_movie.shape[0], 'Prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0]

In [17]:
# Calculates the gradient for the user and movie embeddings
def gradient(df, emb_user, emb_movie):
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_movie.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_movie), emb_user.shape[0], emb_movie.shape[0], 'Prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_movie) + 2*lmbda*emb_user
    grad_movie = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_movie
    return grad_user, grad_movie

In [86]:
# Calculates the gradient descent with the momentum beta
def gradient_descent(df, emb_user, emb_movie, iterations=2000, learning_rate=0.01, df_val=None):
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_movie.shape[0])
    beta = 0.9
    grad_user, grad_movie = gradient(df, emb_user, emb_movie)
    v_user = grad_user
    v_movie = grad_movie
    for i in range(iterations):
        grad_user, grad_movie = gradient(df, emb_user, emb_movie)
        v_user = beta*v_user + (1-beta)*grad_user
        v_movie = beta*v_movie + (1-beta)*grad_movie
        emb_user = emb_user - learning_rate*v_user
        emb_movie = emb_movie - learning_rate*v_movie
        if(not (i+1)%50):
            print("\niteration", i+1, ":")
            print("train mse:",  cost(df, emb_user, emb_movie))
            if df_val is not None:
                print("validation mse:",  cost(df_val, emb_user, emb_movie))
    return emb_user, emb_movie

In [85]:
emb_user = create_embeddings(num_users, 3)
emb_movie = create_embeddings(num_movies, 3)
emb_user, emb_movie = gradient_descent(rate_df, emb_user, emb_movie, iterations=450, learning_rate=1)

  df["Prediction"] = np.sum(np.multiply(emb_movie[df["ItemID"]],emb_user[df["UserID"]]), axis=1)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


KeyboardInterrupt: 

# Predictions

In [20]:
# Removes data not seen before
def encode_new_data(valid_df, user_ids, movie_ids):
    df_val_chosen = valid_df["ItemID"].isin(movie_ids.keys()) & valid_df["UserID"].isin(user_ids.keys())
    #[print(x) for x=false in df_val_chosen]
    #for i in range(len(df_val_chosen)):
        #if df_val_chosen[i] == False:
            #print(i)
    #print(df_val_chosen)
    valid_df = valid_df[df_val_chosen]
    #print(valid_df)
    valid_df["ItemID"] = np.array([movie_ids[x] for x in valid_df["ItemID"]])
    valid_df["UserID"] = np.array([user_ids[x] for x in valid_df["UserID"]])
    return valid_df

In [21]:
def unknown_data(invalid_df, user_ids, movie_ids):
    df_not_chosen = invalid_df["ItemID"].isin(movie_ids.keys()) & invalid_df["UserID"].isin(user_ids.keys())
    #print(df_not_chosen.keys())
    invalid_df = invalid_df[np.logical_not(df_not_chosen)]
    invalid_df["ItemID"] = np.array([test_movie_ids[x] for x in invalid_df["ItemID"]])
    invalid_df["UserID"] = np.array([test_user_ids[x] for x in invalid_df["UserID"]])
    return valid_df

In [22]:
print("Before encoding:", test_df.shape)
valid_df = encode_new_data(test_df, user_ids, movie_ids)
invalid_df = unknown_data(test_df, user_ids, movie_ids)
print("After encoding:", valid_df.shape)
print("Unknown shape:", invalid_df.shape)

Before encoding: (5273155, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df["ItemID"] = np.array([movie_ids[x] for x in valid_df["ItemID"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df["UserID"] = np.array([user_ids[x] for x in valid_df["UserID"]])


NameError: name 'test_movie_ids' is not defined

In [41]:
en_test_df, num_test_users, num_test_movies, test_user_ids, test_movie_ids = encode_df(test_df)

In [23]:
print("After encoding:", valid_df.shape)

After encoding: (4965355, 3)


In [None]:
invalid_df = []
invalid_df["UserID"] = np.array(test_user_ids[x] for x in )

In [61]:
new_df = predict(valid_df, emb_user, emb_movie)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Prediction"] = np.sum(np.multiply(emb_movie[df["ItemID"]],emb_user[df["UserID"]]), axis=1)


In [49]:
# Output the 70-80th columns of predicted data
valid_df[6010:7720].head()

Unnamed: 0,UserID,ItemID,Rating,Prediction
6010,5719,1721,5.0,3.514622
6011,5720,601,5.0,1.21598
6012,5721,318,4.0,4.552377
6013,5722,948,3.0,5.102184
6014,5723,654,4.5,2.66218


In [27]:
# Calculates the MSE o the training and testing data
train_mse = cost(train_df, emb_user, emb_movie)
val_mse = cost(valid_df, emb_user, emb_movie)
print(train_mse, val_mse)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Prediction"] = np.sum(np.multiply(emb_movie[df["ItemID"]],emb_user[df["UserID"]]), axis=1)


3.815652131050122 13.66687418302316


In [25]:
new_pred = []
for pred in valid_df["Prediction"]:
    if pred > 5: 
        new_pred.append(5)
    else:
        new_pred.append(pred)
print(new_pred[7040:7044])
valid_df["Prediction"] = new_pred

[5, 5, 4.55944452289862, 3.6378932723426187]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df["Prediction"] = new_pred


In [26]:
new_pred = []
for pred in valid_df["Prediction"]: 
    new_pred.append(round(pred*2)/2)
print(new_pred[7040:7044])
valid_df["Prediction"] = new_pred

[5.0, 5.0, 4.5, 3.5]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df["Prediction"] = new_pred


In [None]:
for item in valid_df:
    c.execute('INSERT INTO testing_table_2 VALUES (?,?,?)', (valid_df["UserID"], listParts["ItemID"], listParts["Prediction"]) )

In [57]:
forSQL = []
for i in :
    print(i)
    forSQL.append((valid_df["Prediction"][i], valid_df["UserID"][i], valid_df["ItemID"][i]))

0
1
2


KeyError: 2

In [82]:
records = valid_df.to_records(index=False)
result = list(records)
print(result[60:70])

[(222761, 26361, 4., 3.83868437), (214986, 28862, 4.5, 0.86828588), (262843, 11122, 5., 3.63222815), (1336, 12232, 5., 6.54612788), (220946, 34513, 4., 7.85490807), (34262, 25586, 3., 12.92640097), (26941, 27972, 4., 5.79248552), (272465, 20136, 3.5, 3.28857748), (219439, 9679, 3., 3.74697019), (10720, 27041, 3., 3.90267993)]


In [83]:
forSQL = []
for item in result:
    forSQL.append(item[3], item[0], item[1])

TypeError: append() takes exactly one argument (3 given)

In [59]:
c.executemany('UPDATE testing_table SET PredRating = ? WHERE UserID = ? AND ItemID = ?', valid_df)

ProgrammingError: Incorrect number of bindings supplied. The current statement uses 3, and there are 6 supplied.