In [2]:
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
import pandas as pd
import numpy as np
import pickle
from pandas.algos import nancorr
from sys import getsizeof

# Computing item-item relationships

In [3]:
def con_cur_to_bgg_db_tuples(host='34.216.22.202',
                             dbname='postgres',
                             user='postgres'):
    """Open a psycopg2 connection and return it together with a fresh cursor.

    The connection settings are keyword parameters whose defaults are the
    values that were previously hard-coded, so a zero-argument call behaves
    exactly as before.

    Parameters
    ----------
    host, dbname, user : str
        Passed unchanged to ``pg2.connect``.

    Returns
    -------
    tuple
        ``(con, cur)`` where ``cur`` is ``con.cursor()``. The caller owns the
        connection and is responsible for closing it.
    """
    # NOTE(review): the default host is a literal IP address; consider moving
    # it to configuration or an environment variable.
    con = pg2.connect(host=host,
                      dbname=dbname,
                      user=user)
    cur = con.cursor()
    return con, cur

In [11]:
def sizeof(foo):
    """Return ``sys.getsizeof(foo)`` scaled from bytes to units of 1e9 bytes.

    The figure is whatever ``getsizeof`` reports for the object itself; for a
    container that does not include the objects it refers to.
    """
    n_bytes = getsizeof(foo)
    return n_bytes * 1e-9

In [9]:
def get_data():
    """Fetch every rating row from the database.

    Runs a single SELECT joining ``clean_ratings`` to ``clean_users`` and
    ``boardgames`` and returns the result of ``cur.fetchall()``; each row
    holds ``(u.us, g.gs, rating)`` in that order.

    The connection is closed in a ``finally`` clause so it is released even
    when the query or the fetch raises.
    """
    con, cur = con_cur_to_bgg_db_tuples()
    try:
        # The continuation lines keep their original leading whitespace so the
        # SQL text sent to the server is unchanged.
        cur.execute('SELECT u.us, g.gs, rating FROM clean_ratings r\
                 INNER JOIN clean_users u\
                 ON r.uid=u.uid\
                 INNER JOIN boardgames g\
                 ON r.gid=g.gid;')
        rating_tuples = cur.fetchall()
    finally:
        con.close()
    return rating_tuples

In [10]:
# Pull all (us, gs, rating) rows from the database into memory.
rating_tuples = get_data()

In [11]:
# getsizeof-based size of the fetched result, in units of 1e9 bytes.
# NOTE(review): if this is a list, the figure covers the list object only,
# not the row tuples it references.
sizeof(rating_tuples)

0.086711224

In [7]:
def to_sparse(tuples, n_users=118632, n_items=10000):
    """Scatter rating records into a dense ``(n_users, n_items)`` array.

    Each record is indexed positionally: element 0 and element 1 are treated
    as 1-based row and column numbers, element 2 is the value stored there.
    Cells that no record touches stay 0. If the same cell appears more than
    once, the last record wins.

    Despite the name, the result is an ordinary ``np.zeros`` array, so the
    default shape allocates ``118632 * 10000`` floats.
    """
    ratings = np.zeros((n_users, n_items))
    for record in tuples:
        user_idx = record[0] - 1
        item_idx = record[1] - 1
        ratings[user_idx, item_idx] = record[2]
    return ratings

In [8]:
# Build the dense rating array with to_sparse's default 118632 x 10000 shape.
sparse_mat = to_sparse(rating_tuples)

In [5]:
import pickle

In [6]:
# Load rating tuples from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load files from a trusted source.
# NOTE(review): this rebinds `rating_tuples`, the same name the get_data()
# cell assigns — confirm which source is intended on a top-to-bottom run.
with open('../data/rating_tuples_full', 'rb') as f:
    rating_tuples = pickle.load(f)

In [12]:
# getsizeof-based size of the dense rating array, in units of 1e9 bytes.
sizeof(sparse_mat)

9.490560112

In [157]:
def calc_sparsity(sparse_mat):
    """Print the share of nonzero cells in a 2-D array as a percentage.

    The number printed after the ``Sparsity:`` label is
    ``100 * (nonzero cells) / (rows * columns)``, i.e. how full the matrix
    is. Nothing is returned.
    """
    n_cells = sparse_mat.shape[0] * sparse_mat.shape[1]
    n_filled = float(len(sparse_mat.nonzero()[0]))
    pct = n_filled / n_cells
    pct = pct * 100
    print('Sparsity: {:4.2f}%'.format(pct))

In [158]:
# Prints the percentage of cells in sparse_mat that are nonzero.
calc_sparsity(sparse_mat)

Sparsity: 0.91%


In [13]:
def cardinality_set(sparse_mat):
    """Count, for every pair of columns, the rows where both are nonzero.

    A 0/1 indicator of the nonzero cells is built and multiplied with its own
    transpose, so entry ``[i, j]`` of the result is the number of rows in
    which column ``i`` and column ``j`` are both nonzero; the diagonal holds
    each column's nonzero count.

    Parameters
    ----------
    sparse_mat : np.ndarray
        2-D array in which 0 means "no value".

    Returns
    -------
    np.ndarray
        Square array of shape ``(n_cols, n_cols)`` with the same dtype as
        ``sparse_mat``.
    """
    rated = sparse_mat.nonzero()
    # Must start from zeros: np.empty_like leaves the unrated cells holding
    # arbitrary memory, which corrupts every count in the product below.
    rated_mat = np.zeros_like(sparse_mat)
    rated_mat[rated] = 1
    card_mat = rated_mat.T.dot(rated_mat)

    return card_mat

def corr(sparse_mat):
    """Pairwise Pearson correlation between columns, treating 0 as missing.

    Zeros are replaced by NaN in a float64 copy of the input, and each pair
    of columns is correlated over the rows where both are present. Pairs
    whose correlation is undefined (no shared rows, or no variance over the
    shared rows) come back as NaN from pandas and are mapped to 0 by
    ``np.nan_to_num``.

    The computation goes through the public ``DataFrame.corr`` rather than
    the private ``pandas.algos.nancorr`` helper, which is not available in
    current pandas releases.

    Parameters
    ----------
    sparse_mat : np.ndarray
        2-D numeric array; it is not modified.

    Returns
    -------
    np.ndarray
        ``(n_cols, n_cols)`` float array of correlations.
    """
    # astype always copies, so the caller's array keeps its zeros; casting to
    # float64 also lets integer input accept the NaN markers.
    nan_mat = sparse_mat.astype(np.float64)
    nan_mat[nan_mat == 0] = np.nan
    corr_mat = pd.DataFrame(nan_mat).corr(method='pearson', min_periods=1).values
    corr_mat = np.nan_to_num(corr_mat)

    return corr_mat

def correct_corr(corr_mat, card_mat, alpha=5):
    """Scale each correlation by ``card / (card + alpha)``.

    All three steps are elementwise: the factor is 0 where ``card_mat`` is 0
    and approaches 1 as ``card_mat`` grows relative to ``alpha``.

    Returns the elementwise product of ``corr_mat`` and that factor.
    """
    shrink_factor = np.divide(card_mat, np.add(card_mat, alpha))
    return np.multiply(corr_mat, shrink_factor)

In [14]:
# Column-by-column co-occurrence counts of nonzero entries in sparse_mat.
card_mat = cardinality_set(sparse_mat)

In [16]:
# Column-by-column correlations with zeros treated as missing values.
corr_mat = corr(sparse_mat)

In [17]:
# Persist the unscaled correlation matrix as a pickle.
with open('../data/corr_mat_full', 'wb') as f:
    pickle.dump(corr_mat, f)

In [19]:
# Scale the correlations by card / (card + alpha) using the default alpha=5.
corr_ect = correct_corr(corr_mat, card_mat)

In [161]:
# Persist the scaled correlation matrix as a pickle.
with open('../data/corr_ect_full', 'wb') as f:
    pickle.dump(corr_ect, f)

In [18]:
# getsizeof-based size of the correlation matrix, in units of 1e9 bytes.
sizeof(corr_mat)

0.8000001120000001

# Calculating item bias

In [172]:
def get_item_bias_vec(sparse_mat, rated, lam=25):
    """Per-column sum of values divided by ``(number of entries + lam)``.

    Parameters
    ----------
    sparse_mat : np.ndarray
        2-D array whose columns are summed.
    rated : tuple of index arrays
        Only ``rated[1]`` is read: one column index per counted entry.
        Presumably this is ``sparse_mat.nonzero()`` — confirm against the
        caller.
    lam : number, default 25
        Constant added to every column's count before dividing.

    Returns
    -------
    np.ndarray
        1-D array with one value per column of ``sparse_mat``.
    """
    sum_ratings = np.sum(sparse_mat, axis=0)
    # bincount with minlength yields a count for every column, including
    # columns that never appear in rated[1]. np.unique(return_counts=True)
    # silently omitted those, leaving a shorter vector that no longer lined
    # up with sum_ratings.
    count = np.bincount(rated[1], minlength=sparse_mat.shape[1])
    lambda_count = np.add(count, lam)
    item_bias = np.divide(sum_ratings, lambda_count)
    return item_bias

In [173]:
# The nonzero indices are derived here instead of reading a notebook-level
# `rated` variable, which no visible cell defines (it would raise NameError on
# a fresh kernel).
item_bias = get_item_bias_vec(sparse_mat, sparse_mat.nonzero())

In [174]:
# Sanity check: expect one entry per column of sparse_mat.
item_bias.shape

(10000,)

In [177]:
# Persist the item bias vector as a pickle.
# NOTE(review): this path starts with 'data/' while earlier cells write to
# '../data/' — confirm the intended working directory.
with open('data/item_bias_vec_full', 'wb') as f:
    pickle.dump(item_bias, f)

# Removing expansions from recommendations

In [318]:
def get_exp():
    """Fetch pairs of ``boardgames.gs`` values linked through ``re_ex``.

    For each ``re_ex`` row the query returns ``(b.gs, c.gs)``, where ``b`` is
    the boardgame matched on ``re_ex.gid`` and ``c`` is the boardgame matched
    on ``re_ex.rx``. The result of ``cur.fetchall()`` is returned.

    The connection is closed in a ``finally`` clause so it is released even
    when the query or the fetch raises.
    """
    con, cur = con_cur_to_bgg_db_tuples()
    try:
        # The continuation lines keep their original leading whitespace so the
        # SQL text sent to the server is unchanged.
        cur.execute('SELECT b.gs, c.gs FROM re_ex a \
                 INNER JOIN boardgames b \
                 ON a.gid=b.gid \
                 INNER JOIN boardgames c \
                 ON a.rx=c.gid;')
        exp = cur.fetchall()
    finally:
        con.close()
    return exp

In [319]:
# Fetch the (b.gs, c.gs) pairs linked through the re_ex table.
exp = get_exp()

In [328]:
def exp_to_sparse(exp_tuples, n_items=10000):
    """Turn index pairs into a dense 0/1 ``(n_items, n_items)`` array.

    For every pair ``(a, b)`` in ``exp_tuples`` the cell ``[a - 1, b - 1]`` is
    set to 1 (the inputs are treated as 1-based); every other cell is 0.

    Parameters
    ----------
    exp_tuples : iterable of sequences
        Only elements 0 and 1 of each entry are read.
    n_items : int, default 10000
        Side length of the square output. The default matches the size that
        was previously hard-coded.

    Returns
    -------
    np.ndarray
        Float array of shape ``(n_items, n_items)``.
    """
    exp_mat = np.zeros([n_items, n_items])
    for row in exp_tuples:
        exp_mat[row[0]-1, row[1]-1] = 1
    return exp_mat

In [329]:
# Dense 0/1 array marking each fetched (b.gs, c.gs) pair.
exp_mat = exp_to_sparse(exp)

In [466]:
# Persist the pair-indicator matrix as a pickle.
# NOTE(review): this path starts with 'data/' while earlier cells write to
# '../data/' — confirm the intended working directory.
with open('data/exp_mat_full', 'wb') as f:
    pickle.dump(exp_mat, f)