## Imports

In [1]:
import pandas as pd
import numpy as np
import scipy as sp

# Operating System
import os

# Numpy, Pandas and Scipy
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, save_npz, load_npz

# Scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Model Evaluation
from evaluation import evaluate

In [2]:
from validate_sample_submission import validate_submission

## Load data

In [3]:
df_ratings = pd.read_csv('BookRatings.csv')
df_users = pd.read_csv('BooksUsers.csv')
# df_books = pd.read_csv('BooksMetaInfo.csv')
df_test = pd.read_csv('test_users.csv')

In [4]:
test_users = df_test.values.ravel().tolist()

## Non-personalised recommendations

In [65]:
def pick_10_best(df_ratings):
    '''Returns 10 ISBN numbers according to the following criteria:
    - rated at least 80 times
    - ranked according to average rating
    '''
    
    # create ratings matrix
    ratings_frame = df_ratings.set_index(['User-ID', 'ISBN']).unstack() # this is slow
    ratings_frame.index.name = None
    
    # summarize
    books_summary = pd.DataFrame(index=ratings_frame.columns)
    books_summary['mean_rating'] = ratings_frame.mean(skipna=True)
    books_summary['n_ratings'] = ratings_frame.count()
    books_summary['highest_rating'] = ratings_frame.max()
    books_summary['lowest_rating'] = ratings_frame.min()

    # select
    top_10 = books_summary[books_summary.n_ratings>=30]\
                    .sort_values('mean_rating', ascending=False)\
                    .head(10)\
                    .index.get_level_values('ISBN').tolist()
    return top_10

In [66]:
top_10 = pick_10_best(df_ratings)

In [67]:
def generate_non_personalized_submission(top_10, test_users):
    
    # sizes
    n_users = len(test_users)
    n_books = 10
    
    # output
    recs = pd.DataFrame()
    recs['User-ID'] = [user for user in test_users for book in range(n_books)]
    recs['ISBN'] = top_10*n_users
    # recs = recs.set_index('User-ID')
    
    return recs

In [68]:
non_pers_submission = generate_non_personalized_submission(top_10, test_users)

if validate_submission(non_pers_submission):
    non_pers_submission.to_csv('non_pers_submission.csv')

## Collaborative Filtering 

In [9]:
def make_ratings(data: pd.DataFrame) -> csr_matrix:
    """Creates the ratings matrix of listening history with optional shape
    
    Creates the ratings matrix from the listening history imported using the read_users_history() method.
    
    Args:
        data (pd.DataFrame):  Listening history for the users.
        shape (tuple): The overall (n_users, n_items) shape desired for the matrix. 
                       If None, define the shape with the (n_users, n_items) from data argument.
        
    Returns:
        ratings (csr_matrix): Ratings matrix with shape (n_users, n_items).
    
    """
    users, user_pos = np.unique(data.iloc[:, 0].values, return_inverse=True)
    items, item_pos = np.unique(data.iloc[:, 1].values, return_inverse=True)
    values = data.iloc[:, 2].fillna(0).values
    
    #R Matrix dimensions (n_users, n_items)
    shape = (len(users), len(items))

    R_ = csr_matrix((values, (user_pos, item_pos)), shape=shape)
    return R_
#ratings_user = make_ratings(df_ratings)
#ratings_user

In [10]:
test_size = 0.2

def make_train_val_split(data: pd.DataFrame, test_size : float = 0.2):
    """Split the data into train and validation and returns the ratings matrixes accordingly.
    
    Args:
        data (pd.DataFrame): Listening history for the users.
        test_size (float): Percentage of listening history used for validation.
    
    Returns:
        ratings_train (csr_matrix): Ratings matrix for train.
        ratings_val (csr_matrix): Ratings matrix for validation.
    
    """
    train_data, val_data = train_test_split(data, test_size=test_size, random_state=8)

    #Store the indexes of each observation to identify which records to replace with zero
    train_index = train_data.index
    val_index = val_data.index

    #make copies of data to replace the observations
    train_data_clean = data.copy()
    val_data_clean = data.copy()

    #Replace the validation observations on the training data
    train_data_clean.loc[val_index,["Book-Rating"]] = 0
    
    #Replace the training observations on the validation data
    val_data_clean.loc[train_index,["Book-Rating"]] = 0

    #Create the R matrices
    R_train = make_ratings(train_data_clean)
    R_val = make_ratings(val_data_clean)

    #remove the explicit zeros from the sparse matrices
    R_train.eliminate_zeros()
    R_val.eliminate_zeros()

    return R_train, R_val

ratings_train, ratings_val = make_train_val_split(df_ratings, test_size=test_size)
print(f"After the split we have {ratings_train.nnz:,} ratings in the train set and {ratings_val.nnz:,} ratings in the validation set.")

After the split we have 87,367 ratings in the train set and 21,842 ratings in the validation set.


In [11]:
def get_indices_from_users_to_pred(users_to_pred: pd.DataFrame, data: pd.DataFrame):
    """Get the indices of users_to_pred for which we have data and for which we don't.
    
    Args:
        users_to_pred (pd.DataFrame): DataFrame containing the list of users we are going to recommend items.
        data (pd.DataFrame): Original of listening history for the users.
        
    Returns:
        index_users_in_data (Int64Index): Index that filters the users_to_pred to get the user_id's with training data.
        index_users_not_in_data (Int64Index): Index that filters the users_to_pred to get the user_id's without training data.
        
    """
    index_users_in_data = users_to_pred[users_to_pred.isin(data["User-ID"].values).values].index
    index_users_not_in_data = users_to_pred[~users_to_pred.isin(data["User-ID"].values).values].index
    
    return index_users_in_data, index_users_not_in_data

index_users_in_data, index_users_not_in_data = get_indices_from_users_to_pred(df_test, df_ratings)
print(f"The index for users which we have training data has length of {len(index_users_in_data)}.")
print(f"The index for users which we don't have training data has length of {len(index_users_not_in_data)}.")

The index for users which we have training data has length of 489.
The index for users which we don't have training data has length of 100.


In [12]:
#ratings_user = df_ratings.iloc[index_users_in_data]
#ratings_user

In [13]:
def calculate_similarities(ratings_matrix, similarity_type):
    """
    Get the cosine similarity between users.
    
    Parameters
    ----------
    ratings_matrix : csr_matrix
              Ratings matrix.
              
    similarity_type: str, "users" or "items"

    Returns
    -------
    similarities : csr_matrixb
                        sparse representation of the cosine similarity between users or items.
    """
    if(similarity_type == "users"):
        similarities = cosine_similarity(ratings_matrix, dense_output=False)
    elif(similarity_type == "items"):
        similarities = cosine_similarity(ratings_matrix.T, dense_output=False)
    
    return similarities

user_similarities = calculate_similarities(ratings_train,"users")
user_similarities

<5719x5719 sparse matrix of type '<class 'numpy.float64'>'
	with 759717 stored elements in Compressed Sparse Row format>

## TRIAL

In [15]:
training_users = df_ratings['User-ID'].unique().tolist()
df_users = df_users[df_users['User-ID'].isin(training_users)]

In [16]:
def process_user_data(df_users):
    
    # index
    df_users = df_users.set_index('User-ID')
    
    # vectorize location data
    vectorizer = TfidfVectorizer()
    user_profiles = vectorizer.fit_transform(df_users.Location)
    
    # join age information
    user_similarities = cosine_similarity(user_profiles, dense_output=False)
    
    return user_similarities

In [17]:
user_loc_sim = process_user_data(df_users)

In [18]:
collab_filt_user_preds = make_user_predictions(user_loc_sim, ratings_train)

In [24]:
abs(df_users.Age.values.reshape(-1,1) - df_users.Age.values.reshape(1,-1))**-1

  """Entry point for launching an IPython kernel.


array([[       inf, 0.06666667,        nan, ...,        nan, 0.07142857,
        0.11111111],
       [0.06666667,        inf,        nan, ...,        nan, 0.03448276,
        0.04166667],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       ...,
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [0.07142857, 0.03448276,        nan, ...,        nan,        inf,
        0.2       ],
       [0.11111111, 0.04166667,        nan, ...,        nan, 0.2       ,
               inf]])

In [14]:
def make_user_predictions(S: csr_matrix, R_: csr_matrix):
    """Predict using collaborative filtering.
    
    Args:
        S (csr_matrix): Similarities matrix (tipically using the cosine_similarity).
        R_ (csr_matrix): Ratings matrix.
        
    Returns:
        preds (csr_matrix): Predictions matrix.
    
    """
    weighted_sum = np.dot(S, R_)
    
    # We use the absolute value to support negative similarities.
    # In this particular example there are none.
    sum_of_weights = np.abs(S).sum(axis=1)
    
    preds = weighted_sum / sum_of_weights
    
    # Exclude previously rated items.
    preds[R_.nonzero()] = 0
    
    return csr_matrix(preds)


collab_filt_user_preds = make_user_predictions(user_similarities, ratings_train)
collab_filt_user_preds

  return np.true_divide(self.todense(), other)


<5719x47768 sparse matrix of type '<class 'numpy.float64'>'
	with 30246008 stored elements in Compressed Sparse Row format>

In [28]:
def sparsity(matrix: csr_matrix) -> float:
    """Calculates the sparsity of a matrix.
    
    Args:
        matrix (csr_matrix): Sparse matrix.
        
    Returns:
        sparsity_ (float): Sparsity percentage (between 0 and 1).
    
    """
    return 1 - matrix.nnz / (matrix.shape[0] * matrix.shape[1])

sparsity(collab_filt_user_preds)

0.5209580173730646

In [29]:
def get_top_n(pred, n):
    pred_ = np.negative(pred).toarray()
    return pred_.argsort()[:, :n]


collab_filt_most_rated = get_top_n(collab_filt_user_preds, 10)
collab_filt_most_rated

array([[ 5791,  3950,  7192, ..., 17420, 25509,  7229],
       [23964,  7192, 16034, ...,  1298, 16745, 17420],
       [ 7192,  3950,  5791, ..., 19960, 32475, 17420],
       ...,
       [ 7192, 23964,  3950, ..., 16148, 12896, 16769],
       [ 7192,  5791,  3950, ...,  8005, 16148,  8286],
       [ 7192, 23964,  3950, ..., 13300,  5791, 16034]])

In [30]:
# Let's store a Series with the unique user id's that we have in the original data.
def get_unique_users(data: pd.DataFrame) -> pd.DataFrame:
    """Get unique users in training data.
    
    Args:
        data (pd.DataFrame):  listening history for the users.
        
    Returns:
        unique_users (pd.DataFrame): DataFrame of one column with unique users in training data.
    
    """
    return pd.DataFrame(np.unique(data.iloc[:, 0].values), columns=["users to recommend books"])


unique_users_training_data = get_unique_users(df_ratings)
unique_users_training_data.head()



Unnamed: 0,users to recommend books
0,99
1,114
2,243
3,244
4,254


In [31]:
def convert_pers_recommendations_to_df(pers_recs: np.array, users_to_pred: pd.DataFrame) -> pd.DataFrame:
    """Converts the personalized most rated to an DataFrame with the users and the recommendations.
    
    Args:
        pers_recs (np.array): Array of indices for the best personalized items to recommend.
        users_to_pred (pd.DataFrame): DataFrame containing the users which need recommendations.
        
    Returns:
        non_pers_most_rated_matrix (np.array): Two dimensional array of (n_users, top_n_items)
    
    """
    pers_df = pd.concat([users_to_pred, pd.DataFrame(pers_recs)], axis=1)
    pers_df = pers_df.set_index("users to recommend books")
    
    return pers_df


collab_filt_most_rated_df = convert_pers_recommendations_to_df(collab_filt_most_rated, unique_users_training_data)
collab_filt_most_rated_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
users to recommend books,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
99,5791,3950,7192,7835,23964,23963,1354,17420,25509,7229
114,23964,7192,16034,16148,5791,3950,32475,1298,16745,17420
243,7192,3950,5791,23964,25296,16148,25509,19960,32475,17420
244,7192,23964,3950,22610,16148,8286,5791,16034,25509,16425
254,7192,5791,7229,16148,25509,23964,32475,25296,869,11742


In [32]:
def create_dict_preds(preds_df: pd.DataFrame) -> dict:
    """Convert the predictions DataFrame (index:users -> columns: items) to a dictionary of key (user->list of items).
    
    Args: 
        preds_df (pd.DataFrame): DataFrame containing the users and the ordered predictions.
        
    Returns:
        preds_dict (dict): Dict of (user_id: list of items) used for evaluating the performance.
    
    """
    return {preds_df.index[i]: preds_df.values[i].tolist() for i in range(len(preds_df))}


collab_filt_dict = create_dict_preds(collab_filt_most_rated_df)
# Since dicts in python are not ordered, we need to HAMMER DOWN a way to print some values.
dict(list(collab_filt_dict.items())[0:1])

{99: [5791, 3950, 7192, 7835, 23964, 23963, 1354, 17420, 25509, 7229]}

In [33]:
def get_y_true(R_val_: csr_matrix, users_to_pred: pd.DataFrame, n=100):
    """Get the ground truth (best recommendations) of the users in the validation set.
    
    Args:
        R_val_ (csr_matrix): Validation set ratings matrix.
        users_to_pred: 
        n (int): Number of top-n items.
        
    Returns:
        y_true_df (pd.DataFrame): DataFrame which returns the y_true items.
        
    """
    top_from_R_val = pd.DataFrame(np.negative(R_val_).toarray().argsort()[:, :n])
    y_true_df = pd.concat([users_to_pred, top_from_R_val], axis=1)
    y_true_df = y_true_df.set_index("users to recommend books")
    return y_true_df


y_true_df = get_y_true(ratings_val, unique_users_training_data, n=10)
y_true_dict = create_dict_preds(y_true_df)

evaluate(y_true_dict, collab_filt_dict)

0.006104803814630708

In [34]:
def make_item_predictions(S, R):
    
    weighted_sum = np.dot(R, S)
    
    # We use the absolute value to support negative similarities.
    # In this particular example there are none.
    sum_of_weights = np.abs(S).sum(axis=0)
    
    preds = weighted_sum / sum_of_weights
    
    # Exclude previously rated items.
    preds[R.nonzero()] = 0
    
    return csr_matrix(preds)

In [57]:
content_recs = pd.read_csv('content_based.csv')\
                    .rename(columns={'Unnamed: 0': 'User-ID'})\
                    .set_index('User-ID')\
                    .stack()\
                    .reset_index()\
                    .drop(columns='level_1')\
                    .rename(columns={0: 'ISBN'})


In [58]:
content_recs

Unnamed: 0,User-ID,ISBN
0,114,0786015810
1,114,1881554031
2,114,1592231365
3,114,1584792965
4,114,0812014790
...,...,...
4885,278633,087406452X
4886,278633,0440402255
4887,278633,0966533356
4888,278633,051754489X


In [61]:
content_submission = merge_submissions(non_pers_submission, content_recs)

In [62]:
if validate_submission(content_submission):
    content_submission.to_csv('content_submission.csv', index=False)

In [35]:
def produce_submission(collab_filt_most_rated, df_ratings, test_users):
    
    # indices
    ratings_frame = df_ratings.set_index(['User-ID', 'ISBN']).unstack()
    users = ratings_frame.index.tolist()
    books = ratings_frame.columns.get_level_values(1).tolist()
    
    # replace values
    recs = pd.DataFrame(collab_filt_most_rated, index=users).stack()
    col_to_isbn = {i: isbn for i, isbn in enumerate(books)}
    recs = recs.replace(col_to_isbn)
    
    # filter
    recs.index = recs.index.get_level_values(0).tolist()
    recs = recs[recs.index.isin(test_users)]

    # format frame
    recs = recs.reset_index()
    recs.columns = ['User-ID', 'ISBN']
    
    return recs

In [36]:
pers_submission = produce_submission(collab_filt_most_rated, df_ratings, test_users)

In [37]:
def merge_submissions(non_pers_submission, pers_submission):
    
    combined_submission = pers_submission\
                                .append(non_pers_submission[~non_pers_submission['User-ID']\
                                                            .isin(pers_submission['User-ID'])])\
                                .sort_values('User-ID')

    return combined_submission

In [38]:
merged_submission = merge_submissions(non_pers_submission, pers_submission)

In [39]:
if validate_submission(merged_submission):
    merged_submission.to_csv('combined_submission_locations.csv', index=False)

In [45]:
pers_submission['ISBN'].values == merged_submission['ISBN'].values

  """Entry point for launching an IPython kernel.


False

In [46]:
merged_submission#['ISBN']

Unnamed: 0,User-ID,ISBN
0,114,0312995423
1,114,043935806X
2,114,0066214122
3,114,0671027387
4,114,0312195516
...,...,...
4885,278633,0140293248
4886,278633,0804114986
4887,278633,0345361792
4888,278633,0316284955
