In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

from scipy import spatial

import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.feature_selection import chi2

## Importación de los datos

In [2]:
def load_dataset(dataset_path='dataset/', chunksize=1000000):
    """Load the business table and the reviews joined with their business.

    ``review.json`` is read in chunks so the whole file never has to be parsed
    at once; each chunk is trimmed, inner-joined with the business table on
    ``business_id`` and the joined chunks are concatenated at the end.

    Parameters
    ----------
    dataset_path : str or path-like, optional
        Directory containing ``business.json`` and ``review.json`` in JSON
        Lines format. A trailing separator is optional. Defaults to
        ``'dataset/'``, the location this notebook originally hard-coded.
    chunksize : int, optional
        Number of review lines parsed per chunk. Defaults to 1,000,000.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df_review``: one row per review whose ``business_id`` exists in the
        business table, carrying the business columns plus ``user_id``,
        ``review_stars``, ``date`` and ``text``.
        ``df_business``: the business table as read from disk.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``review.json`` yields no chunks.
    """
    # Function-scope import so this cell does not depend on an edit to the
    # notebook's import cell.
    import os

    df_business = pd.read_json(os.path.join(dataset_path, 'business.json'), lines=True)

    review_chunks = pd.read_json(os.path.join(dataset_path, 'review.json'), lines=True,
                                 dtype={'review_id': str, 'user_id': str,
                                        'business_id': str, 'stars': int,
                                        'date': str, 'text': str, 'useful': int,
                                        'funny': int, 'cool': int},
                                 chunksize=chunksize)

    merged_chunks = []
    for chunk in tqdm(review_chunks):
        # Drop the columns that are not used downstream. The rating is renamed
        # to `review_stars`, presumably to avoid clashing with a business-level
        # `stars` column during the merge (TODO confirm against business.json).
        chunk = chunk.drop(['review_id', 'useful', 'funny', 'cool'], axis=1)
        chunk = chunk.rename(columns={'stars': 'review_stars'})
        merged_chunks.append(pd.merge(df_business, chunk, on='business_id', how='inner'))

    df_review = pd.concat(merged_chunks, ignore_index=True, join='outer', axis=0)
    return df_review, df_business

## Sistema de recomendación Usuario-Usuario

In [3]:
def filter_by_user(df_review, user_ids, non_seen_items):
    """Select the reviews relevant to a set of users and, optionally, the unseen ones.

    Parameters
    ----------
    df_review : pandas.DataFrame
        Reviews with at least ``user_id``, ``business_id``, ``city`` and
        ``review_stars`` columns.
    user_ids : list
        Users whose cities and reviewed businesses define the selection.
    non_seen_items : bool
        When truthy, also return the reviews of businesses that the first
        user in ``user_ids`` has not reviewed, restricted to that user's cities.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame or None)
        All reviews (by anyone) of the businesses the given users reviewed,
        restricted to the cities those users reviewed in; and either the
        unseen reviews as ``business_id``/``user_id``/``review_stars`` or
        ``None`` when ``non_seen_items`` is falsy.
    """
    def _cities_and_businesses(ids):
        # Cities and businesses appearing in the reviews written by `ids`.
        authored = df_review[df_review['user_id'].isin(ids)]
        return authored['city'].unique(), authored['business_id'].unique()

    cities, businesses = _cities_and_businesses(user_ids)
    in_scope = df_review['city'].isin(cities) & df_review['business_id'].isin(businesses)
    reviews_user = df_review[in_scope]

    if not non_seen_items:
        return reviews_user, None

    # Unseen items are always computed for the first user only.
    target_cities, target_businesses = _cities_and_businesses([user_ids[0]])
    same_city = df_review['city'].isin(target_cities)
    already_seen = df_review['business_id'].isin(target_businesses)
    unseen_reviews = df_review[same_city & ~already_seen]
    return reviews_user, unseen_reviews[['business_id', 'user_id', 'review_stars']]

In [4]:
def cosine_similarity(matrix):
    """Return the pairwise cosine similarity between the rows of ``matrix``.

    The similarity is obtained as one minus the cosine distance computed by
    ``pairwise_distances``; the result is a square array with one row and
    one column per row of ``matrix``.
    """
    cosine_distances = pairwise_distances(matrix, metric="cosine")
    return 1 - cosine_distances

In [5]:
def get_similarity_users(df_reviews_user, user_id):
    """Rank every user in ``df_reviews_user`` by cosine similarity to ``user_id``.

    A user x business rating matrix is built from the reviews (exact duplicate
    rows are dropped, remaining repeated user/business pairs are averaged by
    ``pivot_table``, missing ratings are filled with 0). Only the similarity
    between the target user's row and every other row is computed, so the cost
    is linear in the number of users instead of building the full
    users x users similarity matrix and discarding all but one row.

    Parameters
    ----------
    df_reviews_user : pandas.DataFrame
        Reviews with ``user_id``, ``business_id`` and ``review_stars`` columns.
    user_id : str
        The user to compare everyone against.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``similarity``, sorted by decreasing
        similarity. The target user is included with similarity 1.0.

    Raises
    ------
    KeyError
        If ``user_id`` has no reviews in ``df_reviews_user``.
    """
    reviews_user = df_reviews_user[['user_id', 'business_id', 'review_stars']].drop_duplicates()
    review_user_matrix = reviews_user.pivot_table(values='review_stars', index='user_id', columns='business_id').fillna(0)

    if user_id not in review_user_matrix.index:
        # Fail loudly: silently returning NaN similarities would only surface
        # later as meaningless recommendations.
        raise KeyError("user_id %r has no reviews in df_reviews_user" % (user_id,))

    ratings = review_user_matrix.values.astype(float)
    target_pos = review_user_matrix.index.get_loc(user_id)

    norms = np.linalg.norm(ratings, axis=1)
    scale = norms * norms[target_pos]
    dots = np.dot(ratings, ratings[target_pos])
    # All-zero rows have no direction; give them similarity 0 instead of NaN.
    similarity = np.divide(dots, scale, out=np.zeros_like(dots), where=scale != 0)
    # Clamp rounding noise to the valid cosine range and pin the self-similarity.
    similarity = np.clip(similarity, -1.0, 1.0)
    similarity[target_pos] = 1.0

    most_sim_k_users = pd.DataFrame({'user_id': list(review_user_matrix.index),
                                     'similarity': similarity})
    return most_sim_k_users.sort_values(by='similarity', ascending=False)

In [12]:
def generate_recommendations(df_review, df_business, df_non_seen_items, similarity_users, user_id, K_sim_user, K_rec):
    """Predict ratings for unseen businesses and return the top recommendations.

    For every unseen business rated by at least one similar user, the
    prediction is the target user's mean rating plus the similarity-weighted
    average of the neighbours' mean-centred ratings, using the ``K_sim_user``
    most similar users that rated that business. Businesses no similar user
    rated, or whose neighbours all have zero similarity, keep a prediction of 0.

    Parameters
    ----------
    df_review : pandas.DataFrame
        Reviews with ``user_id`` and ``review_stars`` columns; each user's mean
        rating is computed from this frame.
    df_business : pandas.DataFrame
        Business table with ``business_id``, ``name``, ``address``, ``city``
        and ``state`` columns.
    df_non_seen_items : pandas.DataFrame
        Reviews of businesses the target user has not rated, with
        ``business_id``, ``user_id`` and ``review_stars`` columns.
    similarity_users : pandas.DataFrame
        ``user_id`` / ``similarity`` rows sorted by decreasing similarity; must
        contain the target user.
    user_id : str
        The user to recommend for.
    K_sim_user : int
        Maximum number of neighbours used per prediction.
    K_rec : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``K_rec`` rows with ``name``, ``address``, ``city``, ``state``
        and ``prediction``, sorted by decreasing prediction.

    Raises
    ------
    IndexError
        If ``user_id`` is not present in ``similarity_users``.
    """
    # Use the frame that was passed in: reading a notebook-level global here
    # made the function depend on hidden kernel state.
    means_user_ratings = df_review[['user_id', 'review_stars']].groupby('user_id').mean().rename_axis('user_id').reset_index()
    similarity_users = similarity_users.merge(means_user_ratings, on='user_id', how='left')

    top_sim_user = list(similarity_users['user_id'])
    df_items = df_non_seen_items[df_non_seen_items['user_id'].isin(top_sim_user)][['business_id', 'user_id', 'review_stars']]
    df_items = df_items.pivot_table(values='review_stars', index='user_id', columns='business_id').rename_axis('user_id').reset_index()

    # After this merge each unseen business is a column; 0 means "not rated".
    similarity_users = similarity_users.merge(df_items, on='user_id', how='left').fillna(0)

    unseen_items = list(df_non_seen_items['business_id'].drop_duplicates())

    ra = similarity_users[similarity_users['user_id'] == user_id]['review_stars'].values[0]

    # Exclude the target user explicitly. The previous head(K+1).tail(K) trick
    # assumed the target was the first rater of every item, but the target
    # never rates an unseen item, so it dropped the most similar neighbour.
    neighbours = similarity_users[similarity_users['user_id'] != user_id]
    rated_items = set(similarity_users.columns)

    predictions = {}
    for unseen_item in tqdm(unseen_items):
        if unseen_item not in rated_items:
            continue
        sample = neighbours[neighbours[unseen_item] != 0].head(K_sim_user)
        den = sample['similarity'].sum()
        if den == 0:
            # No usable neighbour: leave the default instead of producing NaN.
            continue
        num = np.dot(sample['similarity'], (sample[unseen_item] - sample['review_stars']))
        predictions[unseen_item] = ra + num / den

    # Build the prediction column in one pass (float from the start) instead
    # of rewriting it row by row through a boolean mask.
    df_recommendations = pd.DataFrame(data=unseen_items, columns=['item'])
    df_recommendations['prediction'] = [float(predictions.get(item, 0.0)) for item in unseen_items]

    df_recommendations = df_recommendations.merge(df_business, left_on='item', right_on='business_id', how='left')
    df_recommendations = df_recommendations.sort_values(by='prediction', ascending=False)[['name', 'address', 'city', 'state', 'prediction']]
    return df_recommendations.head(K_rec)

In [None]:
# Run configuration.
user_id = 'MR_0VqlmaHRBskfq_u9UaA'  # user to generate recommendations for
K_sim_user = 10                     # neighbours used per prediction
K_rec = 10                          # recommendations to keep

# Load the data, narrow it down to the reviews relevant to the target user
# (plus the reviews of businesses that user has not rated), then rank the
# remaining users by similarity to the target.
df_review, df_business = load_dataset()
df_reviews_user, df_non_seen_items = filter_by_user(
    df_review, user_ids=[user_id], non_seen_items=True
)
similarity_users = get_similarity_users(df_reviews_user=df_reviews_user, user_id=user_id)

In [13]:
# The mean-rating step of generate_recommendations works on the filtered
# reviews (df_reviews_user), the same frame the similarities were computed
# from, so hand that frame in explicitly as the first argument.
df_recommendations = generate_recommendations(
    df_reviews_user, df_business, df_non_seen_items, similarity_users,
    user_id, K_sim_user, K_rec,
)

100%|█████████████████████████████████████████████████████████████████████████████| 2529/2529 [00:05<00:00, 493.47it/s]
