In [11]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

from sklearn.model_selection import train_test_split
from scipy import spatial

import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.feature_selection import chi2

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import precision_recall_fscore_support

## Importación de los datos

In [2]:
def load_dataset():
    """Load the business and review JSON-lines files and join them.

    ``dataset/business.json`` is read in one go. ``dataset/review.json`` is
    streamed in chunks of 1,000,000 rows; from each chunk the columns
    ``review_id``, ``useful``, ``funny`` and ``cool`` are dropped, ``stars``
    is renamed to ``review_stars`` and the chunk is inner-joined with the
    business table on ``business_id``.

    Returns:
        tuple: ``(df_review, df_business)`` where ``df_review`` is the
        concatenation of all merged chunks (fresh integer index) and
        ``df_business`` is the business table as read from disk.
    """
    base_dir = 'dataset/'
    df_business = pd.read_json(base_dir + 'business.json', lines=True)

    rows_per_chunk = 1000000
    review_dtypes = {'review_id': str, 'user_id': str,
                     'business_id': str, 'stars': int,
                     'date': str, 'text': str, 'useful': int,
                     'funny': int, 'cool': int}
    review_reader = pd.read_json(base_dir + 'review.json', lines=True,
                                 dtype=review_dtypes,
                                 chunksize=rows_per_chunk)

    unused_cols = ['review_id', 'useful', 'funny', 'cool']
    # Merge chunk by chunk so the full review file is never held unmerged.
    merged_chunks = [
        pd.merge(
            df_business,
            chunk.drop(unused_cols, axis=1).rename(columns={'stars': 'review_stars'}),
            on='business_id',
            how='inner',
        )
        for chunk in tqdm(review_reader)
    ]

    df_review = pd.concat(merged_chunks, ignore_index=True, join='outer', axis=0)
    return df_review, df_business

In [3]:
def filter_by_user(df_review, user_ids, non_seen_items):
    """Select the reviews relevant to a group of users.

    Args:
        df_review: Review table with at least the columns ``user_id``,
            ``city``, ``business_id`` and ``review_stars``.
        user_ids: List of user ids. The first entry is treated as the
            target user when ``non_seen_items`` is truthy.
        non_seen_items: When truthy, also return the candidate reviews the
            first user has not reviewed.

    Returns:
        tuple: ``(reviews_user, candidates)``. ``reviews_user`` holds every
        review whose city is one the given users reviewed in and whose
        business is one the given users reviewed. ``candidates`` is ``None``
        when ``non_seen_items`` is falsy; otherwise it holds the
        ``business_id``, ``user_id`` and ``review_stars`` columns of every
        review located in a city of the first user for a business that user
        has not reviewed.
    """
    def _footprint(ids):
        # Cities and distinct businesses appearing in the reviews of `ids`.
        own_rows = df_review[df_review['user_id'].isin(ids)]
        return list(own_rows['city'].values), list(set(own_rows['business_id']))

    group_cities, group_items = _footprint(user_ids)
    in_group_scope = df_review['city'].isin(group_cities) & df_review['business_id'].isin(group_items)
    reviews_user = df_review[in_group_scope]

    if not non_seen_items:
        return reviews_user, None

    target_cities, target_items = _footprint([user_ids[0]])
    is_candidate = df_review['city'].isin(target_cities) & ~df_review['business_id'].isin(target_items)
    candidates = df_review[is_candidate]
    return reviews_user, candidates[['business_id', 'user_id', 'review_stars']]

In [4]:
def cosine_similarity(matrix):
    """Return the pairwise cosine similarity between the rows of ``matrix``.

    Computed as one minus the cosine distance given by
    ``pairwise_distances(matrix, metric="cosine")``.
    """
    cosine_dist = pairwise_distances(matrix, metric="cosine")
    return 1 - cosine_dist

In [5]:
def get_similarity_users(df_reviews_user, user_id):
    """Rank users by cosine similarity of their ratings to ``user_id``.

    Args:
        df_reviews_user: Review table containing ``user_id``,
            ``business_id`` and ``review_stars``.
        user_id: Id of the user every other user is compared against.

    Returns:
        DataFrame with columns ``user_id`` and ``similarity``, sorted by
        ``similarity`` in descending order.
    """
    ratings = df_reviews_user[['user_id', 'business_id', 'review_stars']].drop_duplicates()

    # User x business rating matrix; businesses a user has not rated count as 0.
    user_item = ratings.pivot_table(
        values='review_stars',
        index='user_id',
        columns='business_id',
    ).fillna(0)

    labels = list(user_item.index)
    sim_frame = pd.DataFrame(data=cosine_similarity(user_item), index=labels, columns=labels)

    # Keep only the row of the requested user and collapse it to one value per user.
    target_row = sim_frame.filter(items=[user_id], axis=0)
    ranked = target_row.max().rename_axis('user').reset_index()
    ranked = ranked.sort_values(by=0, ascending=False)
    ranked.columns = ['user_id', 'similarity']

    return ranked

## Sistema de recomendación basado en Contenido

## Preprocesamiento de las características

In [6]:
def transform_business_features(df_business, reprocess):
    """Build the business feature table used by the content-based model.

    Args:
        df_business: Raw business table. Only read when ``reprocess`` is
            truthy; it must contain an ``attributes`` column (expanded into
            one column per attribute) and a ``categories`` column of
            comma-separated labels.
        reprocess: When truthy, recompute the features from ``df_business``.
            Otherwise load the cached result from
            ``df_business_transformed.pkl`` in the working directory.

    Returns:
        DataFrame: the transformed business table. Category and remaining
        attribute columns are cast to ``int``; the categorical attributes
        listed in ``cat_cols`` are one-hot encoded with ``pd.get_dummies``.
    """
    if not reprocess:
        # NOTE(review): unpickling can execute arbitrary code; only load a
        # cache file that was produced locally by this notebook.
        return pd.read_pickle('df_business_transformed.pkl')

    # Expand the nested 'attributes' mapping into one column per attribute.
    df_business = pd.concat([df_business.drop(['attributes'], axis=1),
                             df_business['attributes'].apply(pd.Series)], axis=1)

    # Distinct category labels. Sorted so the resulting column order is
    # reproducible between runs; empty labels are skipped because an empty
    # pattern would match every row.
    categories = sorted({label.strip()
                         for raw in df_business['categories'].dropna()
                         for label in raw.split(',')
                         if label.strip()})

    for category in categories:
        # regex=False: labels such as "American (New)" contain regex
        # metacharacters and would otherwise never match themselves.
        # NOTE(review): this is still substring matching, so "Bars" also
        # flags "Sushi Bars" - confirm that is intended.
        df_business[category] = df_business['categories'].str.contains(category, regex=False)

    delete_cols = ['address', 'state', 'postal_code', 'latitude', 'longitude', 'categories', 'hours', 'review_count',
                   'is_open', 'city', 'stars']
    df_business = df_business.drop(columns=delete_cols)
    # 'name' is free text: it cannot be cast to int below and is not a
    # usable feature, so drop it when present.
    df_business = df_business.drop(columns=['name'], errors='ignore')

    dict_cols = ['BusinessParking', 'Ambience', 'GoodForMeal', 'Music', 'BestNights', 'HairSpecializesIn',
                 'DietaryRestrictions']
    for dict_col in tqdm(dict_cols):
        expanded = df_business[dict_col].apply(pd.Series)
        df_business = pd.concat([df_business.drop([dict_col], axis=1), expanded], axis=1)

    unicode_cols = ['Alcohol', 'WiFi', 'RestaurantsAttire', 'NoiseLevel', 'Smoking', 'BYOBCorkage', 'AgesAllowed']
    for unicode_col in tqdm(unicode_cols):
        # Strip only a leading Python-2 style u prefix (u'free' -> 'free').
        # Removing every "u" would corrupt values such as "quiet" or "loud".
        df_business[unicode_col] = df_business[unicode_col].str.replace("^u(?=['\"])", '', regex=True)

    df_business = df_business.replace('True', '1').replace('False', '0').fillna('0')
    df_business = df_business.replace(True, '1').replace(False, '0').replace('None', '0')
    # Column(s) labelled 0 are produced by apply(pd.Series) on values that
    # are not mappings; tolerate their absence instead of raising.
    df_business = df_business.drop(0, axis=1, errors='ignore')

    avoid_cols = ['business_id', 'city'] + dict_cols + unicode_cols
    transform_cols = [x for x in df_business.columns if x not in avoid_cols]
    for col in tqdm(transform_cols):
        df_business[col] = df_business[col].fillna(0).astype(int)

    cat_cols = ['WiFi', 'Alcohol', 'RestaurantsAttire', 'NoiseLevel', 'Smoking', 'BYOBCorkage', 'AgesAllowed']
    for cat_col in cat_cols:
        # Missing values were filled with the string '0' above, so match the
        # string (not the integer 0) for every categorical column.
        df_business[cat_col] = df_business[cat_col].replace('0', 'no specify')

    df_business = pd.get_dummies(df_business, columns=cat_cols)

    return df_business

In [7]:
def extract_user_features(df_review, df_business, similarity_users, user_id, K_users):
    """Assemble training data and candidate items for one user.

    Args:
        df_review: Review table passed on to ``filter_by_user``.
        df_business: Business table passed on to
            ``transform_business_features`` (called with ``reprocess=False``).
        similarity_users: DataFrame with a ``user_id`` column ordered from
            most to least similar.
        user_id: Target user the recommendations are for.
        K_users: Number of similar users (besides the target) whose reviews
            are used as training data.

    Returns:
        tuple: ``(X, y, X_rec, y_rec)`` where ``X``/``y`` are the selected
        business features and ``review_stars`` of the users' reviews, and
        ``X_rec``/``y_rec`` are the same feature columns and the
        ``business_id`` column of the candidate businesses.
    """
    # filter_by_user treats the first id as the target user when computing
    # the non-seen items, so place `user_id` first explicitly rather than
    # relying on it winning every tie in the similarity ranking.
    ranked_ids = list(similarity_users['user_id'])
    neighbour_ids = [uid for uid in ranked_ids if uid != user_id][:K_users]
    users_id = [user_id] + neighbour_ids

    df_reviews_user, df_non_seen_items = filter_by_user(df_review, users_id, True)
    df_non_seen_items = df_non_seen_items[['business_id']].drop_duplicates()

    df_business_features = transform_business_features(df_business, reprocess=False)
    df_reviews_user = df_reviews_user[['business_id', 'review_stars']].merge(
        df_business_features, on='business_id', how='left')

    cols_X = [x for x in df_reviews_user.columns if x not in ['business_id', 'review_stars', 'city', 'stars']]
    X, y = df_reviews_user[cols_X], df_reviews_user['review_stars']

    # Keep only features with a positive chi-squared score against the
    # rating; NaN scores are mapped to 0 by nan_to_num and therefore dropped.
    feature_scores, _ = chi2(X, y)
    feature_scores = np.nan_to_num(feature_scores)
    X = X[X.columns[feature_scores > 0]]

    df_non_seen_items = df_non_seen_items.merge(df_business_features, on='business_id', how='left')
    X_rec, y_rec = df_non_seen_items[X.columns], df_non_seen_items[['business_id']]

    return X, y, X_rec, y_rec

### Extracción de las características más importantes

In [17]:
def generate_recommendations(X, y, X_rec, y_rec, K_rec, business_df=None, n_neighbors=2):
    """Fit a KNN classifier on the rated items and score the candidates.

    Args:
        X: Feature matrix of the reviewed businesses.
        y: Star ratings aligned with ``X``.
        X_rec: Feature matrix of the candidate businesses.
        y_rec: ``business_id`` values aligned with ``X_rec``.
        K_rec: Number of recommendations to return.
        business_df: Business table used to look up ``name``, ``address``,
            ``city`` and ``state``. Defaults to the notebook-level
            ``df_business`` so existing calls keep working.
        n_neighbors: Number of neighbours for ``KNeighborsClassifier``.

    Returns:
        DataFrame with the ``K_rec`` candidates of highest predicted rating
        and the columns ``name``, ``address``, ``city``, ``state`` and
        ``review_stars_predicted``.

    Side effects:
        Prints the mean per-class precision, recall and F-score measured on
        a stratified 20% hold-out split.
    """
    if business_df is None:
        # Backward-compatible fallback: earlier callers relied on the
        # global `df_business` defined by the notebook.
        business_df = df_business

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    model = KNeighborsClassifier(n_neighbors)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    prec, recall, fscore, supp = precision_recall_fscore_support(y_test, y_pred)
    message = """ Promedio de Precisión del modelo: {} \n Promedio de Cobertura del modelo: {} \n Promedio de Medida F del modelo: {}"""
    print(message.format(np.mean(prec), np.mean(recall), np.mean(fscore)))

    # Build an independent frame and add the predictions with a plain column
    # assignment instead of enlarging through .loc with a missing label.
    df_pred = pd.DataFrame(data=y_rec, columns=['business_id'], copy=True)
    df_pred['review_stars_predicted'] = model.predict(X_rec)

    df_pred = df_pred.sort_values(by='review_stars_predicted', ascending=False).head(K_rec)
    df_pred = df_pred.merge(business_df, on='business_id', how='left')[
        ['name', 'address', 'city', 'state', 'review_stars_predicted']]
    return df_pred

In [9]:
# Target user the recommendations are generated for.
user_id = 'MR_0VqlmaHRBskfq_u9UaA'
# Number of similar users requested from the similarity ranking.
K_sim_user = 10
# Number of recommendations to return.
K_rec = 10

# Reviews already joined with their business row, plus the raw business table.
df_review, df_business = load_dataset()
# Reviews in the target user's cities on businesses that user reviewed, and
# the reviews of businesses in those cities the user has not reviewed.
df_reviews_user, df_non_seen_items = filter_by_user(df_review, [user_id], True)
# Users of that subset ranked by cosine similarity to the target user.
similarity_users = get_similarity_users(df_reviews_user, user_id)

7it [03:49, 32.74s/it]


In [18]:
# Training features/labels (X, y) and the candidate items to score (X_rec, y_rec).
X, y, X_rec, y_rec = extract_user_features(df_review, df_business, similarity_users, user_id, K_sim_user)

# Fits the classifier, prints its hold-out metrics and returns the top K_rec candidates.
predictions = generate_recommendations(X, y, X_rec, y_rec, K_rec)

 Promedio de Precisión del modelo: 0.24408017562438547
    Promedio de Cobertura del modelo: 0.24876622800098577
    Promedio de Medida F del modelo: 0.22868379989176826
