In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

from surprise import KNNBasic, accuracy, Dataset, Reader, SVD
from surprise.model_selection import cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score
from scipy import spatial

import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.feature_selection import chi2

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import precision_recall_fscore_support

## Importación de los datos

In [2]:
dataset_path = 'dataset/'
df_business = pd.read_json(dataset_path + 'business.json', lines=True)

In [3]:
size = 1000000
df_reviews = pd.read_json(dataset_path + 'review.json', lines=True, 
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)

In [4]:
reviews_list = []
for df_review in tqdm(df_reviews):
    df_review = df_review.drop(['review_id','useful','funny','cool'], axis=1)
    df_review = df_review.rename(columns={'stars': 'review_stars'})
    df_review_m = pd.merge(df_business, df_review, on='business_id', how='inner')
    reviews_list.append(df_review_m)
    
df_review = pd.concat(reviews_list, ignore_index=True, join='outer', axis=0)

7it [03:50, 32.94s/it]


In [5]:
del df_review_m, reviews_list

## Sistema de recomendación Usuario-Usuario

In [72]:
def filter_by_user(df_review, user_ids, non_seen_items):
    user_cities = list(df_review[df_review['user_id'].isin(user_ids)]['city'].values)
    user_items = list(set(list(df_review[df_review['user_id'].isin(user_ids)]['business_id'])))
    reviews_user = df_review[(df_review['city'].isin(user_cities)) & (df_review['business_id'].isin(user_items))]
    if non_seen_items:
        user_cities = list(df_review[df_review['user_id'].isin([user_ids[0]])]['city'].values)
        user_items = list(set(list(df_review[df_review['user_id'].isin([user_ids[0]])]['business_id'])))
        non_seen_items = df_review[(df_review['city'].isin(user_cities)) & (df_review['business_id'].isin(user_items) == False)]
        return reviews_user, non_seen_items[['business_id', 'user_id', 'review_stars']]
    else:
        return reviews_user, None

In [73]:
def cosine_similarity(matrix):
    return 1-pairwise_distances(matrix, metric="cosine")

In [74]:
def get_similarity_users(df_reviews_user, user_id):
    
    reviews_user = df_reviews_user[['user_id', 'business_id', 'review_stars']].drop_duplicates()
    review_user_matrix = reviews_user.pivot_table(values='review_stars', index='user_id', columns='business_id').fillna(0)
    idx = list(review_user_matrix.index)
    cosine_sim = cosine_similarity(review_user_matrix)
    cosine_sim_matrix = pd.DataFrame(data = cosine_sim, index = idx, columns = idx)
    user_sim = cosine_sim_matrix.filter(items=[user_id], axis=0)
    most_sim_k_users = user_sim.max().rename_axis('user').reset_index().sort_values(by=0, ascending=False)
    most_sim_k_users.columns = ['user_id', 'similarity']
    
    return most_sim_k_users

In [75]:
def generate_recommendations(df_review, df_business, similarity_users, user_id, K_sim_user, K_rec):
    
    df_reviews_user, df_non_seen_items = filter_by_user(df_review, [user_id], True)
    means_user_ratings = df_reviews_user[['user_id', 'review_stars']].groupby('user_id').mean().rename_axis('user_id').reset_index()
    similarity_users = similarity_users.merge(means_user_ratings, on='user_id', how='left')
    
    top_sim_user = list(similarity_users['user_id'])
    df_items = df_non_seen_items[df_non_seen_items['user_id'].isin(top_sim_user)][['business_id', 'user_id', 'review_stars']]
    df_items = df_items.pivot_table(values='review_stars', index='user_id', columns='business_id').rename_axis('user_id').reset_index()
    
    similarity_users = similarity_users.merge(df_items, on='user_id', how='left').fillna(0)

    df_recommendations = pd.DataFrame(data = unseen_items, columns= ['item'])
    df_recommendations['prediction'] = 0
    
    ra = similarity_users[similarity_users['user_id']==user_id]['review_stars'].values[0]
    cols = similarity_users.columns
    for unseen_item in tqdm(unseen_items):
        if unseen_item in cols:
            sample = similarity_users[similarity_users[unseen_item]!=0].head(K_sim_user+1).tail(K_sim_user)
            num = np.dot(sample['similarity'], (sample[unseen_item]-sample['review_stars']))
            den = sum(sample['similarity'])
            ri = ra + num/den
            df_recommendations.loc[df_recommendations['item']==unseen_item, ['prediction']] = ri
    df_recommendations = df_recommendations.merge(df_business, left_on='item', right_on='business_id', how='left')
    df_recommendations = df_recommendations.sort_values(by='prediction', ascending=False)[['name', 'address', 'city', 'state', 'prediction']]
    return df_recommendations.head(K_rec)

In [76]:
user_id = 'MR_0VqlmaHRBskfq_u9UaA'
K_sim_user = 10
K_rec = 10
similarity_users = get_similarity_users(df_reviews_user, user_id)
df_recommendations = generate_recommendations(df_review, df_business, similarity_users, user_id, K_sim_user, K_rec)
df_recommendations

100%|████████████████████████████████████████████████████████████████████████| 150346/150346 [01:02<00:00, 2401.05it/s]


Unnamed: 0,name,address,city,state,prediction
36944,Apex Home Appliance Repair,,Seminole,FL,8.2
68096,Primi Urban Cafe,27 4th St N,Saint Petersburg,FL,8.2
141584,Jewelry Repair & Stone Cutting,8578 49th St N,Pinellas Park,FL,8.2
148150,All Smiles Orthodontics,8686 131st St N,Seminole,FL,8.2
5352,King and Sons Tree Service,8901 60th St N,Pinellas Park,FL,7.7
146705,HR Trains & Toys,7900 49th St N,Pinellas Park,FL,7.2
74210,Cypress Breeze Farm,8690 60th St N,Pinellas Park,FL,7.2
26426,The Home Depot,10550 Park Blvd,Seminole,FL,7.2
37078,Alterations by Alice,11150 74th Ave,Seminole,FL,7.2
44218,Cuttin Up,"8680 49th St, Ste 2",Pinellas Park,FL,7.2


## Sistema de recomendación basado en Contenido

## Preprocesmiento de las caracteristicas

In [77]:
def transform_features(df_business, reprocess):
    if reprocess:
        df_business = pd.concat([df_business.drop(['attributes'], axis=1), df_business['attributes'].apply(pd.Series)], axis=1)
        categories = [x.split(',') for x in list(df_business['categories']) if x != None]
        categories = list(set([x.lstrip() for x in [item for sublist in categories for item in sublist]]))

        for category in categories:
            column_name = category.replace(' ', '_').replace('-','_').replace('/','_').lower()
            df_business[category] = df_business['categories'].str.contains(category)

        delete_cols = ['address', 'state', 'postal_code', 'latitude', 'longitude', 'categories', 'hours', 'review_count', \
                       'is_open', 'city', 'stars']
        df_business = df_business.drop(columns = delete_cols)

        dict_cols = ['BusinessParking','Ambience','GoodForMeal','Music', 'BestNights', 'HairSpecializesIn', \
                     'DietaryRestrictions']
        new_cols = []
        for dict_col in tqdm(dict_cols):
            new_df = df_business[dict_col].apply(pd.Series)
            new_cols.append(new_df.columns)
            df_business = pd.concat([df_business.drop([dict_col], axis=1), new_df], axis=1)

        unicode_cols = ['Alcohol', 'WiFi', 'RestaurantsAttire', 'NoiseLevel', 'Smoking', 'BYOBCorkage', 'AgesAllowed']
        for unicode_col in tqdm(unicode_cols):
            df_business[unicode_col] = df_business[unicode_col].str.replace('u', '')

        df_business = df_business.replace('True', '1').replace('False', '0').fillna('0')
        df_business = df_business.replace(True, '1').replace(False, '0').replace('None','0')
        df_business = df_business.drop(0, axis=1)

        avoid_cols = ['business_id', 'city'] + dict_cols + unicode_cols
        transform_cols = [x for x in df_business.columns if x not in avoid_cols]
        for col in tqdm(transform_cols):
            df_business[col] = df_business[col].fillna(0)
            df_business[col] = df_business[col].astype(int)

        df_business['WiFi'] = df_business['WiFi'].replace('0', 'no specify')
        df_business['Alcohol'] = df_business['Alcohol'].replace('0', 'no specify')
        df_business['RestaurantsAttire'] = df_business['RestaurantsAttire'].replace(0, 'no specify')
        df_business['NoiseLevel'] = df_business['NoiseLevel'].replace('0', 'no specify')
        df_business['Smoking'] = df_business['Smoking'].replace('0', 'no specify')
        df_business['BYOBCorkage'] = df_business['BYOBCorkage'].replace('0', 'no specify')
        df_business['AgesAllowed'] = df_business['AgesAllowed'].replace('0', 'no specify')

        cat_cols = ['WiFi', 'Alcohol', 'RestaurantsAttire', 'NoiseLevel', 'Smoking', 'BYOBCorkage', 'AgesAllowed']
        df_business = pd.get_dummies(df_business, columns=cat_cols)
    else:
        df_business= pd.read_pickle('df_business_transformed.pkl')
    
    return df_business

In [78]:
K_users = 2
users_id = list(similarity_users.head(K_users+1)['user_id'])
df_reviews_user, df_non_seen_items = filter_by_user(df_review, users_id, True)
df_non_seen_items = df_non_seen_items[['business_id']].drop_duplicates()

df_business_features = transform_features(df_business, reprocess = False)
df_reviews_user = df_reviews_user[['business_id', 'review_stars']].merge(df_business_features, on='business_id', how='left')

### Extracción de caracteristicas mas importantes

In [79]:
cols_X = [x for x in df_reviews_user.columns if x not in ['business_id', 'review_stars', 'city', 'stars']]
X, y = df_reviews_user[cols_X], df_reviews_user['review_stars']

In [80]:
pesos_features, pval= chi2(X, y)
pesos_features = np.nan_to_num(pesos_features)
pesos_features_mask = pesos_features>0
X = X[X.columns[pesos_features_mask]]

### División del conjunto en entrenamiento y prueba

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Construcción del modelo

In [82]:
#del df_review

In [83]:
t1 = time.time()
model = KNeighborsClassifier(2)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
prec, recall, fscore, supp = precision_recall_fscore_support(y_test, y_pred)
message = """ Promedio de Precisión del modelo: {}
Promedio de Cobertura del modelo: {}
Promedio de Medida F del modelo: {}"""
print(message.format(np.mean(prec), np.mean(recall), np.mean(fscore)))

print(time.time() - t1)

 Promedio de Precisión del modelo: 0.22464907110484714
Promedio de Cobertura del modelo: 0.2320454269634622
Promedio de Medida F del modelo: 0.21450141698449596
42.6911985874176


In [84]:
t1 = time.time()
df_non_seen_items = df_non_seen_items.merge(df_business_features, on='business_id', how='left')
X_pred, df_predictions = df_non_seen_items[X.columns], df_non_seen_items[['business_id']]
predictions = model.predict(X_pred)
df_predictions.loc[:, ['review_stars_predicted']] = predictions
print(time.time() - t1)

6.880300283432007


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [85]:
predictions = df_predictions.sort_values(by='review_stars_predicted', ascending=False).head(20)
predictions = predictions.merge(df_business, on='business_id', how='left')[['name', 'address', 'city', 'state', 'review_stars_predicted']]
predictions

Unnamed: 0,name,address,city,state,review_stars_predicted
0,Andi Matheny Acting Studio,2260 1st Ave S,Saint Petersburg,FL,5
1,Wells Fargo Bank,4100 4th St N,Saint Petersburg,FL,5
2,DICK'S Sporting Goods,6775 22nd Ave N,Saint Petersburg,FL,5
3,Public Storage,6543 34th St N,Pinellas Park,FL,5
4,Vapor Vault,"8424 4th St N, Ste L",Saint Petersburg,FL,5
5,Combat Performance and Fitness,"4400 34th St N, Ste 4",Pinellas Park,FL,5
6,Privacy Electronics,5075 Park Blvd,Pinellas Park,FL,5
7,Bob Lee's Tire Company,1631 4th St N,Saint Petersburg,FL,5
8,Device City,7095 66th St N,Pinellas Park,FL,5
9,Coastal Eye Care,8001 US Hwy 19 N,Pinellas Park,FL,5
