## **PROJECT NAME**
##### *PREDICTION ENGINE - RESTAURANTS*

**GROUP DETAILS:** GROUP 15

**TEAM MEMBERS**

| Name                               	| ID       	|
|:------------------------------------	|:---------:|
| Prashant Arya                        	| 12010011 	|
| Abhilash Gadepalli                	| 12010078 	|
| Debjit Ray                         	| 12010066 	|

#### Import necessary libraries

In [1]:
import os
import itertools
import random
from functools import reduce

import numpy as np
import pandas as pd

from surprise import Reader, Dataset
from surprise.prediction_algorithms.knns import KNNWithMeans

from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.decomposition import NMF

In [2]:
os.chdir("RCData")

In [3]:
# import files
chefmozaccepts = pd.read_csv("chefmozaccepts.csv")
chefmozcuisine = pd.read_csv("chefmozcuisine.csv")
chefmozhours = pd.read_csv("chefmozhours4.csv")
geoplaces = pd.read_csv("geoplaces2.csv", encoding = "latin-1")

rating = pd.read_csv("rating_final.csv")

usercuisine = pd.read_csv("usercuisine.csv")
userpayment = pd.read_csv("userpayment.csv")
user_profile = pd.read_csv("userprofile.csv")

In [4]:
# get an overall rating by combining the 3 existing ratings
rating["overall_rating"] = rating.apply(lambda rating: rating["food_rating"] + rating["service_rating"] + rating["rating"], 
                                        axis = 1)

In [5]:
# get a user x restaurant (item) matrix
# rating range is from 0 to 6; add offset of 10 and replace missing values with 0

userPlaceMatrix = rating.pivot(index = "userID", columns = "placeID", values = "overall_rating")

userPlaceMatrix = userPlaceMatrix.add(10)
userPlaceMatrix = userPlaceMatrix.fillna(0)

# Hybrid Model

In [6]:
def hybrid_model(rating):
    
    """
    Predicts ratings based on 4 methods: user-based collaborative filtering, item-based collaborative filtering, a weighted
    average of both types of filtering, and a combination of either type based on the number of ratings for a user.
    
    Accepts a DataFrame with existing rating at a user-item level.
    """
    
    # read data
    reader = Reader(rating_scale = (0, 6))
    data = Dataset.load_from_df(rating.filter(["userID", "placeID", "overall_rating"]), reader)
    
    # number of ratings for each user
    user_n_ratings = rating.groupby(["userID"]).size().rename("n_ratings").to_frame().sort_values("n_ratings")
    
    # all possible user-item combinations
    all_user_item_combo = pd.DataFrame(itertools.product(rating["userID"].unique(), rating["placeID"].unique()), 
                                   columns = ["userID", "placeID"])
    
    # unrated user item combinations
    unrated_user_item_combo = all_user_item_combo.merge(rating, on = ["userID", "placeID"], how = "left")
    unrated_user_item_combo = unrated_user_item_combo.loc[unrated_user_item_combo.isna().any(axis = 1), 
                                                          ["userID", "placeID"]]
    # convert data to train set
    trainset_full = data.build_full_trainset()
    
    # user-based collaborative filtering
    sim_options = {"name": "cosine", "user_based": True}
    knnb_u = KNNWithMeans(sim_options = sim_options)
    _ = knnb_u.fit(trainset_full)
    predictions_knnb_u = unrated_user_item_combo.apply(lambda row: knnb_u.predict(row[0], row[1]), axis = 1).tolist()

    # item-based collaborative filtering
    sim_options = {"name": "cosine", "user_based": False}
    knnb_i = KNNWithMeans(sim_options = sim_options)
    _ = knnb_i.fit(trainset_full)
    predictions_knnb_i = unrated_user_item_combo.apply(lambda row: knnb_i.predict(row[0], row[1]), axis = 1).tolist()

    # flag to indicate if recommendation is user based or item based
    user_n_ratings["CF_type"] = user_n_ratings["n_ratings"].apply(lambda n: "U" if n > 2 else "I")

    # merge datasets of both types of recommendations
    user_item_hybrid = pd.merge(pd.DataFrame(predictions_knnb_i), pd.DataFrame(predictions_knnb_u), 
                                on = ["uid", "iid", "r_ui"], how = "inner", suffixes = ("_item", "_user"))
    user_item_hybrid.drop(["details_item", "details_user"], axis = 1, inplace = True)

    user_item_hybrid_flag = user_item_hybrid.merge(user_n_ratings, left_on = "uid", right_on = "userID")
    
    # get weighted prediction based on flag (assigned earlier)
    user_item_hybrid_flag["weighted_prediction"] = user_item_hybrid_flag.apply(lambda row: row["est_item"]/4 + row["est_user"]*0.75 
                                                                               if row["CF_type"] == "U" 
                                                                               else row["est_item"]*0.75 + row["est_user"]/4, 
                                                                               axis = 1)

    # get either user-based prediction or item-based prediction based ont flag
    user_item_hybrid_flag["non_weighted_prediction"] = user_item_hybrid_flag.apply(lambda row: row["est_user"] 
                                                                                   if row["CF_type"] == "U" else row["est_item"],
                                                                                   axis = 1)
    
    user_item_hybrid_flag.rename(columns = {"uid": "userID", "iid": "placeID"}, inplace = True)
    user_item_hybrid_flag.drop(["r_ui"], axis = 1, inplace = True)
    
    return user_item_hybrid_flag

In [7]:
hybrid_model_results = hybrid_model(rating)

Computing the cosine similarity matrix...
Done computing similarity matrix.


  sim = construction_func[name](*args)


Computing the cosine similarity matrix...
Done computing similarity matrix.


# Cosine Prediction Model

In [8]:
class DataScaler(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.sc = StandardScaler()
        self.scaler = None
        self.toscale_feature_names = feature_names

    def fit(self, tmpDF, display = True):
        if display:
            print ("StandardScaler: Fitting the scaler...")
        toscale_features = tmpDF[self.toscale_feature_names]
        self.scaler = self.sc.fit(toscale_features.values)
        return self

    def transform(self, tmpDF, display = True):
        if display:
            print ("StandardScaler: Scaling the data...")
        X = tmpDF.copy()
        toscale_features = X[self.toscale_feature_names]
        features = self.scaler.transform(toscale_features.values)
        X[self.toscale_feature_names] = features
        return pd.DataFrame(X)


In [21]:
def fncFindSim(tmpDF, fieldName):

    """
    Returns the Cosine similarity scores for each pair of entities (userID or placeID)
    
    Accepts a dataframe of user x attributes matrix and then based on all the attributes calculates the similarities between
    different users.
    """
    
    pairedDF = pd.DataFrame(columns = ['Entity_1', 'Entity_2', 'attr_1', 'attr_2', 'Cosine_Sim'])
    
    for currRowIdx in range(len(tmpDF)):
        dict1 = {}
        entity_1 = tmpDF.iloc[currRowIdx][fieldName]
        attr_1 = tmpDF.iloc[currRowIdx,1:].values.tolist()
        
        for offset in range(len(tmpDF)):
            entity_2 = tmpDF.iloc[offset][fieldName]
            attr_2 = tmpDF.iloc[offset,1:].values.tolist()
            Cosine_Sim = np.dot(attr_1, attr_2)/(np.linalg.norm(attr_1)*np.linalg.norm(attr_2))
            dict1 = {'Entity_1': entity_1, 'Entity_2': entity_2, 'attr_1': attr_1, 'attr_2': attr_2, 'Cosine_Sim': Cosine_Sim}
            pairedDF = pairedDF.append(dict1, ignore_index = True) 
            
    return pairedDF

In [10]:
def cosine_prediction(userPlaceMatrix, userProfile):

    """
    Returns the Cosine similarity based prediction for all places for each user
    
    Accepts a user x item matrix with imputed missing values    
    """

    # Convert all categorical variables to dummy variables using one hot encoding.
    fields =  ['smoker', 'drink_level', 'dress_preference', 'ambience', 'transport', 'marital_status', 'hijos', 'interest', 'personality', 'religion', 'activity', 'color', 'budget']
    for fieldName in fields:
        dummies = pd.get_dummies(userProfile[fieldName], prefix=fieldName, drop_first=True)
        userProfile = pd.concat([userProfile, dummies], axis=1)
        #Drop the original columns
        userProfile.drop(fieldName, axis=1, inplace = True)

    # Use the scaler to scale the original numeric fields
    userScaler = DataScaler(feature_names = ['latitude', 'longitude','birth_year', 'weight', 'height'])
    userScaler.fit(userProfile)
    userProfile = userScaler.transform(userProfile)

    # Calculate the Cosine similarity between each pair of UserID's
    userPairDF = fncFindSim(userProfile, 'userID')
    userSimilarityMatrix = userPairDF.pivot(index = "Entity_1", columns = "Entity_2", values = 'Cosine_Sim')

    # Scale the user-place ratings matrix
    scaler = StandardScaler()
    scaleduserPlaceMatrix = pd.DataFrame(scaler.fit_transform(userPlaceMatrix), 
                                         index = userPlaceMatrix.index, columns = userPlaceMatrix.columns)

    # Calculate the predictive rating based on Cosine similarity matrix and scaled user place rating matrix
    userBasedrating = np.dot(userSimilarityMatrix, scaleduserPlaceMatrix)
    userBasedrating = pd.DataFrame(userBasedrating,
                                   index = userPlaceMatrix.index, columns = userPlaceMatrix.columns)

    # Initiate a weight matrix with random weights
    random.seed(10)
    weights = pd.DataFrame(np.random.rand(userBasedrating.shape[1], userBasedrating.shape[1]))

    # Parameters to tune
    totErr = 0
    alpha =  0.02
    beta = 0.0002
    max_iter = 5000
    iter_cnt = 0

    # Calculate the error matrix by subtracting the original scaled user-place ratings matrix and the calculated rating
    # based on the cosine similarity between users and the past ratings available
    errMatrix = np.subtract(np.dot(userBasedrating, weights.transpose()), scaleduserPlaceMatrix)
    totErr = np.sum(np.sum(errMatrix))
    prevErr = 999999

    # Try to reduce the error by updating the weights
    while iter_cnt < max_iter: 
        if ((totErr <= 500) and (totErr >= -500) or (prevErr == totErr)):
            break
        weights = beta + (alpha * weights)
        errMatrix = np.subtract(np.dot(userBasedrating, weights.transpose()), scaleduserPlaceMatrix)
        prevErr = totErr
        totErr = np.sum(np.sum(errMatrix))
        print ("After {} iterations: Total Error: {}".format(iter_cnt, totErr))
        iter_cnt += 1

    # Generate the predictive ratings
    prediction = pd.DataFrame(np.dot(userBasedrating, weights), 
                              columns = userBasedrating.columns, index = userBasedrating.index)
    
    prediction_long = prediction.reset_index().melt(id_vars = ["userID"], var_name = "placeID", 
                                                   value_name = "cosine_predictive_rating")
    
    return prediction_long

In [11]:
cosine_prediction_results = cosine_prediction(userPlaceMatrix, user_profile)

StandardScaler: Fitting the scaler...
StandardScaler: Scaling the data...
After 0 iterations: Total Error: 2472.9129074874418
After 1 iterations: Total Error: 97.15720465631276


# NMF Model

In [12]:
def nmf_func(input_df):
    
    """
    Returns the NMF based prediction of all ratings
    
    Accepts a user x item matrix with imputed missing values    
    """

    # get components
    nmf_model = NMF(n_components=20)

    # fit data and get W and H components
    nmf_model.fit(input_df)
    Theta = nmf_model.transform(input_df)
    M = nmf_model.components_.T

    # making the predictions
    UserPlace_pred = Theta.dot(M.T)

    UserPlace_pred = pd.DataFrame(UserPlace_pred, columns = input_df.columns, index = input_df.index).round(2)

    UserPlace_pred_long = UserPlace_pred.reset_index().melt(id_vars = ["userID"], var_name = "placeID",
                                                            value_name = "nmf_predictive_rating")

    return UserPlace_pred_long

In [13]:
nmf_prediction_results = nmf_func(userPlaceMatrix)



In [14]:
nmf_prediction_results.head()

Unnamed: 0,userID,placeID,nmf_predictive_rating
0,U1001,132560,0.0
1,U1002,132560,0.0
2,U1003,132560,0.0
3,U1004,132560,0.0
4,U1005,132560,0.0


In [15]:
# merge outputs of all prediction methods
pred_datasets = [hybrid_model_results, cosine_prediction_results, nmf_prediction_results]

pred_merged_datasets = reduce(lambda x, y: pd.merge(x, y, on = ["userID", "placeID"]), pred_datasets).set_index(["userID", "placeID"])

In [16]:
pred_rating_cols = ["est_item", "est_user", 'weighted_prediction', 'non_weighted_prediction', 
                    'cosine_predictive_rating', 'nmf_predictive_rating']

In [17]:
def get_top_n(entity, by = "userID", n = 5):
    
    """
    Returns top n recommended items for a user or the top n recommended users for an item.
    
    entity: Either user or item ID
    by: Type of entity i.e. userID or placeID
    n: number of results
    """
    
    levels = ["userID", "placeID"]
    
    if by in levels:
        
        output = [col for col in levels if col != by][0]
        
        if (entity in pred_merged_datasets.index.get_level_values(by)):
        
            return pd.concat(map(lambda pred: pred_merged_datasets.xs(entity, 
                                                                      level = by)\
                                 .sort_values(by = pred, 
                                              ascending = False).head(n).reset_index()[output].rename(f"by_{pred}"), 
                                 pred_rating_cols), 
                             axis = 1)
        else:
            print(f"{by[:-2].capitalize()} does not exist")
        
    
    else:
        print("Incorrect Level")

In [20]:
get_top_n('U1099',by='userID',n =10)

Unnamed: 0,by_est_item,by_est_user,by_weighted_prediction,by_non_weighted_prediction,by_cosine_predictive_rating,by_nmf_predictive_rating
0,132958,132847,132755,132847,132875,135032
1,132955,135070,132847,135070,134999,135050
2,132922,132846,135055,132846,132851,135041
3,134986,132755,135034,132755,132937,135081
4,135034,135055,132922,135055,132955,135062
5,132755,135057,135057,135057,132921,135052
6,135048,135034,132958,135034,132825,135063
7,135080,132862,132955,132862,132862,135106
8,135013,135054,135070,135054,132755,135079
9,132954,135035,132846,135035,132922,135053
