In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import DataLoader
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer

In [2]:
# ratings = pd.read_csv("data/all_recipe_clean.csv")
with open("data/test_data_100.pkl", "rb") as infile:
    recipes = pickle.load(infile) 
ratings = pd.read_csv("data/test_ratings_100.csv")
dataLoader = DataLoader.DataLoader(ratings, recipes)
user_holdout, recipe_holdout, holdout = dataLoader.get_holdout_data()
holdout_X = [t[:2] for t in holdout]
holdout_Y = np.array([t[2] for t in holdout])

In [5]:
class RecipeMeanEstimator:
    """
    Just guesses the mean for the recipe
    """
    def __init__(self, dataLoader, recipe_holdout):
        self.dataLoader = dataLoader
        self.recipe_holdout = recipe_holdout
        
    def get_recipe_average(self, recipe_id):
        ratings = self.dataLoader.get_recipe_ratings(
            recipe_id, self.recipe_holdout)
        return np.round(np.mean(np.fromiter(ratings.values(), dtype=np.int64)))
    
    def fit(self, X, y):
        """
        X: list of (user_id, recipe_id) tuple
        y: np.array user rating
        """
        return self.predict(X)
    
    def predict(self, X):
        """
        X: list of (user_id, recipe_id) tuple
        """
        return np.array([self.get_recipe_average(recipe_id) for _, recipe_id in X])
    
    def get_params(self, deep=False):
        return {
            'dataLoader': self.dataLoader,
            'recipe_holdout': self.recipe_holdout
        }
        

In [6]:
class UserMeanEstimator:
    """
    Just guesses the user rating mean
    """
    def __init__(self, dataLoader, user_holdout):
        self.dataLoader = dataLoader
        self.user_holdout = user_holdout
        
    def get_user_average(self, user_id):
        ratings = self.dataLoader.get_user_ratings(user_id, self.user_holdout)
        return np.round(np.mean(np.fromiter(ratings.values(), dtype=np.int64)))
        
    def fit(self, X, y):
        """
        X: list of (user_id, recipe_id) tuple
        y: np.array user rating
        """
        return self.predict(X)
    
    def predict(self, X):
        """
        X: list of (user_id, recipe_id) tuple
        """
        return np.array([self.get_user_average(user_id) for user_id, _ in X])
    
    def get_params(self, deep=False):
        return {
            'dataLoader': self.dataLoader,
            'user_holdout': self.user_holdout
        }
            

In [7]:
class TunedMeanEstimator:
    """
    Takes a weighted average of the user's average rating and the
    recipe's average rating, then tunes the weighting parameter w
    based on data.
    """
    def __init__(self, dataLoader, recipe_holdout, user_holdout):
        self.dataLoader = dataLoader
        self.recipe_holdout = recipe_holdout
        self.user_holdout = user_holdout
        self.w = np.array([0.5, 0.5])
        self.eta = 0.01
#         self.classifier = SGDClassifier()
#         self.vec = DictVectorizer()
  
    def get_recipe_average(self, recipe_id):
        ratings = self.dataLoader.get_recipe_ratings(
            recipe_id, self.recipe_holdout)
        return np.mean(np.fromiter(ratings.values(), dtype=np.int64))
    
    def get_user_average(self, user_id):
        ratings = self.dataLoader.get_user_ratings(user_id, self.user_holdout)
        return np.mean(np.fromiter(ratings.values(), dtype=np.int64))
        
#     def get_feat_dict(self, user_id, recipe_id):
#         return {
#             "user_avg": self.get_user_average(user_id),
#             "recipe_avg": self.get_recipe_average(recipe_id),
#         }
        
    def fit(self, X, y):
        self.eta = 0.01
        feats = [np.array([self.get_user_average(user_id), self.get_recipe_average(recipe_id)]) for user_id, recipe_id in X]
#         feats = [self.get_feat_dict(user_id, recipe_id) for user_id, recipe_id in X]
#         feats = self.vec.fit_transform(feats)
#         return self.classifier.fit(feats, y)
        for epoch in range(10):
            for i, feat in enumerate(feats):
                pred = np.dot(self.w, feat)
#                 print(self.w, feat, pred, y[i])
                loss = pred - y[i]
                self.w -= self.eta*2*loss*feat
            self.eta /= 2
        print(self.w)
    
    def predict(self, X):
#         feats = [self.get_feat_dict(user_id, recipe_id) for user_id, recipe_id in X]
        feats = [np.array([self.get_user_average(user_id), self.get_recipe_average(recipe_id)]) for user_id, recipe_id in X]
        return [np.round(np.dot(self.w, feat)) for feat in feats]
#         feats = self.vec.transform(feats)
#         return self.classifier.predict(feats)
    
    def get_params(self, deep=False):
        return {
            'dataLoader': self.dataLoader,
            'user_holdout': self.user_holdout,
            'recipe_holdout': self.recipe_holdout,
        }

In [8]:
class SimpleSGDEstimator:
    """
    Takes a weighted average of the user's average rating and the
    recipe's average rating, then tunes the weighting parameter w
    based on data.
    """
    def __init__(self, dataLoader, recipe_holdout, user_holdout):
        self.dataLoader = dataLoader
        self.recipe_holdout = recipe_holdout
        self.user_holdout = user_holdout
        self.classifier = SGDClassifier()
        self.vec = DictVectorizer()
  
    def get_recipe_average(self, recipe_id):
        ratings = self.dataLoader.get_recipe_ratings(
            recipe_id, self.recipe_holdout)
        return np.mean(np.fromiter(ratings.values(), dtype=np.int64))
    
    def get_user_average(self, user_id):
        ratings = self.dataLoader.get_user_ratings(user_id, self.user_holdout)
        return np.mean(np.fromiter(ratings.values(), dtype=np.int64))
        
    def get_feat_dict(self, user_id, recipe_id):
        return {
            "user_avg": self.get_user_average(user_id),
            "recipe_avg": self.get_recipe_average(recipe_id),
        }
        
    def fit(self, X, y):
        feats = [self.get_feat_dict(user_id, recipe_id) for user_id, recipe_id in X]
        feats = self.vec.fit_transform(feats)
        return self.classifier.fit(feats, y)
    
    def predict(self, X):
        feats = [self.get_feat_dict(user_id, recipe_id) for user_id, recipe_id in X]
        feats = self.vec.transform(feats)
        return self.classifier.predict(feats)
    
    def get_params(self, deep=False):
        return {
            'dataLoader': self.dataLoader,
            'user_holdout': self.user_holdout,
            'recipe_holdout': self.recipe_holdout,
        }

In [9]:
class CollaborativeFilteringEstimator:
    """
    Uses item-item collaborative filtering to predict the user rating
    """
    def __init__(self, dataLoader, recipe_holdout, user_holdout, similarity=None):
        self.dataLoader = dataLoader
        self.recipe_holdout = recipe_holdout
        self.user_holdout = user_holdout
        self.index_to_recipe_id = [id for id in self.dataLoader.get_recipe_ids()]
        self.recipe_id_to_index = { id: idx for idx, id in enumerate(self.index_to_recipe_id) }
        if similarity is None:
            self.similarity = self.compute_similarity_matrix()
        else:
            self.similarity = similarity
        
    def compute_similarity_matrix(self):
        
        num_recipes = len(self.recipe_id_to_index)
        sim = np.zeros((num_recipes, num_recipes))
        
        # since symmetric, only have sim[smallerIndex][largerIndex]
        indices = list(enumerate(self.index_to_recipe_id))
        
        for idx1, recipe1 in indices:
            ratings1 = self.dataLoader.get_recipe_ratings(recipe1, self.recipe_holdout)
            
            related_recipes = set()
            for user_id in dataLoader.get_recipe_ratings(5, recipe_holdout).keys():
                rated_recipes = dataLoader.get_user_ratings(user_id, user_holdout).keys()
                related_recipes = related_recipes.union(rated_recipes)
            
            if idx1 % 100 == 0:
                print("Similarity computation progress", idx1)
            for recipe2 in related_recipes:
                idx2 = self.recipe_id_to_index[recipe2]
                if idx2 > idx1:
                    sim[idx1][idx2] = self.compute_cosine_similiarity(ratings1, recipe2)
        return sim
    
    def compute_cosine_similiarity(self, ratings1, r2):
        ratings2 = self.dataLoader.get_recipe_ratings(r2, self.recipe_holdout)
        prod = 0
        for k in ratings1:
            if k in ratings2:
                prod += ratings1[k] * ratings2[k]
        return prod / (len(ratings1) * len(ratings2))
        
    def fit(self, X, y):
        """
        X: list of (user_id, recipe_id) tuple
        y: np.array user rating
        """
        return self.predict(X)
    
    def predict(self, X):
        """
        X: list of (user_id, recipe_id) tuple
        """
        preds = []
        for user_id, recipe_id in X:
            user_ratings = self.dataLoader.get_user_ratings(user_id, self.user_holdout)
            numer = 0.0
            denom = 0.0
            pred_idx = self.recipe_id_to_index[recipe_id]
            for neighbor_rid, rating in user_ratings.items():
                neighbor_idx = self.recipe_id_to_index[neighbor_rid]
                low_id, high_id = tuple(sorted([pred_idx, neighbor_idx]))
                sim = self.similarity[low_id][high_id]
                numer += sim * rating
                denom += sim
            if denom == 0.0:
                pred = np.mean(np.fromiter(user_ratings.values(), dtype=np.int64))
            else:
                pred = numer / denom
            preds.append(pred)
        return np.round(np.array(preds))
        
    def get_params(self, deep=False):
        return {
            'dataLoader': self.dataLoader,
            'user_holdout': self.user_holdout,
            'recipe_holdout': self.recipe_holdout,
            'similarity': self.similarity,
        }

In [10]:
class LogisticEstimator:
    def __init__(self, dataLoader, recipe_holdout, user_holdout):
        self.dataLoader = dataLoader
        self.recipe_holdout = recipe_holdout
        self.user_holdout = user_holdout
        self.regressor = LogisticRegression(max_iter=100, penalty='l1')
        self.vec = DictVectorizer()
        
        self.recipe_id_to_index = { id: idx for idx, id in enumerate(dataLoader.get_recipe_ids()) }
        
    def extract_features(self, X):
        feats = [self.get_feature_dict(user_id, recipe_id) for user_id, recipe_id in X]
#         f = self.vec.fit_transform(feats)
        return feats

#     def extract_features(self, X):
#         recipe_ratings_matrix = sp.sparse.lil_matrix((len(X), len(self.recipe_id_to_index)), dtype=np.int8)
#         feats = []
#         for i, ids in enumerate(X):
#             user_id, recipe_id = ids
#             item_feats = self.get_feature_dict(user_id, recipe_id, i, recipe_ratings_matrix)
#             feats.append(item_feats)
#         f = self.vec.fit_transform(feats)
#         f = sp.sparse.hstack([f, recipe_ratings_matrix])
#         return f
        
    def get_feature_dict(self, user_id, recipe_id):
        user_ratings = self.dataLoader.get_user_ratings(user_id, self.user_holdout)
        avg_user_rating = np.mean(np.fromiter(user_ratings.values(), dtype=np.int64))
        
        recipe_ratings =  self.dataLoader.get_recipe_ratings(
            recipe_id, self.recipe_holdout)
        avg_recipe_rating = np.mean(np.fromiter(recipe_ratings.values(), dtype=np.int64))
        recipe_info = self.dataLoader.get_recipe_info(recipe_id)
        
        feats = {
            'user_id': user_id,
            'recipe_id': recipe_id,
            'avg_user_rating': avg_user_rating,
            'avg_recipe_rating': avg_recipe_rating,
            'calories': recipe_info['calories']
        }
        
        for category in recipe_info["categories"]:
            feats["cat_{}".format(category)] = 1
        for ingredient in recipe_info["ingredients"]:
            ing_name = ingredient["key ingredient"]
            feats["ing_{}".format(ing_name)] = ingredient["quantity"]
        
#         for other_recipe_id, rating in user_ratings.items():
#             feats[str((user_id, other_recipe_id))] = rating
#         for other_user_id, rating in recipe_ratings.items():
#             feats[str((other_user_id, recipe_id))] = rating
        
        return feats
        
    
#     def get_feature_dict(self, user_id, recipe_id, i, recipe_mat):
#         user_ratings = self.dataLoader.get_user_ratings(user_id, self.user_holdout)
#         avg_user_rating = np.mean(np.fromiter(user_ratings.values(), dtype=np.int64))
        
#         recipe_ratings =  self.dataLoader.get_recipe_ratings(
#             recipe_id, self.recipe_holdout)
#         avg_recipe_rating = np.mean(np.fromiter(recipe_ratings.values(), dtype=np.int64))

#         feats = {
#             'user_id': user_id,
#             'recipe_id': recipe_id,
#             'avg_user_rating': avg_user_rating,
#             'avg_recipe_rating': avg_recipe_rating,
#         }
        
#         for other_recipe_id, rating in user_ratings.items():
# #             feats[str((user_id, other_recipe_id))] = rating
#             recipe_mat[i, self.recipe_id_to_index[other_recipe_id]] = rating
# #         for other_user_id, rating in recipe_ratings.items():
# #             feats[str((other_user_id, recipe_id))] = rating
        
#         return feats
    
    def fit(self, X, y):
        """
        X: list of (user_id, recipe_id) tuple
        y: np.array user rating
        """
        feats = self.extract_features(X)
        feats = self.vec.fit_transform(feats)
        return self.regressor.fit(feats, y)
        
    def predict(self, X):
        """
        X: list of (user_id, recipe_id) tuple
        """
        feats = self.extract_features(X)
        feats = self.vec.transform(feats)
        return self.regressor.predict(feats)
    
    def get_params(self, deep=False):
        return {
            'dataLoader': self.dataLoader,
            'user_holdout': self.user_holdout,
            'recipe_holdout': self.recipe_holdout,
        }

In [11]:
class RandomForestEstimator:
    def __init__(self, dataLoader, recipe_holdout, user_holdout):
        self.dataLoader = dataLoader
        self.recipe_holdout = recipe_holdout
        self.user_holdout = user_holdout
        self.classifier = RandomForestClassifier(n_estimators=100)
        self.vec = DictVectorizer()
    
    def extract_features(self, X):
        feats = [self.get_feature_dict(user_id, recipe_id) for user_id, recipe_id in X]
#         f = self.vec.fit_transform(feats)
        return feats
    
    def get_feature_dict(self, user_id, recipe_id):
        user_ratings = self.dataLoader.get_user_ratings(user_id, self.user_holdout)
        avg_user_rating = np.mean(np.fromiter(user_ratings.values(), dtype=np.int64))
        
        recipe_ratings =  self.dataLoader.get_recipe_ratings(
            recipe_id, self.recipe_holdout)
        avg_recipe_rating = np.mean(np.fromiter(recipe_ratings.values(), dtype=np.int64))

        feats = {
            'user_id': user_id,
            'recipe_id': recipe_id,
            'avg_user_rating': avg_user_rating,
            'avg_recipe_rating': avg_recipe_rating,
        }
        
        for other_recipe_id, rating in user_ratings.items():
            feats[str((user_id, other_recipe_id))] = rating
        for other_user_id, rating in recipe_ratings.items():
            feats[str((other_user_id, recipe_id))] = rating
        
        return feats
        
    def fit(self, X, y):
        """
        X: list of (user_id, recipe_id) tuple
        y: np.array user rating
        """
        feats = self.extract_features(X)
        feats =  self.vec.fit_transform(feats)
        return self.classifier.fit(feats, y)
        
    def predict(self, X):
        """
        X: list of (user_id, recipe_id) tuple
        """
        feats = self.extract_features(X)
        feats = self.vec.transform(feats)
        return self.classifier.predict(feats)
    
    def get_params(self, deep=False):
        return {
            'dataLoader': self.dataLoader,
            'user_holdout': self.user_holdout,
            'recipe_holdout': self.recipe_holdout,
        }

In [31]:
recipeMeanEstimator = RecipeMeanEstimator(dataLoader, recipe_holdout)
# recipeMeanEstimator.fit(holdout_X, holdout_Y)

userMeanEstimator = UserMeanEstimator(dataLoader, user_holdout)
# userMeanEstimator.fit(holdout_X, holdout_Y)

collabFilterEstimator = CollaborativeFilteringEstimator(dataLoader, recipe_holdout, user_holdout)

logitEstimator = LogisticEstimator(dataLoader, recipe_holdout, user_holdout)
# logitEstimator.fit(holdout_X, holdout_Y)

# logitEstimator.predict(holdout_X[:1000])

randomForestEstimator = RandomForestEstimator(dataLoader, recipe_holdout, user_holdout)
# randomForestEstimator.fit(holdout_X, holdout_Y)
# randomForestEstimator.predict(holdout_X[:1000])

tunedMeanEstimator = TunedMeanEstimator(dataLoader, recipe_holdout, user_holdout)
# tunedMeanEstimator.fit(holdout_X, holdout_Y)
# tunedMeanEstimator.predict(holdout_X[:10])

simpleSGDEstimator = SimpleSGDEstimator(dataLoader, recipe_holdout, user_holdout)


# cross_val_score(tunedMeanEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro')

Similarity computation progress 0
Similarity computation progress 100
Similarity computation progress 200
Similarity computation progress 300
Similarity computation progress 400
Similarity computation progress 500
Similarity computation progress 600
Similarity computation progress 700
Similarity computation progress 800
Similarity computation progress 900
Similarity computation progress 1000
Similarity computation progress 1100
Similarity computation progress 1200
Similarity computation progress 1300
Similarity computation progress 1400
Similarity computation progress 1500
Similarity computation progress 1600
Similarity computation progress 1700
Similarity computation progress 1800
Similarity computation progress 1900
Similarity computation progress 2000
Similarity computation progress 2100
Similarity computation progress 2200
Similarity computation progress 2300
Similarity computation progress 2400
Similarity computation progress 2500
Similarity computation progress 2600
Similarity co

In [36]:
print("Recipe Mean Estimator Cross-Val F1-Macro Score",
      np.mean(cross_val_score(recipeMeanEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro')))

print("User Mean Estimator Cross-Val F1-Macro Score",
      np.mean(cross_val_score(userMeanEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro')))
 
print("Collab Filter Estimator Cross-Val F1-Macro Score",
      np.mean(cross_val_score(collabFilterEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro')))

print("Logistic Regression Cross-Val F1-Macro Score",
     np.mean(cross_val_score(logitEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro')))

print("Random Forest Cross-Val F1-Macro Score",
      np.mean(cross_val_score(randomForestEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro')))

Recipe Mean Estimator Cross-Val F1-Macro Score 0.15476658380193647


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [14]:
# collabFilterEstimator.predict(holdout_X)
# cross_val_score(recipeMeanEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro')
# cross_val_score(logitEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro')
# cross_val_score(collabFilterEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro')

  'recall', 'true', average, warn_for)


array([0.2308051 , 0.20650424, 0.18901866, 0.17943824, 0.23043864])

In [79]:
# logitEstimator = LogisticEstimator(dataLoader, recipe_holdout, user_holdout)
# logitEstimator.fit(holdout_X, holdout_Y)

0.15837094399999999

In [24]:
simpleSGDEstimator = SimpleSGDEstimator(dataLoader, recipe_holdout, user_holdout)
np.mean(cross_val_score(simpleSGDEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro'))



0.08264884874505347

In [14]:
# np.mean(cross_val_score(tunedMeanEstimator, holdout_X, holdout_Y, cv=5, scoring='f1_macro'))
logitEstimator = LogisticEstimator(dataLoader, recipe_holdout, user_holdout)
logitEstimator.fit(holdout_X, holdout_Y)
logitEstimator.predict(holdout_X)


array([5, 4])