In [1]:
import sys, os 
import json, jsoncomment
import urllib3
import functools
import unittest

# Scope 

provide recommendations to user preferences based on details mentioned in files. This is a very simple filtering based on the matches between likes and dislikes. 

Ofcourse, recommendations is a very interesting and involving exercise and there are ML approaches like clustering, k-means and other algorithms which could be used


# Assumptions 

 ID        |      Assumption Details     | Design Considerations missed |
 -----------:|:---------------------------|:-----------------------------|
 1         | data is absolutely clean, except for trailing comma in venue file | Data Quality, deduping, nulls, non conformance to structure, etc |
 2         | the data volume is really less, therefore we are not using PySpark, Pandas, HIVE and other interesting solutions | Data Volume, scalability and extensibility |
 3         | this is just an exercise, therefore we are not including things around optimizing response speed of the process, but the current process does respond back quite fast | Performance Optimization, parallelization, etc |
 
 
 # Changes made to source data
 
 1. We can use regular expressions in order to remove trailing comma from the data but that would have made the solution look complex, I have worked extensively on regular expression and the following [link](https://github.com/gourav-sg/datamojo/blob/master/US%20National%20Agricultural%20Statistics%20Service%20Parser.ipynb) can be used to see one of my work using regular expression which extracts tables from text files
 2. All the food and drink names have been changed to smaller case
 


# Improvements that can be made

 ID        |      Improvement     | 
 ---------:|:---------------------------|
 1         | we can create meta-data of the name of the food with common names and that will increase the accuracy |
 2         | we can always increase the scalability of this solution by using a few different options |
 3         | I would ideally like to put everything in a class and make a package of this, but I think for the purpose of the interview, I thought of using Jupyter to illustrate the solution

##  Environment

In [240]:
sys.version

'3.6.5 |Anaconda, Inc.| (default, Apr 26 2018, 08:42:37) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

##  Load the data

In [3]:
http = urllib3.PoolManager()
parser = jsoncomment.JsonComment(json)
urllib3.disable_warnings()


def loadData():
    usersLink = "https://gist.githubusercontent.com/benjambles/ea36b76bc5d8ff09a51def54f6ebd0cb/raw/524e40ec297353b8070ff10ee0d9d847e44210f5/users.json"
    usersJson = http.request('GET', usersLink, retries=False)
    userRecords = parser.loads(usersJson.data.decode('utf-8').replace("\n",""))
    usersJson.close()

    venuesLink = "https://gist.githubusercontent.com/benjambles/ea36b76bc5d8ff09a51def54f6ebd0cb/raw/524e40ec297353b8070ff10ee0d9d847e44210f5/venues.json"
    venuesJson = http.request('GET', venuesLink, retries=False)
    venueRecords = parser.loads(venuesJson.data.decode('utf-8').replace("\n",""))
    venuesJson.close()
    
    return(userRecords, venueRecords)

In [6]:
%time (userRecords, venueRecords) = loadData()

CPU times: user 2.23 ms, sys: 980 µs, total: 3.21 ms
Wall time: 12.7 ms


Incase we are loading the content from local files

In [395]:
parser = jsoncomment.JsonComment(json)

usersJson = open("/Users/gouravsengupta/Downloads/users.json", encoding='utf-8')
venuesJson = open("/Users/gouravsengupta/Downloads/venue.json", encoding='utf-8')
userRecords = parser.loads(usersJson.read())
venueRecords = parser.loads(venuesJson.read())
usersJson.close()
venuesJson.close()

In [396]:
userRecords

[{'name': 'John Davis',
  'wont_eat': ['Fish'],
  'drinks': ['Cider', 'Rum', 'Soft drinks']},
 {'name': 'Gary Jones',
  'wont_eat': ['Eggs', 'Pasta'],
  'drinks': ['Tequila', 'Soft drinks', 'beer', 'Coffee']},
 {'name': 'Robert Webb',
  'wont_eat': ['Bread', 'Pasta'],
  'drinks': ['Vokda', 'Gin', 'Whisky', 'Rum']},
 {'name': 'Gavin Coulson',
  'wont_eat': [],
  'drinks': ['Cider', 'Beer', 'Rum', 'Soft drinks']},
 {'name': 'Alan Allen',
  'wont_eat': ['Meat', 'Fish'],
  'drinks': ['Soft drinks', 'Tea']},
 {'name': 'Bobby Robson',
  'wont_eat': ['Mexican'],
  'drinks': ['Vokda', 'Gin', 'whisky', 'Rum', 'Cider', 'Beer', 'Soft drinks']},
 {'name': 'David Lang',
  'wont_eat': ['Chinese'],
  'drinks': ['Beer', 'cider', 'Rum']}]

In [397]:
venueRecords

[{'name': 'El Cantina',
  'food': ['Mexican'],
  'drinks': ['Soft drinks', 'Tequila', 'Beer']},
 {'name': 'Twin Dynasty',
  'food': ['Chinese'],
  'drinks': ['Soft Drinks', 'Rum', 'Beer', 'Whisky', 'Cider']},
 {'name': 'Spice of life',
  'food': ['Eggs', 'Meat', 'Fish', 'Pasta', 'Dairy'],
  'drinks': ['Vokda', 'Gin', 'whisky', 'Rum', 'Cider', 'Beer', 'Soft drinks']},
 {'name': 'The Cambridge',
  'food': ['Eggs', 'Meat', 'Fish', 'Pasta', 'Dairy'],
  'drinks': ['Vokda', 'Gin', 'Cider', 'Beer', 'Soft drinks']},
 {'name': 'Wagamama',
  'food': ['Japanese'],
  'drinks': ['Beer', 'Cider', 'Soft Drinks', 'Sake']},
 {'name': 'Sultan Sofrasi',
  'food': ['Meat', 'Bread', 'Fish'],
  'drinks': ['Beer', 'Cider', 'Soft Drinks']},
 {'name': 'Spirit House',
  'food': ['Nuts', 'Cheese', 'Fruit'],
  'drinks': ['Vodka', 'Gin', 'Rum', 'Tequila']},
 {'name': 'Tally Joe',
  'food': ['Fish', 'Meat', 'Salad', 'Deserts'],
  'drinks': ['Beer', 'Cider', 'Soft Drinks', 'Sake']},
 {'name': 'Fabrique',
  'food': [

## Change the data stucture for easy querying


we will be creating a food based key value pair (the assumption here is that the name of food and drink will not be the same

In [7]:
def updateData():
    itemsInVenue = {} 
    allVenues = []
    allFoodItems = []
    allDrinkItems = []

    for venue in venueRecords:
        allVenues.append(venue["name"])
        for food in venue['food']:
            food = food.lower()
            allFoodItems = list(set(allFoodItems + [food]))
            if food not in itemsInVenue.keys():
                itemsInVenue[food] = {}
                itemsInVenue[food]["type"] = 'food'
                itemsInVenue[food]["in_venue"] = [venue["name"]]
            else:
                itemsInVenue[food]["in_venue"] += [venue["name"]]

        for drink in venue['drinks']:
            drink = drink.lower()
            allDrinkItems = list(set(allDrinkItems + [drink]))        
            if drink not in itemsInVenue.keys():
                itemsInVenue[drink] = {}
                itemsInVenue[drink]["type"] = 'drink'
                itemsInVenue[drink]["in_venue"] = [venue["name"]]
            else:
               itemsInVenue[drink]["in_venue"] += [venue["name"]]
            
    return (itemsInVenue, allVenues, allFoodItems, allDrinkItems) 

In [384]:
(itemsInVenue, allVenues, allFoodItems, allDrinkItems) = updateData()

In [247]:
# this is not required, but just shows the extensibility of this model
#for item in itemsInVenue:
#    itemsInVenue[item]['not_in_venue'] = list(set(venues) - set(itemsInVenue[item]["in_venue"]))
    

In [385]:
itemsInVenue

{'mexican': {'type': 'food', 'in_venue': ['El Cantina']},
 'soft drinks': {'type': 'drink',
  'in_venue': ['El Cantina',
   'Twin Dynasty',
   'Spice of life',
   'The Cambridge',
   'Wagamama',
   'Sultan Sofrasi',
   'Tally Joe',
   'Fabrique']},
 'tequila': {'type': 'drink', 'in_venue': ['El Cantina', 'Spirit House']},
 'beer': {'type': 'drink',
  'in_venue': ['El Cantina',
   'Twin Dynasty',
   'Spice of life',
   'The Cambridge',
   'Wagamama',
   'Sultan Sofrasi',
   'Tally Joe']},
 'chinese': {'type': 'food', 'in_venue': ['Twin Dynasty']},
 'rum': {'type': 'drink',
  'in_venue': ['Twin Dynasty', 'Spice of life', 'Spirit House']},
 'whisky': {'type': 'drink', 'in_venue': ['Twin Dynasty', 'Spice of life']},
 'cider': {'type': 'drink',
  'in_venue': ['Twin Dynasty',
   'Spice of life',
   'The Cambridge',
   'Wagamama',
   'Sultan Sofrasi',
   'Tally Joe']},
 'eggs': {'type': 'food', 'in_venue': ['Spice of life', 'The Cambridge']},
 'meat': {'type': 'food',
  'in_venue': ['Spice of

This shows another way to generate some of the values above, we **do not** need to run this if we have run the code above

In [386]:
allFoodItems = list(filter(lambda itemName: itemsInVenue[itemName]["type"] == "food", itemsInVenue.keys()))
allDrinkItems = list(filter(lambda itemName: itemsInVenue[itemName]["type"] == "drink", itemsInVenue.keys()))
allVenues = list(map(lambda venueRecord: venueRecord["name"], venueRecords))

In [387]:
print(allVenues)
print(allFoodItems)
print(allDrinkItems)

['El Cantina', 'Twin Dynasty', 'Spice of life', 'The Cambridge', 'Wagamama', 'Sultan Sofrasi', 'Spirit House', 'Tally Joe', 'Fabrique']
['mexican', 'chinese', 'eggs', 'meat', 'fish', 'pasta', 'dairy', 'japanese', 'bread', 'nuts', 'cheese', 'fruit', 'salad', 'deserts', 'deli']
['soft drinks', 'tequila', 'beer', 'rum', 'whisky', 'cider', 'vokda', 'gin', 'sake', 'vodka', 'tea', 'coffee']


## Generate the features

In [8]:
def getDistinctInVenue(userName, drinkNames, wontEatNames):
    
    drinkNamesLower = list(map(lambda drinkName: drinkName.lower(), drinkNames))
    wontEatNamesLower = list(map(lambda wontEatName: wontEatName.lower(), wontEatNames))
    eatNamesLower = list(set(allFoodItems) - set(wontEatNamesLower))

    allDrinkVenues = list(map(lambda drinkName: itemsInVenue[drinkName]["in_venue"] if drinkName in itemsInVenue.keys() else []
               , drinkNamesLower))
    allFoodVenues = list(map(lambda foodName: itemsInVenue[foodName]["in_venue"] if foodName in itemsInVenue.keys() else []
               , eatNamesLower))   
    
    allDistinctDrinkVenues = list(set(list(functools.reduce(lambda venueA, venueB: venueA + venueB, allDrinkVenues))))
    allDistinctFoodVenues = list(set(list(functools.reduce(lambda venueA, venueB: venueA + venueB, allFoodVenues))))
    allDistinctVenues = list(set(allDistinctDrinkVenues).intersection(set(allDistinctFoodVenues)))

    venueWithNothingToDrink = list(set(allVenues) - set(allDistinctVenues))
    venueWithNothingToEat   = list(set(allVenues) - set(allDistinctFoodVenues))

    return ({'user_name' : userName,
             'venues_to_go' : allDistinctVenues, 
             'venues_with_no_drinks' : venueWithNothingToDrink, 
             'venues_with_no_food' : venueWithNothingToEat
            })



In [389]:
features = list(map(lambda user: getDistinctInVenue(user["name"], user["drinks"], user["wont_eat"]), userRecords))

In [390]:
features

[{'user_name': 'John Davis',
  'venues_to_go': ['Tally Joe',
   'Wagamama',
   'El Cantina',
   'The Cambridge',
   'Sultan Sofrasi',
   'Spice of life',
   'Fabrique',
   'Spirit House',
   'Twin Dynasty'],
  'venues_with_no_drinks': [],
  'venues_with_no_food': []},
 {'user_name': 'Gary Jones',
  'venues_to_go': ['Tally Joe',
   'Wagamama',
   'El Cantina',
   'The Cambridge',
   'Sultan Sofrasi',
   'Spice of life',
   'Fabrique',
   'Spirit House',
   'Twin Dynasty'],
  'venues_with_no_drinks': [],
  'venues_with_no_food': []},
 {'user_name': 'Robert Webb',
  'venues_to_go': ['Twin Dynasty',
   'Spirit House',
   'The Cambridge',
   'Spice of life'],
  'venues_with_no_drinks': ['Tally Joe',
   'Wagamama',
   'El Cantina',
   'Sultan Sofrasi',
   'Fabrique'],
  'venues_with_no_food': []},
 {'user_name': 'Gavin Coulson',
  'venues_to_go': ['Tally Joe',
   'Wagamama',
   'El Cantina',
   'The Cambridge',
   'Sultan Sofrasi',
   'Spice of life',
   'Fabrique',
   'Spirit House',
   'Tw

## Generating the recommendation

please note that once the features above are generated we can flatten and then input the data to several ML methods to generate recommendations. Currently I am just using basic filtering

In [9]:
def getRecommendations(features):
    hotelsToVisit = []
    hotelsNotToVisit = {}
    
    for records in features:
        if len(hotelsToVisit) == 0: hotelsToVisit = records["venues_to_go"]
        else: hotelsToVisit = list(set(hotelsToVisit).intersection(records["venues_to_go"]))
        for venue in records["venues_with_no_drinks"]:
            if venue not in hotelsNotToVisit.keys():
                hotelsNotToVisit[venue] = {}
                hotelsNotToVisit[venue]["nothing_to_eat"] = []
                hotelsNotToVisit[venue]["nothing_to_drink"] = [records["user_name"]]
            else:
                hotelsNotToVisit[venue]["nothing_to_drink"] += [records["user_name"]]

        for venue in records["venues_with_no_food"]:
            if venue not in hotelsNotToVisit.keys():
                hotelsNotToVisit[venue] = {}
                hotelsNotToVisit[venue]["nothing_to_drink"] = []
                hotelsNotToVisit[venue]["nothing_to_eat"] = [records["user_name"]]
            else:
                hotelsNotToVisit[venue]["nothing_to_eat"] += [records["user_name"]]
    return({'hotels_to_visit' : hotelsToVisit, 'hotels_not_to_visit': hotelsNotToVisit})

another way of calculating `hotelsToVisit`, we **are not** using this 

In [392]:
hotelsToVisit = functools.reduce(lambda venueA, venueB: set(venueA).intersection(venueB),                            
                 map(lambda user: user['venues_to_go'], features))

## Getting it all together

In [10]:
%time

(userRecords, venueRecords) = loadData()
(itemsInVenue, allVenues, allFoodItems, allDrinkItems) = updateData()
features = list(map(lambda user: getDistinctInVenue(user["name"], user["drinks"], user["wont_eat"]), userRecords))
getRecommendations(features)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.25 µs


{'hotels_to_visit': ['The Cambridge', 'Spice of life'],
 'hotels_not_to_visit': {'Fabrique': {'nothing_to_eat': [],
   'nothing_to_drink': ['Robert Webb', 'David Lang']},
  'Sultan Sofrasi': {'nothing_to_eat': [],
   'nothing_to_drink': ['Robert Webb']},
  'El Cantina': {'nothing_to_eat': ['Bobby Robson'],
   'nothing_to_drink': ['Robert Webb', 'Bobby Robson']},
  'Tally Joe': {'nothing_to_eat': [], 'nothing_to_drink': ['Robert Webb']},
  'Wagamama': {'nothing_to_eat': [], 'nothing_to_drink': ['Robert Webb']},
  'Spirit House': {'nothing_to_eat': [], 'nothing_to_drink': ['Alan Allen']},
  'Twin Dynasty': {'nothing_to_eat': ['David Lang'],
   'nothing_to_drink': ['David Lang']}}}

## Unit Test cases

there are several unit test cases that must be written for this exercise, extensively covering, data types, meta-data, data quality, etc, but as of now I am just covering this unit test case.

In [12]:
class testOutputs(unittest.TestCase):
    """
    testing different scenarios
    """
    
    def testHotelsSelection(self):
        # testing for a particular drink which is available only in one venue
        userRecords = [{'name': 'John Davis', 'wont_eat': ['Fish1'], 'drinks': ['Coffee']}]
        features = list(map(lambda user: getDistinctInVenue(user["name"], user["drinks"], user["wont_eat"]), userRecords))
        self.assertEqual(getRecommendations(features)['hotels_to_visit'], ['Fabrique'])
        
    def testHotelsSelection2(self):
        # testing by removing a user whose selection was causing may rejections in venues
        (userRecords, venueRecords) = loadData()
        userRecords = list(filter(lambda user: user["name"] != 'Robert Webb', userRecords))
        features = list(map(lambda user: getDistinctInVenue(user["name"], user["drinks"], user["wont_eat"]), userRecords))
        self.assertEqual(list(set(getRecommendations(features)['hotels_to_visit']) - set(['Tally Joe',  'Wagamama',  'Sultan Sofrasi',  'The Cambridge',  'Spice of life'])), [])
        
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

.
----------------------------------------------------------------------
Ran 2 tests in 0.015s

OK
