In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [22]:
testInputs= ['good ambiance restaurants, serving fish',
'restaurants serving hygienic food',
'must visit restaurants have fish',
'restaurants serving healthy food near Airport',
'restaurants with good dance floor and music',
'best cakes',
'authentic chicken biryani under Rs 200']

doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [3]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [4]:
labels = {}

for s in testInputs:
    doc1 = nlp(s)
    labels[s] = []
    for ent in doc1.ents:
        labels[s].append(ent.label_)

labels

{'good ambiance restaurants, serving fish': [],
 'restaurants serving hygienic food': [],
 'must visit restaurants': [],
 'restaurants serving healthy food near Airport': ['GPE'],
 'restaurants with good dance floor and music': [],
 'best cakes': [],
 'authentic chicken biryani under Rs 200': ['PERSON', 'CARDINAL']}

In [12]:
pos = {}
for s in testInputs:
    doc1 = nlp(s)
    for token in doc1:
        print (token.text)
        print ("--------------------------------------")
        print(token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha)
        if 'NN' in token.tag_ and token.is_stop == False:
            pos[token.text] = [token.tag_, token.dep_]
    print ("---------------------------------------")
print ("---------------------------------------")
print (pos)
for k in pos:
    print (k)
    list(pos[k]).append(nlp(k).ents)

good
--------------------------------------
good ADJ JJ amod xxxx True
ambiance
--------------------------------------
ambiance NOUN NN compound xxxx True
restaurants
--------------------------------------
restaurant NOUN NNS ROOT xxxx True
,
--------------------------------------
, PUNCT , punct , False
serving
--------------------------------------
serve VERB VBG acl xxxx True
fish
--------------------------------------
fish NOUN NN dobj xxxx True
---------------------------------------
restaurants
--------------------------------------
restaurant NOUN NNS ROOT xxxx True
serving
--------------------------------------
serve VERB VBG acl xxxx True
hygienic
--------------------------------------
hygienic ADJ JJ amod xxxx True
food
--------------------------------------
food NOUN NN dobj xxxx True
---------------------------------------
must
--------------------------------------
must VERB MD aux xxxx True
visit
--------------------------------------
visit VERB VB ROOT xxxx True
restaura

In [6]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [13]:
pos

{'ambiance': ['NN', 'compound'],
 'restaurants': ['NNS', 'ROOT'],
 'fish': ['NN', 'dobj'],
 'food': ['NN', 'dobj'],
 'Airport': ['NNP', 'pobj'],
 'dance': ['NN', 'compound'],
 'floor': ['NN', 'pobj'],
 'music': ['NN', 'conj'],
 'cakes': ['NNS', 'ROOT'],
 'chicken': ['NN', 'nsubj'],
 'biryani': ['NNS', 'ROOT'],
 'Rs': ['NNS', 'pobj']}

In [11]:

# token level
ent_ada_0 = [doc[0].text, doc[0].ent_type_]
ent_ada_1 = [doc[1].text, doc[1].ent_type_]
ent_london_5 = [doc[5].text, doc[5].ent_type_]
print(ent_ada_0)  # ['Ada', 'PERSON', 'Q7259']
print(ent_ada_1)  # ['Lovelace', 'PERSON', 'Q7259']
print(ent_london_5)  # ['London', 'GPE', 'Q84']


['Apple', 'ORG']
['is', '']
['U.K.', 'GPE']


In [19]:
## removing Stop Words
import stanza
stanza.download('en')       # This downloads the English models for the neural pipeline
nlp = stanza.Pipeline('en') # This sets up a default neural pipeline in English
doc = nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
doc.sentences[0].print_dependencies()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 14.0MB/s]                    
2020-11-27 22:06:31 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/default.zip: 100%|██████████| 428M/428M [10:09<00:00, 702kB/s]   
2020-11-27 22:16:54 INFO: Finished downloading models and saved to /Users/devanshsharma/stanza_resources.
2020-11-27 22:16:54 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-11-27 22:16:54 INFO: Use device: cpu
2020-11-27 22:16:54 INFO: Loading: tokenize
2020-11-27 22:16:54 INFO: Loading: pos
2020-11-27 22:16:55 INFO: Loading: lemma
2020-11-27 22:16:55 INFO: Loading: depparse
2020-11-27 22:16:57 INFO: Loading: senti

('Barack', 4, 'nsubj:pass')
('Obama', 1, 'flat')
('was', 4, 'aux:pass')
('born', 0, 'root')
('in', 6, 'case')
('Hawaii', 4, 'obl')
('.', 4, 'punct')


In [26]:
doc = nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
doc.sentences[0].print_dependencies()

('Barack', 4, 'nsubj:pass')
('Obama', 1, 'flat')
('was', 4, 'aux:pass')
('born', 0, 'root')
('in', 6, 'case')
('Hawaii', 4, 'obl')
('.', 4, 'punct')


In [35]:
doc.sentences[0]

[
  {
    "id": 1,
    "text": "Barack",
    "lemma": "Barack",
    "upos": "PROPN",
    "xpos": "NNP",
    "feats": "Number=Sing",
    "head": 4,
    "deprel": "nsubj:pass",
    "misc": "start_char=0|end_char=6",
    "ner": "B-PERSON"
  },
  {
    "id": 2,
    "text": "Obama",
    "lemma": "Obama",
    "upos": "PROPN",
    "xpos": "NNP",
    "feats": "Number=Sing",
    "head": 1,
    "deprel": "flat",
    "misc": "start_char=7|end_char=12",
    "ner": "E-PERSON"
  },
  {
    "id": 3,
    "text": "was",
    "lemma": "be",
    "upos": "AUX",
    "xpos": "VBD",
    "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
    "head": 4,
    "deprel": "aux:pass",
    "misc": "start_char=13|end_char=16",
    "ner": "O"
  },
  {
    "id": 4,
    "text": "born",
    "lemma": "bear",
    "upos": "VERB",
    "xpos": "VBN",
    "feats": "Tense=Past|VerbForm=Part|Voice=Pass",
    "head": 0,
    "deprel": "root",
    "misc": "start_char=17|end_char=21",
    "ner": "O"
  },
  {
    "id": 5

In [20]:
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  
  
example_sent = """This is a sample sentence, 
                  showing off the stop words filtration."""
  
stop_words = set(stopwords.words('english'))  
  
word_tokens = word_tokenize(example_sent)  
  
filtered_sentence = [w for w in word_tokens if not w in stop_words]  
  
filtered_sentence = []  
  
for w in word_tokens:  
    if w not in stop_words:  
        filtered_sentence.append(w)  
  
print(word_tokens)  
print(filtered_sentence)  

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [21]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [1]:
## After cleaning stop words


## Getting Data

## find integers from the text 

## lets get some string data that contains food 
import pandas as pd 
df = pd.read_csv('dataPreprocessedRest.csv')

In [2]:
df.columns

Index(['Unnamed: 0', 'name', 'online_order', 'book_table', 'votes', 'location',
       'dish_liked', 'cuisines', 'approx_cost(for two people)', 'menu_item',
       'rate', 'type', 'reviews1', 'num_reviews'],
      dtype='object')

In [10]:
df['dish_liked'][2].split(',')

['Icecream Cake',
 ' Brownie',
 ' Waffles',
 ' Chocolate Icecreams',
 ' Thick Shakes']

In [15]:
s = df['dish_liked'].dropna().reset_index()

In [22]:
s['dishArr'] = s['dish_liked'].apply(lambda x: list(set(x.split(','))))

In [24]:
s['dish_liked'][0]

'Icecream Cake, Brownie, Waffles, Chocolate Icecreams, Thick Shakes'

In [25]:
len(s['dishArr'][0])

5

In [27]:
df.columns

Index(['Unnamed: 0', 'name', 'online_order', 'book_table', 'votes', 'location',
       'dish_liked', 'cuisines', 'approx_cost(for two people)', 'menu_item',
       'rate', 'type', 'reviews1', 'num_reviews'],
      dtype='object')

In [29]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,online_order,book_table,votes,location,dish_liked,cuisines,approx_cost(for two people),menu_item,rate,type,reviews1,num_reviews
0,0,Unique Brew Cafe Resto,No,No,0,Indiranagar,,Fast Food,200.0,[],3.9,Dine-out & Desserts,[],0
1,1,Jayanthi Sagar,No,No,21,Koramangala 5th Block,,"South Indian, North Indian, Chinese",200.0,[],3.1,Dine-out & Desserts,['Rated 2.0 RATEDn Works only because its che...,13
2,2,Rock Stone Ice Cream Factory,Yes,No,131,BTM,"Icecream Cake, Brownie, Waffles, Chocolate Ice...",Ice Cream,230.0,"['Midnight Indulgence Cake', 'Butterscotch Mel...",4.0,Delivery,['Rated 4.0 RATEDn Ice creams are really tast...,8
3,3,Punjabi by Nature 2.0,No,No,3236,BTM,"Paneer Tikki, Mutton Raan, Mango Margarita, Cr...",North Indian,,[],4.2,Delivery,['Rated 3.0 RATEDn It has a beautiful ambianc...,139
4,4,Rayalaseema Chefs,Yes,Yes,225,Marathahalli,"Bamboo Chicken, Butter Naan, Mutton Biryani, P...","North Indian, Biryani, Andhra, Chinese",800.0,[],3.9,Delivery,['Rated 5.0 RATEDn Had Good experience with t...,4


In [1]:
## Food library NLTK 

from nltk.corpus import wordnet as wn
food = wn.synset('food.n.02')


In [8]:
foodList = list(set([w for s in food.closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))

In [7]:
foodList

['round_steak',
 'offal',
 'Nova_lox',
 'blade_roast',
 "calves'_feet",
 'Sally_Lunn',
 'top_round',
 'Berlin_doughnut',
 'sinker',
 'sugar_snap_pea',
 'roast_lamb',
 'fortune_cookie',
 'grape',
 'hindquarter',
 'cherimolla',
 'Concord_grape',
 'bread',
 'grey_mullet',
 'cauliflower',
 'shadberry',
 'leek',
 'Bavarian_blue',
 'brick_cheese',
 'ruggelach',
 'sour_bread',
 'babka',
 'currant',
 'corn_dab',
 'bratwurst',
 'scuppernong',
 'fresh_food',
 'Nova_Scotia_lox',
 'soft_roll',
 'black-eyed_pea',
 'Catawba',
 'smelt',
 'tagliatelle',
 'winkle',
 'Vienna_sausage',
 'beigel',
 'crab',
 'sirloin',
 'loaf',
 'Yellow_Delicious',
 'fritter',
 'frumenty',
 'pea',
 'rack',
 "farmer's_cheese",
 'wedding_cake',
 'dandelion_green',
 'bay_scallop',
 'soy',
 'luncheon_meat',
 'baba_au_rhum',
 'anchovy_pear',
 'puffed_rice',
 'red_onion',
 'runner_bean',
 'smallmouth_bass',
 'banana_bread',
 'beef_roast',
 'honeydew',
 'tenderloin',
 'side',
 'finnan_haddie',
 'flapjack',
 'journey_cake',
 'scol

In [None]:
### Output: {} with keys (location, budget, ambience, misc)