In [1]:
import json
import csv

from pprint import pprint

import pandas as pd

# from py2neo import Graph

from openie import StanfordOpenIE
from extract_entity import use_nltk_ner, use_spacy_ner, use_stanford_ner, use_allen_ner
from extract_relation import AllanRE
from resolve_coreference import AllenCR, StanfordCR, SpacyCR
from utils.spell_check import levenshtein_ratio_and_distance

from IPython.core.display import display, HTML
from libs.gpr_pub import visualization

In [2]:
''' Initialize variables '''
# Input: structured restaurant dataset (JSON). The 20-restaurant path is kept as an alternative.
# dataset = r"data/input/restaurant_20_dataset.json"
dataset = r"data/input/data_100/restaurant_dataset.json"
# Output: (subject, relation, object) rows collected while populating the graph.
relation_triplets_file = r"data/output/kg/relation_triplets.csv"

# Output: flat CSV exports of users, restaurants and individual ratings.
users_csv = r"data/output/kg/users.csv"
restaurants_csv = r"data/output/kg/restaurants.csv"
ratings_csv = r"data/output/kg/ratings.csv"

In [3]:
''' open restaurant dataset '''
# The context manager closes the handle even if json.load raises on malformed input.
with open(dataset) as file:
    restaurant_data = json.load(file)

# check data sample
# restaurant = restaurant_data['restaurants'][0]
# print(restaurant)

In [None]:
''' Test GraphDB '''
# from module.neo4j.graph_db import GraphDB
# graph = GraphDB("bolt://localhost:7687", "neo4j", "erclab")
# test connection
# graph.run("UNWIND range(1, 3) AS n RETURN n, n * n as n_sq")
# graph.print_greeting("hello, world")
# graph.close()

In [None]:
###############################################################################################################
# Making tables in the Graph database from Yelp structured json dataset                                       #
#                                                                                                             #
# DB Schema is defined as follows:                                                                            #
#                                                                                                             #
# ##################  ##################  ############  ############  ##########  ###########  #############  #
# # `Restaurant`   #  # `User`         #  # `Aspect` #  # `Review` #  # `City` #  # `State` #  # `Country` #  #
# ##################  ##################  ############  ############  ##########  ###########  #############  #
# # - ID           #  # - ID           #  # - ID     #  # - ID     #  # - ID   #  # - ID    #  # - ID      #  #
# # - name         #  # - name         #  # - name   #  # - text   #  # - name #  # - name  #  # - name    #  #
# # - address      #  # - review_count #  ############  ############  ##########  ###########  #############  #
# # - postal_code  #  ##################                                                                      #
# # - rating       #                                                                                          #
# ##################                                                                                          #
#                                                                                                             #
###############################################################################################################

##################################################
#                                                #
# make triples from yelp structured json data    #
#                                                #
# User       -> HAS_FRIEND         -> User       #
# User       -> WRITE_REVIEW{date} -> Review     #
# User       -> VISIT{count}       -> Restaurant #
# User       -> RATE{star}         -> Restaurant #
# Restaurant -> HAS_REVIEW         -> Review     #
# Restaurant -> HAS_CATEGORY       -> Category   #
# Restaurant -> LOCATED_IN         -> City       #
# City       -> LOCATED_IN         -> Country    #
#                                                #
# (State nodes/relations are currently disabled) #
#                                                #
##################################################
from module.neo4j.graph_db import GraphDB

# graph = GraphDB("bolt://localhost:7687", "neo4j", "erclab")
# NOTE(review): the bolt URI and credentials are hard-coded; consider reading them from
# configuration / environment variables before sharing this notebook.
graph = GraphDB("bolt://localhost:11012", "neo4j", "erclab")

# Accumulators that the CSV-export cell reads after this cell has run.
relation_triplets = []
users_list = []
restaurants_list = []
ratings_list = []

# try/finally: release the DB connection even when an insert fails part-way through.
try:
    # Inserting users
    users = restaurant_data['users']
    print("inserting users in DB")
    print("Total users: "+str(len(users)))
    for i, user in enumerate(users):
        graph.create_user(user["user_id"], user["name"], user["gender"], user["age"], user["fans"],
                          user["review_count"], user["average_stars"])

        # The running index `i` becomes the 'id' column of users.csv; friends are '|'-joined.
        users_list.append([i, user["user_id"], user["name"], user["gender"], user["age"], user["fans"],
                           user["review_count"], user["average_stars"], "|".join(user["friends"])])

    # Friend relations are inserted in a second pass, after every user has been created.
    for user in users:
        for friend in user['friends']:
            # insert relation to GraphDB
            graph.create_user_has_friend_relation(user["user_id"], friend)
            # insert relation to triplet list
            relation_triplets.append([user["user_id"], 'HAS_FRIEND', friend])

    # Inserting restaurants
    restaurants = restaurant_data['restaurants']
    print("Inserting restaurants in DB")
    print("Total restaurants: "+str(len(restaurants)))
    for i, restaurant in enumerate(restaurants):
        rest_id = restaurant["rest_id"]

        # Photos are stored as a bracketed, comma-separated list of photo ids ("[]" when none).
        photo_ids = [photo['photo_id'] for photo in (restaurant['photos'] or [])]
        restaurant_photos = "["+",".join(photo_ids)+"]"

        graph.create_restaurant(rest_id, restaurant["name"], restaurant["address"],
                                restaurant["postal_code"], restaurant["rating"], restaurant_photos)

        # The generic 'Restaurants' category is dropped everywhere (CSV and graph).
        categories = [cat.strip() for cat in restaurant["category"].split(",")]
        categories = [cat for cat in categories if cat != "Restaurants"]

        restaurants_list.append([i, rest_id, restaurant["name"], restaurant["postal_code"],
                                 restaurant["city"], restaurant["state"], restaurant["country"],
                                 restaurant["rating"], "|".join(categories)])

        # Inserting categories
        for category in categories:
            # Category id: title-cased words joined by underscores.
            category_id = "_".join(t.title() for t in category.split())
            graph.create_category(category_id, category.title())
            graph.create_restaurant_has_category_relation(rest_id, category_id)
            relation_triplets.append([rest_id, 'HAS_CATEGORY', category_id])

        # Inserting reviews
        reviews = restaurant['reviews']

        print(f"[{i}] Inserting reviews for restaurant {rest_id} - {restaurant['name']} in DB")
        print("Total reviews: "+str(len(reviews)))

        # One pass over the reviews: user_id -> (visit count, rating total) for this restaurant.
        # Ratings are summed in review order, so the averages match a per-review rescan.
        user_stats = {}
        for rev in reviews:
            count, total = user_stats.get(rev["user_id"], (0, 0))
            user_stats[rev["user_id"]] = (count + 1, total + rev["rating"])

        for review in reviews:
            graph.create_review(review["review_id"], review["text"])

            ratings_list.append([review["user_id"], rest_id, review["rating"], review["date"]])

            # visit count and average rating of this user for this restaurant
            visit_count, total_rating = user_stats[review["user_id"]]
            avg_rating = total_rating/visit_count

            # Add reviews relations to GraphDB
            graph.create_restaurant_has_review_relation(rest_id, review["review_id"])
            graph.create_user_write_review_relation(review["user_id"], review["review_id"], review["date"])
            graph.create_user_visit_restaurant_relation(review["user_id"], rest_id, visit_count)
            graph.create_user_rate_restaurant_relation(review["user_id"], rest_id, avg_rating)

            # Add reviews relations to triplet list
            relation_triplets.append([rest_id, 'HAS_REVIEW', review["review_id"]])
            relation_triplets.append([review["user_id"], 'WRITE_REVIEW{date:'+str(review["date"])+'}',
                                      review["review_id"]])
            relation_triplets.append([review["user_id"], 'VISIT{count:'+str(visit_count)+'}', rest_id])
            relation_triplets.append([review["user_id"], 'RATE{star:'+str(avg_rating)+'}', rest_id])

        # Inserting city, country with relationships
        # State is intentionally left out (only city and country are used)
        city, country = restaurant["city"], restaurant["country"]
        unique_country = "_".join(country.split())
        # unique_state = f'{state}-{unique_country}'
        # unique_city = f'{city}-{state}-{unique_country}'
        unique_city = f'{"_".join(city.split())}-{unique_country}'

        print(f"Inserting city, country for restaurant {rest_id} - {restaurant['name']} in DB")
        graph.create_city(unique_city, city)
        # graph.create_state(unique_state, state)
        graph.create_country(unique_country, country)

        # Add City Country relations to GraphDB
        graph.create_restaurant_city_relation(rest_id, unique_city)
        # graph.create_city_state_relation(unique_city, unique_state)
        graph.create_city_country_relation(unique_city, unique_country)

        # Add City Country relations to triplet list
        relation_triplets.append([rest_id, 'LOCATED_IN', unique_city])
        relation_triplets.append([unique_city, 'LOCATED_IN', unique_country])
finally:
    graph.close()

In [None]:
''' Store relation_triplets to csv file '''
# Each accumulator filled by the graph-population cell is written as one CSV without the index.
relation_triplets_df = pd.DataFrame(relation_triplets, columns=['subject', 'relation', 'object'])
relation_triplets_df.to_csv(relation_triplets_file, index=False)

# 'id' is the running index assigned while inserting users; 'friends' is a '|'-joined string.
users_df = pd.DataFrame(users_list, columns=['id', 'user_id', 'name', 'gender', 'age', 'fans', 'review_count',
                                             'average_stars', 'friends'])
users_df.to_csv(users_csv, index=False)

# 'category' is a '|'-joined string with the generic 'Restaurants' entry removed.
restaurants_df = pd.DataFrame(restaurants_list, columns=['id', 'rest_id', 'name', 'postal_code', 'city', 'state',
                                                         'country', 'rating', 'category'])
restaurants_df.to_csv(restaurants_csv, index=False)

# One row per review: the raw per-review rating, not the per-user average.
ratings_df = pd.DataFrame(ratings_list, columns=['user_id', 'rest_id', 'rating', 'date'])
ratings_df.to_csv(ratings_csv, index=False)

In [None]:
import glob
import argparse
import logging
import pickle
import re
import json
import csv
from pprint import pprint

import pandas as pd

import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.util import filter_spans
from openie import StanfordOpenIE
from extract_entity import use_nltk_ner, use_spacy_ner, use_stanford_ner, use_allen_ner
from extract_relation import AllanRE
from resolve_coreference import AllenCR, StanfordCR, SpacyCR
from utils.spell_check import levenshtein_ratio_and_distance


from IPython.core.display import display, HTML
from libs.gpr_pub import visualization

In [None]:
# credit: https://github.com/wang-h/bert-relation-classification/blob/master/utils.py
def clean_str(text):
    """Normalize raw review text with a fixed, ordered list of regex substitutions.

    The rules run strictly in the order listed; later rules see the output of
    earlier ones. Matching is case-sensitive.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Runs of whitespace are collapsed to one space, but
        leading/trailing single spaces are not stripped.
    """
    ordered_rules = (
        # Replace every character outside the allowed set with a space. Note that
        # `+-=` inside the class is a *range* ('+' .. '='), so ':' ';' '<' are kept.
        (r"[^A-Za-z0-9^,!.\/'+-=<>]", " "),
        # All digits are blanked here, so the digit-based rules below cannot match.
        (r"[0-9]", " "),
        # Contractions (specific forms before the generic "'s" / "n't" rules).
        (r"what's", "what is "),
        (r"that's", "that is "),
        (r"there's", "there is "),
        (r"it's", "it is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Punctuation: commas and slashes vanish, operators get padded with spaces.
        # (Full stops are deliberately kept.)
        (r",", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        # Re-join tokens that the padding above split apart.
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        # Finally collapse whitespace runs.
        (r"\s{2,}", " "),
    )
    for pattern, replacement in ordered_rules:
        text = re.sub(pattern, replacement, text)
    return text

def replace_subject_entity(text, entity):
    """Replace the first-person pronouns "I", "i", "We" and "we" with `entity`.

    Only whole words are replaced (word boundaries on both sides).

    Args:
        text: The text to rewrite.
        entity: The literal replacement string (e.g. the reviewer's name).

    Returns:
        `text` with every matching pronoun replaced by `entity`.
    """
    # A callable replacement inserts `entity` verbatim. Passing the string directly
    # would make `re.sub` treat backslashes in a name as escapes / group references.
    return re.sub(r"\b(?:I|i|We|we)\b", lambda _match: entity, text)

# Clean reviews text
def clean_reviews(restaurants_data, save_clean_data=False):
    """Clean every review text in place and optionally dump the result to JSON.

    For each review, the text is passed through `clean_str` and then first-person
    pronouns are replaced with the review's "name" value.

    Args:
        restaurants_data: Iterable of restaurant dicts, each with a 'reviews' list
            whose items carry "text" and "name" keys. Mutated in place.
        save_clean_data: When true, write {'restaurants': restaurants_data} to
            'data/input/cleaned_reviews.json'.

    Returns:
        None.
    """
    for entry in restaurants_data:
        for rev in entry['reviews']:
            rev["text"] = clean_str(rev["text"])
            rev["text"] = replace_subject_entity(rev["text"], rev["name"])

    if not save_clean_data:
        return

    payload = {'restaurants': restaurants_data}
    with open('data/input/cleaned_reviews.json', 'w') as outfile:
        json.dump(payload, outfile)
        
    
def split_sentence(text, nlp=None):
    '''
    splits review into a list of sentences using spacy's sentence parser

    Args:
        text: The review text to split.
        nlp: Optional, already-loaded spaCy pipeline. When omitted, 'en_core_web_sm'
            is loaded for this call (slow if done repeatedly; pass a shared pipeline
            when calling in a loop).

    Returns:
        A list with one spaCy span per sentence, in document order.
    '''
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
    review = nlp(text)
    # Use the parser's own sentence spans. The previous manual slicing dropped the
    # last token of every sentence and emitted a bogus leading span for the first token.
    return list(review.sents)


def triple_pruning(triples, ner_dict):
    """Convert raw (subject, relation, object) rows into de-duplicated node triples.

    Args:
        triples: DataFrame with 'subject', 'relation' and 'object' columns.
            It is not modified.
        ner_dict: Named-entity dictionary. Currently unused (the entity-based
            filtering is disabled); kept for interface compatibility.

    Returns:
        DataFrame with columns ['Type1', 'Entity1', 'Relationship', 'Type2',
        'Entity2'], where both types are the literal 'Node', Entity1 is the
        whitespace-stripped subject, and duplicate rows are dropped.
    """
    # Build the rows from column values instead of assigning into the Series yielded
    # by iterrows(): that write may or may not propagate to the caller's frame.
    final_triples = [
        ('Node', subject.strip(), relation, 'Node', obj)
        for subject, relation, obj in zip(triples['subject'], triples['relation'], triples['object'])
    ]

    triple_df = pd.DataFrame(
        final_triples, columns=['Type1', 'Entity1', 'Relationship', 'Type2', 'Entity2']
    ).drop_duplicates()
    return triple_df

In [None]:
# Dependency labels (compared against `tok.dep_`) accepted as a subject by get_subject_compound.
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
# Dependency labels accepted as an object by get_object_compound.
OBJECTS = ["dobj", "dative", "attr", "oprd"]
# Dependency labels accepted as a complement by get_attribute_compound.
ATTRS = ["acomp"]
# Part-of-speech tags (compared against `tok.pos_`) allowed on either side of a possession triple.
POSS_NOUN = ["NOUN", "PROPN", "X"]


def get_compounds(tokens):
    """Return the left children of `tokens` whose dependency label is 'compound'.

    Args:
        tokens: A single token-like object exposing `.lefts` (iterable of children
            with a `.dep_` attribute).

    Returns:
        A new list of the matching children, in `.lefts` order.
    """
    return [left for left in tokens.lefts if left.dep_ == 'compound']


def get_subject_compound(v):
    """Return the subject phrase tokens attached to the left of `v`.

    A left child qualifies when its dependency label is in SUBJECTS and its
    part-of-speech is not "DET". When several children qualify, the last one wins.

    Returns:
        The qualifying child's compound modifiers followed by the child itself,
        or an empty list when no child qualifies.
    """
    head = None
    for left in v.lefts:
        if left.dep_ in SUBJECTS and left.pos_ != "DET":
            head = left

    if head is None:
        return []
    return get_compounds(head) + [head]


def get_object_compound(v):
    """Return the object phrase tokens attached to the right of `v`.

    A right child qualifies when its dependency label is in OBJECTS. When
    several children qualify, the last one wins.

    Returns:
        The qualifying child's compound modifiers followed by the child itself,
        or an empty list when no child qualifies.
    """
    head = None
    for right in v.rights:
        if right.dep_ in OBJECTS:
            head = right

    if head is None:
        return []
    return get_compounds(head) + [head]


def get_attribute_compound(av):
    """Return the complement phrase tokens attached to the right of `av`.

    A right child qualifies when its dependency label is in ATTRS. When several
    children qualify, the last one wins.

    Returns:
        The qualifying child's compound modifiers followed by the child itself,
        or an empty list when no child qualifies.
    """
    head = None
    for right in av.rights:
        if right.dep_ in ATTRS:
            head = right

    if head is None:
        return []
    return get_compounds(head) + [head]


def get_prep_object_compound(p):
    """Return the prepositional-object phrase tokens attached to the right of `p`.

    A right child qualifies when its dependency label is 'pobj'. When several
    children qualify, the last one wins.

    Returns:
        The qualifying child's compound modifiers followed by the child itself,
        or an empty list when no child qualifies.
    """
    head = None
    for right in p.rights:
        if right.dep_ == 'pobj':
            head = right

    if head is None:
        return []
    return get_compounds(head) + [head]


def get_prep(p):
    """Return the last right child of `p` labelled 'prep', or False if there is none."""
    prep_children = [right for right in p.rights if right.dep_ == 'prep']
    return prep_children[-1] if prep_children else False

nlp = spacy.load('en_core_web_sm')
def get_lexical_triplets_pairs(text_doc, verbose=False):
    """Extract rule-based (subject, relation, object) triplets from `text_doc`.

    The text is parsed with the module-level `nlp` pipeline (looked up at call
    time, so rebinding `nlp` in another cell changes the parser used here).
    Four passes append triplets in this order:

    1. subject - verb - object            (relation is the verb token itself)
    2. subject - "<verb>_<prep>" - prepositional object   (relation is a str)
    3. possessor - 'has' - possessed noun                 (relation is a str)
    4. subject - auxiliary - complement   (relation is the auxiliary token itself)

    Args:
        text_doc: Raw text to analyse.
        verbose: Unused.

    Returns:
        A list of 3-item lists [subject_text, relation, object_text].
    """
    def _phrase(tokens):
        # Space-joined surface text of a token sequence.
        return ' '.join(str(tok) for tok in tokens)

    parsed = nlp(text_doc)
    triplets = []

    verb_tokens = [tok for tok in parsed if tok.pos_ == "VERB"]

    # Pass 1: sub, verb, obj  (e.g. Angela, visit, silver spoon restaurant)
    for verb in verb_tokens:
        subject_toks = get_subject_compound(verb)
        object_toks = get_object_compound(verb)
        if subject_toks and object_toks:
            triplets.append([_phrase(subject_toks), verb, _phrase(object_toks)])

    # Pass 2: sub, verb_prep, p_obj  (e.g. Angela, visit_with, Angela's friends)
    for verb in verb_tokens:
        subject_toks = get_subject_compound(verb)
        prep = get_prep(verb)
        prep_object_toks = get_prep_object_compound(prep) if prep else False
        if subject_toks and prep_object_toks:
            triplets.append([_phrase(subject_toks), str(verb)+'_'+str(prep), _phrase(prep_object_toks)])

    # Pass 3: sub, possession, obj  (e.g. Silver spoon restaurant, has, Chicken biryani)
    noun_tokens = [tok for tok in parsed if tok.pos_ in POSS_NOUN]
    for noun in noun_tokens:
        for child in list(noun.children):
            if child.dep_ != 'poss' or child.pos_ not in POSS_NOUN:
                continue
            owner_toks = get_compounds(child) + [child]
            owned_toks = get_compounds(noun) + [noun]
            if owner_toks and owned_toks:
                triplets.append([_phrase(owner_toks), 'has', _phrase(owned_toks)])

    # Pass 4: sub, aux, complementary object  (e.g. food, are, good)
    aux_tokens = [tok for tok in parsed if tok.pos_ == "AUX"]
    for aux in aux_tokens:
        subject_toks = get_subject_compound(aux)
        complement_toks = get_attribute_compound(aux)
        if subject_toks and complement_toks:
            triplets.append([_phrase(subject_toks), aux, _phrase(complement_toks)])

    return triplets

In [None]:
from transformers import BertModel, BertForTokenClassification, BertTokenizer
import torch
import torch.nn as nn

class MenuNER:
    """Menu (food item) entity extractor backed by a BERT token-classification model."""

    def __init__(self,
                 model_dir="/home/muzamil/Projects/Python/ML/NLP/NER/BERT-NER/out_ner/",
                 tokenizer_dir="/home/muzamil/Projects/Python/ML/NLP/NER/MenuNER/model/FoodieBERT/"
                               "cased_L-12_H-768_A-12"):
        """Load the classifier and tokenizer.

        Args:
            model_dir: Directory passed to `BertForTokenClassification.from_pretrained`.
            tokenizer_dir: Directory passed to `BertTokenizer.from_pretrained`.
                Both default to the original developer-machine paths; override them
                when running elsewhere.
        """
        self.model = BertForTokenClassification.from_pretrained(model_dir)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

    def extract_menu_ner(self, restaurants, write_results=False):
        """Run menu extraction over every review of every restaurant.

        Args:
            restaurants: Dict with a 'restaurants' list; each restaurant has a
                'reviews' list whose items carry a 'text' string.
            write_results: When true, write the merged {entity: label} mapping as CSV.

        Returns:
            None.
        """
        menu_ner = dict()
        for restaurant in restaurants['restaurants']:
            for review in restaurant['reviews']:
                sequence = review['text']
                predict = self._extract_menu_ner(self.model, self.tokenizer, sequence)
                menu_ner.update(predict)

        if write_results:
            # `menu_extracted_file` and `entity_headers` are read from module globals.
            # NOTE(review): `menu_extracted_file` is not defined in the visible part of
            # this notebook - confirm it exists before calling with write_results=True.
            # `with` closes (and flushes) the file; newline='' is what the csv module expects.
            with open(menu_extracted_file, 'w', newline='') as out_file:
                writer = csv.DictWriter(out_file, fieldnames=entity_headers)
                writer.writeheader()
                for key, value in menu_ner.items():
                    writer.writerow({entity_headers[0]: key,
                                     entity_headers[1]: value})

    def extract_menu_ner_single(self, sequence):
        """Extract menu entities from one text.

        Returns:
            Dict mapping each title-cased entity string to "MENU".
        """
        return self._extract_menu_ner(self.model, self.tokenizer, sequence)

    @staticmethod
    def _extract_menu_ner(model, tokenizer, sequence):
        """Tag `sequence` and merge consecutive MENU word pieces into entities.

        Args:
            model: Callable returning a tuple whose first item is the logits tensor.
            tokenizer: Tokenizer providing `encode`, `decode` and `tokenize`.
            sequence: The text to tag.

        Returns:
            Dict mapping each title-cased entity to "MENU"; empty when the encoded
            input is longer than 512 tokens.
        """
        # Label names are looked up with (predicted id - 1); ids above 3 are skipped.
        # NOTE(review): this presumably mirrors a 1-based label map with extra
        # special-token labels in the fine-tuned model - confirm against its config.
        label_list = [
            "O",       # Outside of a named entity
            "B-MENU",  # Beginning of a menu entity
            "I-MENU",  # menu entity
        ]

        # Bit of a hack to get the tokens with the special tokens
        tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence, max_length=512, truncation=True)))
        inputs = tokenizer.encode(sequence, return_tensors="pt")

        predict = {}
        if inputs.size()[1] > 512:
            return predict

        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            outputs = model(inputs)[0]
        predictions = torch.argmax(outputs, dim=2)

        full_token = ''

        for token, prediction in zip(tokens, predictions[0].tolist()):
            if token == '[CLS]' or token == '[SEP]' or prediction > 3:
                continue

            # Guard the lower bound: id 0 must not wrap around to label_list[-1].
            is_menu = prediction >= 1 and label_list[prediction - 1] in ("B-MENU", "I-MENU")

            if token.startswith('##') and (is_menu or full_token):
                # Word-piece continuation: glue onto the entity being built.
                full_token = full_token + token.replace("##", "")
            elif is_menu:
                full_token = full_token + " " + token if full_token else token
            elif full_token:
                # First non-menu whole word after an entity closes it.
                predict[full_token] = "MENU"
                full_token = ''

        # Flush an entity that runs up to the end of the sequence.
        if full_token:
            predict[full_token] = "MENU"

        # Title-case every entity string.
        return {key.title(): value for key, value in predict.items()}

In [7]:
# Load MenuNER model to extract Food entities
# NOTE: this import rebinds the name `MenuNER`, shadowing the class defined in the notebook cell
# above; from here on `menu_ner` is an instance of the packaged module's class.
from module.ner.menu_ner import MenuNER

menu_ner = MenuNER()
# Two-sentence smoke test; the result is a dict mapping each extracted menu item to 'MENU'.
review_text = "Nice place busy during lunch time. " \
              "The chicken biryani is super butter chicken is very authentic."
result = menu_ner.extract_menu_ner_single(review_text)
print(result)


{'chicken biryani': 'MENU', 'butter chicken': 'MENU'}


In [11]:
# 1
# Output folders, built by plain string concatenation (each prefix ends with '/').
output_path = "./data/output/"
ner_pickles_op = output_path + "ner/"
cr_pickles_op = output_path + "cr/"

''' read restaurant knowledgebase data '''
# Knowledge-base entity CSV files; all live under `knowledge_base_entities`.
knowledge_base_entities = r"data/kb/entities/"
kb_restaurant_file = knowledge_base_entities + "restaurant.csv"
kb_menu_file = knowledge_base_entities + "menu.csv"
kb_menu_file1 = knowledge_base_entities + "menu1.csv"
kb_general_file = knowledge_base_entities + "general.csv"
kb_restaurant_aspects_file = knowledge_base_entities + "restaurant_aspects.csv"
kb_menu_aspects_file = knowledge_base_entities + "menu_attrs.csv"

# Column names shared by every entity CSV: entity text and its label.
entity_headers = ['Name', 'Label']

In [None]:
# 2
''' Add restaurants to `kb_restaurant_file` '''

# One (name, 'RESTAURANT') row per restaurant in the loaded dataset.
data = [(restaurant['name'], 'RESTAURANT') for restaurant in restaurant_data['restaurants']]

# Drop repeated names before overwriting the knowledge-base file.
rest_data_df = pd.DataFrame(data, columns=entity_headers).drop_duplicates()
rest_data_df.to_csv(kb_restaurant_file, index=False)

In [None]:
# 3
''' Fixed some MENU entiies manually '''

''' Remove Duplicate entities from MENU file '''
# Read, drop fully identical rows, and write back to the same file.
df = pd.read_csv(kb_menu_file, sep=",").drop_duplicates()
df.to_csv(kb_menu_file, index=False)

In [12]:
# 4
''' make dataframes for kb '''
# header=0 consumes each file's first row as the header; `names` then relabels the columns
# with `entity_headers`, whatever the file's own header says.
kb_restaurant_df = pd.read_csv(kb_restaurant_file, header=0, names=entity_headers)
kb_menu_df = pd.read_csv(kb_menu_file, header=0, names=entity_headers)
# kb_general_df = pd.read_csv(kb_general_file, header=0, names=['Name', 'Label'])
kb_restaurant_aspects_df = pd.read_csv(kb_restaurant_aspects_file, header=0, names=entity_headers)
kb_menu_aspects_df = pd.read_csv(kb_menu_aspects_file, header=0, names=entity_headers)
# print(kb_restaurant_aspects_df)

In [None]:
''' Insert Menus in Neo4J database '''
kb_menu_df_distinct = kb_menu_df.drop_duplicates()

# NOTE(review): this cell connects to port 7687 while the graph-population cell uses 11012 -
# confirm both point at the intended database. Credentials are hard-coded as well.
graph = GraphDB("bolt://localhost:7687", "neo4j", "erclab")

# try/finally: always release the connection, even if an insert fails.
try:
    for menu_name in kb_menu_df_distinct['Name']:
        # Menu id: lower-cased words joined by underscores.
        menu_id = "_".join(menu_name.lower().split())

        # Inserting menus
        graph.create_menu(menu_id, menu_name)
finally:
    graph.close()

In [None]:
''' Insert Aspects in Neo4J database '''
kb_restaurant_aspects_df_distinct = kb_restaurant_aspects_df.drop_duplicates()

# NOTE(review): this cell connects to port 7687 while the graph-population cell uses 11012 -
# confirm both point at the intended database. Credentials are hard-coded as well.
graph = GraphDB("bolt://localhost:7687", "neo4j", "erclab")

# try/finally: always release the connection, even if an insert fails.
try:
    for aspect_name in kb_restaurant_aspects_df_distinct['Name']:
        # Aspect id: lower-cased words joined by underscores.
        aspect_id = "_".join(aspect_name.lower().split())

        # Inserting aspects
        graph.create_aspect(aspect_id, aspect_name)
finally:
    graph.close()

In [None]:
# 5
# NOTE: this import rebinds `clean_reviews`, shadowing the function of the same name defined
# earlier in this notebook; the packaged version is the one that runs below.
from module.processor.text_processor import clean_reviews
# Clean reviews text
# NOTE(review): the notebook-defined version mutates the review dicts in place and writes
# data/input/cleaned_reviews.json; the imported one presumably does the same - confirm.
clean_reviews(restaurant_data['restaurants'], save_clean_data=True)

In [None]:
# 6
''' open cleaned restaurant dataset '''
restaurant_cleaned_reviews_file = r"data/input/cleaned_reviews.json"
# The context manager closes the handle even if json.load raises.
with open(restaurant_cleaned_reviews_file) as file:
    restaurant_cleaned_reviews = json.load(file)

In [None]:
# 7
''' coreference resolution '''
# Rewrites review["text"] in place for every review in `restaurant_cleaned_reviews`.
allen_cr = AllenCR(True)
# NOTE: this rebinds the module-level `nlp`, which get_lexical_triplets_pairs looks up at call
# time - after this cell runs, that function parses with the pipeline returned here.
predictor, nlp = allen_cr.load_models()

rest_index=0
rest_count = len(restaurant_cleaned_reviews['restaurants'])
for restaurant in restaurant_cleaned_reviews['restaurants']:

    rest_index += 1
    print("Processing Restaurant "+str(rest_index)+"/"+str(rest_count))
    
    reviews = restaurant['reviews']
    rev_index=0
    rev_count = len(reviews)
    for review in reviews:
        
        rev_index +=1
        print("\t Processing Review "+str(rev_index)+"/"+str(rev_count))
        
        # Prepend "<reviewer> visit <restaurant>." so both names are present in the text
        # handed to the coreference predictor.
        dummy_sentence = review["name"] + " visit " + restaurant["name"] + "."
        doc = dummy_sentence + " " + review["text"]
#         print(review["text"])
        clusters = predictor.predict(doc)['clusters']
        nlp_doc = nlp(doc)
        coref_resolved = allen_cr.improved_replace_corefs(nlp_doc, clusters)
        
        ## split doc into sentences and remove first sentence
        #     # nlp_small.add_pipe(nlp_small.create_pipe('sentencizer'))
        nlp_doc = nlp(coref_resolved)
        # NOTE(review): `Span.string` looks like the spaCy 2.x API - confirm the installed version.
        sentences = [sent.string.strip() for sent in nlp_doc.sents]
        ## remove dummy_sentence
        # Assumes the dummy sentence is segmented as exactly the first sentence - TODO confirm.
        sentences.pop(0)
        rev = " ".join([sent for sent in sentences])
        review["text"] = rev
        
#         print('resolved')
#         print(review['text'])

# print(restaurant_cleaned_reviews['restaurants'][0]['reviews'][0]['text'])

In [None]:
# 8
# Persist the coreference-resolved reviews under the same {'restaurants': [...]} layout.
rest_obj = dict()
rest_obj['restaurants'] = restaurant_cleaned_reviews['restaurants']
# NOTE(review): the next cell reads data/input/data_100/coref_resolved_reviews.json, which is a
# different path from the one written here - confirm which file is meant to be used.
with open('data/input/coref_resolved_reviews.json', 'w') as outfile:
    json.dump(rest_obj, outfile)

In [None]:
# 9
''' open coref_resolved_reviews restaurant dataset '''
# NOTE(review): the previous cell writes data/input/coref_resolved_reviews.json, not this
# data_100/ path - confirm this file is the intended input.
restaurant_coref_resolved_reviews_file = r"data/input/data_100/coref_resolved_reviews.json"
# The context manager closes the handle even if json.load raises.
# Rebinds `restaurant_cleaned_reviews` to the coreference-resolved data.
with open(restaurant_coref_resolved_reviews_file) as file:
    restaurant_cleaned_reviews = json.load(file)

In [None]:
def _read_entity_dict(csv_path):
    """Read a (name, label) CSV into a dict, skipping the header row.

    Args:
        csv_path: Path of a CSV file whose first row is a header and whose first two
            columns are the entity name and its label.

    Returns:
        Dict mapping name -> label. Later rows overwrite earlier ones with the same
        name; blank or single-column rows are ignored.
    """
    entities = {}
    with open(csv_path, mode='r', newline='') as infile:
        reader = csv.reader(infile)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if len(row) < 2:
                # Blank lines come back as [] from csv.reader; skip them instead of crashing.
                continue
            entities[row[0]] = row[1]
    return entities


''' read restaurant NER dictionary '''
kb_restaurant_dict = _read_entity_dict(kb_restaurant_file)

''' read restaurant aspects NER dictionary '''
kb_restaurant_aspects_dict = _read_entity_dict(kb_restaurant_aspects_file)

''' read menu NER dictionary '''
kb_menu_dict = _read_entity_dict(kb_menu_file)

# kb_menu_aspects_dict = {}
kb_general_dict = {}

In [13]:
# 10
# from module.processor.relation_processor import TripletProcessor
from module.processor.relation_processor import TripletProcessor
triplet_processor = TripletProcessor()
# Hand the processor the three knowledge-base CSV paths (restaurants, restaurant aspects, menus).
triplet_processor.init_kb_dict(kb_restaurant_file, kb_restaurant_aspects_file,  kb_menu_file)
triplet_processor.init_lexical_analyzer()

In [None]:
# Directory holding negative-words.txt / positive-words.txt (the trailing '/' is required:
# load_opinion_lexicon concatenates file names onto it).
lexicon_file_path = 'data/opinion_lexicon-en/'

def load_opinion_lexicon(lexicon_dir=None):
    """Load the negative and positive opinion word lists.

    Args:
        lexicon_dir: Directory containing 'negative-words.txt' and
            'positive-words.txt'. Defaults to the module-level `lexicon_file_path`.

    Returns:
        A tuple (opinion_words, pos, neg): `neg` and `pos` hold every line of the
        respective file with surrounding whitespace stripped (no lines are
        filtered out), and `opinion_words` is `neg + pos`.
    """
    import os

    if lexicon_dir is None:
        lexicon_dir = lexicon_file_path

    def _read_lines(filename):
        # `with` closes the handle; the files are read as ISO-8859-1.
        with open(os.path.join(lexicon_dir, filename), encoding="ISO-8859-1") as handle:
            return [line.strip() for line in handle]

    neg = _read_lines("negative-words.txt")
    pos = _read_lines("positive-words.txt")
    opinion_words = neg + pos
    return opinion_words, pos, neg

# Load the lexicon once at cell execution; `opinion_words` is the negative list followed by
# the positive list.
opinion_words, pos, neg = load_opinion_lexicon()

def _resolve_menu_term(term, role):
    """Match ``term`` against the menu knowledge base, correcting near-miss spellings.

    An exact (case-insensitive) hit on a ``kb_menu_dict`` key keeps ``term`` as is.
    Otherwise every key whose Levenshtein ratio to ``term`` exceeds 0.85 is tried:
    the term is replaced by that key only when ``menu_ner`` returns a truthy result
    for both the original and the corrected spelling.

    Args:
        term: Stripped subject/object text.
        role: ``"subject"`` or ``"object"``; only used in the log message.

    Returns:
        ``(term, is_menu)`` with ``term`` possibly replaced by the KB spelling.
    """
    lowered = term.lower()
    if any(key.lower() == lowered for key in kb_menu_dict):
        return term, True

    for key in kb_menu_dict:
        ratio = levenshtein_ratio_and_distance(key.lower(), lowered, ratio_calc=True)
        if ratio > 0.85:
            # The probe sentence needs a space before "from"; without it the
            # candidate was glued to the next word ("biryanifrom").
            entity_word = menu_ner.extract_menu_ner_single(
                "I like " + lowered + " from this restaurant.")
            correction_word = menu_ner.extract_menu_ner_single(
                "I like " + key.lower() + " from this restaurant.")
            if entity_word and correction_word:
                print("updating " + role + " [" + term + "] with [" + key + "]")
                return key, True
    return term, False


def _classify_entity(term, is_menu, rest_id, review, ner_dict):
    """Decide which kind(s) of known entity ``term`` is and map it to its node id.

    The checks run as a cascade in which each step is skipped only when the
    step immediately before it matched; this mirrors the original flag logic,
    so an already-mapped id is still compared in later steps.

    Returns:
        ``(term, kinds)`` where ``kinds`` is a set drawn from ``'menu'``,
        ``'restaurant'``, ``'user'``, ``'general'`` and ``'aspect'``. An empty
        set means the term is not a known entity.
    """
    kinds = set()

    if is_menu:
        term = "_".join(term.split())  # menu node ids use underscores
        kinds.add('menu')
    elif any(key.lower() == term.lower() for key in kb_restaurant_dict):
        term = rest_id
        kinds.add('restaurant')

    if 'restaurant' not in kinds and review['name'].lower() == term.lower():
        term = review['user_id']
        kinds.add('user')

    if 'user' not in kinds and any(key.lower() == term.lower() for key in ner_dict):
        kinds.add('general')

    if 'general' not in kinds and any(key.lower() == term.lower()
                                      for key in kb_restaurant_aspects_dict):
        term = "_".join(term.split())
        kinds.add('aspect')

    return term, kinds


def process_triple_pruning(triples, rest_id, review, ner_dict):
    """Keep only the triples whose subject and object are both known entities.

    Relies on the notebook globals ``kb_menu_dict``, ``kb_restaurant_dict``,
    ``kb_restaurant_aspects_dict``, ``opinion_words``, ``menu_ner`` and
    ``levenshtein_ratio_and_distance``; ``menu_ner`` is created in a later cell,
    so that cell must have run before this function is called.

    Args:
        triples: DataFrame with string columns ``subject``, ``relation``, ``object``.
        rest_id: Node id substituted for any mention of the restaurant.
        review: Mapping providing ``name`` (reviewer name) and ``user_id``.
        ner_dict: Mapping whose keys are the general named entities of the review.

    Returns:
        De-duplicated DataFrame with columns ``Type1``, ``Entity1``,
        ``Relationship``, ``Type2``, ``Entity2``. For every kept triple whose
        subject is a menu an extra ``(rest_id, 'has_menu', subject)`` row is added.
    """
    final_triples = []

    for _, row in triples.iterrows():
        relation = row['relation']

        ''' Fixing Misspelling Menus '''
        subject, menu_subject = _resolve_menu_term(row['subject'].strip(), "subject")
        obj, menu_object = _resolve_menu_term(row['object'].strip(), "object")

        ''' check if subject / object are valid entities '''
        subject, subject_kinds = _classify_entity(subject, menu_subject, rest_id, review, ner_dict)
        obj, object_kinds = _classify_entity(obj, menu_object, rest_id, review, ner_dict)

        # An object may also be a plain opinion word (e.g. "delicious").
        attr_obj = 'aspect' not in object_kinds and obj.lower() in opinion_words

        valid_subject = bool(subject_kinds)
        valid_object = bool(object_kinds) or attr_obj
        if not (valid_subject and valid_object):
            continue

        if menu_subject:
            final_triples.append(('Node', rest_id, 'has_menu', 'Node', subject))
        if 'restaurant' in subject_kinds and menu_object:
            relation = 'has_menu'
        if 'aspect' in subject_kinds and ('aspect' in object_kinds or attr_obj):
            relation = 'is'
        final_triples.append(('Node', subject, relation, 'Node', obj))

    triple_df = pd.DataFrame(final_triples, columns=['Type1', 'Entity1', 'Relationship', 'Type2', 'Entity2'])\
        .drop_duplicates()
    return triple_df

In [17]:
import spacy
from module.ner.menu_ner import MenuNER
from module.lexicon.lexical_analyzer import SpacyLexicalAnalyzer

menu_ner = MenuNER()
spacy_lexical_analyzer = SpacyLexicalAnalyzer()
spacy_lexical_analyzer.load_spacy_models("small")
nlp = spacy.load('en_core_web_sm')

# Smoke test on one hand-written review: collect its general named entities,
# then extract (subject, relation, object) pairs.
review_text = "Nice place busy during lunch time. " \
              "The chicken biryani is super, butter chicken is very authentic."
doc = nlp(review_text)

ner_dict = {}
for x in doc.ents:
    entity_span = x.text
    entity_lower = entity_span.lower()

    # Skip entities that are (near-)spellings of a known restaurant name.
    has_restaurant_entity = any(
        levenshtein_ratio_and_distance(kb_restaurant.lower(), entity_lower, ratio_calc=True) > 0.90
        for kb_restaurant in kb_restaurant_df['Name'])
    if has_restaurant_entity:
        continue

    # Bare numbers carry no entity information for the graph.
    if x.label_ not in ["CARDINAL", "ORDINAL"]:
        ner_dict[x.text] = x.label_

text = review_text
tuple_pairs = spacy_lexical_analyzer.get_lexical_triplets_pairs(text)
tuple_pairs_df = pd.DataFrame(tuple_pairs, columns=['subject', 'relation', 'object'])
print(tuple_pairs_df)

           subject relation     object
0  chicken biryani       is      super
1   butter chicken       is  authentic


In [None]:
import spacy
from module.ner.menu_ner import MenuNER
from module.lexicon.lexical_analyzer import SpacyLexicalAnalyzer
from module.neo4j.graph_db import GraphDB

# graph = GraphDB("bolt://localhost:7687", "neo4j", "erclab")
# graph = GraphDB("bolt://localhost:11012", "neo4j", "erclab")

menu_ner = MenuNER()
spacy_lexical_analyzer = SpacyLexicalAnalyzer()
spacy_lexical_analyzer.load_spacy_models("small")
nlp = spacy.load('en_core_web_sm')

# Making data for GNN attribute based node embeddings
user_menus_for_data = []
restaurant_menu_aspect_for_data = []

# Make list of dictionary for joint relation extraction. Each entry looks like:
# {
#     "sentText": "restaurant_name, reviewer, review",
#     "relationMentions": [
#         {"em1Text": "Bananaman", "em2Text": "Brooke-Taylor", "label": "starring"},
#         {"em1Text": "Bananaman", "em2Text": "Bright", "label": "creator"}
#     ]
# }
joint_relations = []

# Per-review triple frames are collected here and concatenated once after the
# loop; concatenating inside the loop re-copies all earlier rows every time.
pruned_triple_frames = []

# Lower-casing the KB names is loop-invariant, so do it once.
kb_restaurant_names_lower = [name.lower() for name in kb_restaurant_df['Name']]

restaurants = restaurant_cleaned_reviews['restaurants']
rest_count = len(restaurants)
for rest_index, restaurant in enumerate(restaurants, start=1):

    single_restaurant_menus = []
    single_restaurant_aspects = []

    print("Processing Restaurant "+str(rest_index)+"/"+str(rest_count))

    reviews = restaurant['reviews']
    rev_count = len(reviews)
    for rev_index, review in enumerate(reviews, start=1):

        print("\t Processing Review "+str(rev_index)+"/"+str(rev_count))

        # General named entities of this review, minus restaurant-name mentions
        # and bare numbers.
        ner_dict = {}
        doc = nlp(review["text"])
        for x in doc.ents:
            entity_lower = x.text.lower()
            has_restaurant_entity = any(
                levenshtein_ratio_and_distance(kb_name, entity_lower, ratio_calc=True) > 0.90
                for kb_name in kb_restaurant_names_lower)
            if has_restaurant_entity:
                continue
            if x.label_ not in ["CARDINAL", "ORDINAL"]:
                ner_dict[x.text] = x.label_

        text = review["text"]
        tuple_pairs = spacy_lexical_analyzer.get_lexical_triplets_pairs(text)
        tuple_pairs_df = pd.DataFrame(tuple_pairs, columns=['subject', 'relation', 'object'])
        rest = {"rest_id": restaurant["rest_id"], "name": restaurant["name"]}
        tuple_pairs_prune, pro_rev, _menus, _aspects = triplet_processor.process_triple_pruning(
            menu_ner, tuple_pairs_df, rest, review, ner_dict,
            store_in_db=False, graph_db=None)
        pruned_triple_frames.append(tuple_pairs_prune)

        user_menus_for_data.append([pro_rev["user_id"], "|".join(_menus)])
        single_restaurant_menus.extend(_menus)
        single_restaurant_aspects.extend(_aspects)

        joint_relation = {
            "sentText": str(restaurant['name'])+", "+pro_rev['name']+", "+pro_rev['text'],
            "relationMentions": [
                {"em1Text": triple["sub_ent"],
                 "em2Text": triple["obj_ent"],
                 "label": triple["relation"]}
                for _, triple in tuple_pairs_prune.iterrows()
                if triple['sub_ent'] and triple['obj_ent']
            ],
        }
        # Reviews without any usable relation are not kept.
        if joint_relation['relationMentions']:
            joint_relations.append(joint_relation)

    # De-duplicate; sorting makes the "|"-joined strings identical across runs
    # (plain set iteration order for strings varies between interpreter sessions).
    single_restaurant_menus = sorted(set(single_restaurant_menus))
    single_restaurant_aspects = sorted(set(single_restaurant_aspects))
    restaurant_menu_aspect_for_data.append([restaurant["rest_id"],
                                            "|".join(single_restaurant_menus),
                                            "|".join(single_restaurant_aspects)])

# pd.concat raises on an empty list, so fall back to an empty frame.
triples_df = pd.concat(pruned_triple_frames) if pruned_triple_frames else pd.DataFrame()
print(triples_df)

In [None]:
# Persist the pruned triples and the two GNN attribute tables as CSV (row index omitted).
triples_df.to_csv("data/output/kg/input_data.txt-out2.csv", index=False)

user_menu_columns = ['user_id', 'menus']
user_menus_for_data_df = pd.DataFrame(user_menus_for_data, columns=user_menu_columns)
user_menus_for_data_df.to_csv("data/output/kg/user_menu.csv", index=False)

restaurant_menu_aspect_columns = ['rest_id', 'menus', 'aspects']
restaurant_menu_aspect_for_data_df = pd.DataFrame(restaurant_menu_aspect_for_data,
                                                  columns=restaurant_menu_aspect_columns)
restaurant_menu_aspect_for_data_df.to_csv("data/output/kg/restaurant_menu_aspect.csv", index=False)

In [None]:
from pprint import pprint
# Pretty-print every collected joint-relation record for manual inspection.
pprint(joint_relations)

In [None]:
import json
# Save the joint-relation records as one JSON document.
with open("data/output/kg/joint_relation_100.json", 'w') as joint_relation_file:
    joint_relation_file.write(json.dumps(joint_relations))

In [None]:
'''[{'sentText': 'Silver Spoon, Violet, The food is alright  not that great. Chicken briyani is very dry. '
              'Just went there once.',
  'relationMentions':
    [
        {'em1Text': 'Silver Spoon', 'em2Text': 'Food', 'label': 'HAS_ASPECT'},
        {'em1Text': 'Great', 'em2Text': 'Silver Spoon', 'label': 'ASPECT_ATTR_FOR'},
        {'em1Text': 'Food', 'em2Text': 'Great', 'label': 'IS'},
        {'em1Text': 'Silver Spoon', 'em2Text': 'Chicken Biryani', 'label': 'HAS_MENU'},
        {'em1Text': 'Dry', 'em2Text': 'Silver Spoon', 'label': 'MENU_ATTR_FOR'},
        {'em1Text': 'Chicken Biryani', 'em2Text': 'Dry', 'label': 'IS'}
    ]
  },
 {'sentText': 'Silver Spoon, Marie, Marie have  really tried to like Silver Spoon but Silver Spoon disappoints '
              'me more and more each time Marie visit. At first  the portions were generous and the food was '
              'delicious. Today was the last time Marie will  be visiting  though. It was the third time Marie '
              'was given three small bits of butter chicken in a combo when previous visits Marie was given '
              'two heaping scoops. When my husband and Marie pointed it out to the server and asked for an '
              'actual serving the server spooned three more pieces in but then disappeared into the back '
              'with the container. the server reemerged a minute or so later and told my husband and Marie '
              'the server had to check if it was ok. Marie get it if Marie were asking for a huge amount but '
              'Marie were not. Just the same amount given to my husband and Marie when asking for the '
              'vegetable curry. The thing that bugs me the most is that Marie have no clue what actually '
              'happened to my container when the container was taken out of my sight. Wo not  be returning.',
  'relationMentions':
      [
          {'em1Text': 'Silver Spoon', 'em2Text': 'Food', 'label': 'HAS_ASPECT'},
          {'em1Text': 'Delicious', 'em2Text': 'Silver Spoon', 'label': 'ASPECT_ATTR_FOR'},
          {'em1Text': 'Food', 'em2Text': 'Delicious', 'label': 'IS'}
      ]
  },
 {'sentText': "Silver Spoon, Priyam, Silver Spoon is ideal for days you simply want to take out good  "
              "fast  and already made food. Silver Spoon have a wide variety of Indian dishes "
              "consisting of chicken  beef  goat  fish and vegetarian. Priyam especially like "
              "Silver Spoon's Biryani  chicken and rice  special. their Biryani  chicken and rice  "
              "special's delicious food at a good price. The food is laid out like a buffet for you "
              "to see and you simply ask the server for the dishes you want. Personally Priyam like "
              "taking out the meat dishes from here and eating with rice or rotis that Priyam already "
              "have at home. Priyam's favourite dishes are Achari chicken  the goat  the fish pakoras "
              "and the biryani. Silver Spoon is well maintained and clean making Silver Spoon more appealing.",
  'relationMentions':
      [
          {'em1Text': 'Silver Spoon', 'em2Text': 'Chicken', 'label': 'HAS_MENU'},
          {'em1Text': 'Silver Spoon', 'em2Text': 'Ideal', 'label': 'IS'}
      ]
  },
 {'sentText': "Silver Spoon, John, John were initially introduced to the food from Silver Spoon from a "
              "group lunch event at work. John have been regulars ever since. Silver Spoon is a takeout "
              "restaurant with a great selection of hot foods from Silver Spoon's display window. So no "
              "waiting. Everything is fresh and has an authentic home made flavour. The rice is long grain "
              "and light. The chicken pieces are tasty. The sauces are rich and tasty. John have always "
              "found the staff friendly and helpful and willing to explain the various dishes. When you "
              "get into your car with your food the smells are so delicious you cannot be sure you can get "
              "home without trying some. Highly recommended.",
  'relationMentions':
      [
          {'em1Text': 'Silver Spoon', 'em2Text': 'Chicken Pieces', 'label': 'HAS_MENU'},
          {'em1Text': 'Tasty', 'em2Text': 'Silver Spoon', 'label': 'MENU_ATTR_FOR'},
          {'em1Text': 'Chicken Pieces', 'em2Text': 'Tasty', 'label': 'IS'},
          {'em1Text': 'Silver Spoon', 'em2Text': 'Sauce', 'label': 'HAS_MENU'},
          {'em1Text': 'Rich', 'em2Text': 'Silver Spoon', 'label': 'MENU_ATTR_FOR'},
          {'em1Text': 'Sauce', 'em2Text': 'Rich', 'label': 'IS'}
      ]
  },
 {'sentText': 'Silver Spoon, Katya, Katya have  been here many times  and had varied experiences  but overall  Katya am a fan of Silver Spoon. Even though Silver Spoon have expanded to another location in Scarborough  so that there are now two Silver Spoons the Pickering location has managed to maintain consistent quality  although consistent quality can vary on days. Well  what restaurant does not  have that problem You might be suddenly short - staffed  someone down with the flu   or perhaps one of your deliveries did not  make it on time It is good food. Katya prefer the Pakistani cuisine :  Nihari Haleem not always available  and several of the biryanis. Silver Spoon are now offering Sindhi Biryani on weekends  and Katya m so pleased ! The only way that Katya could get Sindhi Biryani in the past was to make Sindhi Biryani Katya using a packaged spice blend. If you want to get Naan  Katya recommend the Butter Naan. Although the Butter Naan costs a bit more the butter enhancements really take Naan to the next level of browning and savoury tastes. Ok  things sit under the heat lamp but things do not seem to suffer that much  particularly something like Nihari  which is a stew - type dish that can withstand the warming table. Everything has always tasted fresh when Katya have ordered it. Katya find that the samosas are fairly standard  but there are other appetizers which are more exciting. For people who like the dark meat of the chicken Lollypop Chicken is great and consistently good. Katya also enjoy Shami Kabab. If you like fish  do not  skip the Fingerfish  which is consistently tasty. Whenever Katya have been there  which is perhaps once a month or two  for the past five years the portions have been generous. 
Katya am certainly a fan  and Silver Spoon fills an important niche in the local  Pickering community as a convenient place to pick up some Halal take - away of decent quality.', 'relationMentions': [{'em1Text': 'Silver Spoon', 'em2Text': 'Sindhi Biryani', 'label': 'HAS_MENU'}, {'em1Text': 'Silver Spoon', 'em2Text': 'Shami Kebab', 'label': 'HAS_MENU'}, {'em1Text': 'Katya', 'em2Text': 'Shami Kebab', 'label': 'ORDER'}, {'em1Text': 'Silver Spoon', 'em2Text': 'Lollypop Chicken', 'label': 'HAS_MENU'}, {'em1Text': 'Great', 'em2Text': 'Silver Spoon', 'label': 'MENU_ATTR_FOR'}, {'em1Text': 'Lollypop Chicken', 'em2Text': 'Great', 'label': 'IS'}]}, {'sentText': "Silver Spoon, Heli, Oh  Silver Spoon. Heli have  tried to love Silver Spoon. Heli have  given Silver Spoon so many chances. But this just is not  working out anymore. To start :  the food is not  exactly terrible. the food just...not very good. Heli have  tried almost every meat dish  the chana masala  samosas  biriyanis  etc. Indian take - out is hardly known for Indian take - out's healthiness but when you order a veal karahi and get two sad chunks of meat floating in half a container in oil it is  a little disheartening. Add to that the fact that food is often sitting under a heat lamp for God knows how long... On the upside :  the chicken biriyani is good bang for your buck  -  especially during the week when the chicken biriyani is  on special. You get a fairly flavourful container packed with rice and meat. a fairly flavourful container packed with rice and meat  ready to go so you rarely have to wait more than a few minutes for your order. There was that one time they ran out of chicken biriyani  which is understandable when the chicken biriyani is  on special  but then offered me a  regular biriyani read : JUST rice  for the same price. Ummm  say what   Heli do love their beef samosas -  it is  just to bad that they NEVER HAVE ANY. 
Seriously  Heli have been here early in the evening  at random times during the week  and they have loads of sad looking veggie samosas  but no beef. Heli feel like the owners need to do a serious evaluation of what works and does not  work. If you notice you are  always selling out of a certain item  and NOT another   does not  it make sense to up a certain item while downsizing another Who knows. My biggest gripe was the chana masala incident. The last time Heli went in  Heli ordered a chana masala  which Heli m apathetic about  but my husband likes . The girl at the counter kind of made a face and said Oh... no. Heli do not  want a chana masala  which Heli m apathetic about but my husband likes. Heli was confused. There was a whole tray of a chana masala  which Heli m apathetic about but my husband likes there  brimming with chick - pea goodness. a chana masala  which Heli m apathetic about but my husband likes not... good today. What What does that mean  Is the taste just  off  or had a chana masala  which Heli m apathetic about but my husband likes spoiled  In either event why is there a full tray of  not good  chana masala here for customers to look at  and possible walk out the door with   Heli stared at a full tray of  not good  chana masala  and The girl at the counter motioned at a full tray of  not good  chana masala again Heli was just going to remove a full tray of  not good  chana masala from here anyway. But  um  Heli did not . And Heli was the only person in the store. From the wide glass window  before Heli even stepped in  Heli could see that there was no one else even there. She had not  even come over until Heli would  been standing there for a few minutes. It really bothers me that Heli still really do not  know what was so wrong with an entire tray of food that an entire tray of food would warrant being thrown out  -  and why an entire tray of food HADN T been thrown out immediately if an entire tray of food really could not  be saved. 
It was clear from the crispiness on the top you know that kind of layer of  kin  that develops   that an entire tray of food had been sitting for a while. Ultimately  Heli m really quite relieved the staff person told me before Heli bought a full tray of  not good  chana masala took a full tray of  not good  chana masala home and got diarrhea or something this would be a much harsher review if that had happened... And Heli guess that says something positive about the customer service  sorta... As more and more Indian food places -  both takeout and dine - in  -  emerge in the region  store owners need to realize that customers are going to eventually become more discerning. Heli really wanted this place to be a favorite  -  after all  this place is  very convenient for me  -  but  sorry Silver Spoon Silver Spoon just do not  make the cut.", 'relationMentions': [{'em1Text': 'Silver Spoon', 'em2Text': 'Chana Masala', 'label': 'HAS_MENU'}, {'em1Text': 'Heli', 'em2Text': 'Chana Masala', 'label': 'ORDER'}, {'em1Text': 'Silver Spoon', 'em2Text': 'Beef', 'label': 'HAS_MENU'}, {'em1Text': 'Heli', 'em2Text': 'Beef', 'label': 'ORDER'}, {'em1Text': 'Silver Spoon', 'em2Text': 'Food', 'label': 'HAS_ASPECT'}, {'em1Text': 'Terrible', 'em2Text': 'Silver Spoon', 'label': 'ASPECT_ATTR_FOR'}, {'em1Text': 'Food', 'em2Text': 'Terrible', 'label': 'IS'}]}, {'sentText': "Silver Spoon, Adnan, Silver Spoon do not have lots of crowd and it taste is different than other locations. Adnan would not say the food is completely bad but certainly not the best. Silver Spoon's biryani  the holy grail of southeast Asian food  is just so spicy and sometimes taste raw  like something is missing. 
Adnan would suggest Their biryani  the holy grail of southeast Asian food to masses  try before you buy.", 'relationMentions': [{'em1Text': 'Silver Spoon', 'em2Text': 'Biryani', 'label': 'HAS_MENU'}, {'em1Text': 'Adnan', 'em2Text': 'Biryani', 'label': 'ORDER'}, {'em1Text': 'Silver Spoon', 'em2Text': 'Food', 'label': 'HAS_ASPECT'}, {'em1Text': 'Bad', 'em2Text': 'Silver Spoon', 'label': 'ASPECT_ATTR_FOR'}, {'em1Text': 'Food', 'em2Text': 'Bad', 'label': 'IS'}]}, {'sentText': "Silver Spoon, Atif, Atif was first to review Silver Spoon. Atif gave Silver Spoon   stars and below review remains. Atif do feel it has fallen off big time. Atif have given you so many chances your staff is very nice. Atif's last straw was the cockroach on the fridge. Atif called the guy out from the back to show the cockroach on the fridge. the cockroach on the fridge was about mid adult size  with them wirey attennas. the cockroach on the fridge was crawling around  and Atif pictured the cockroach on the fridge being baked into Atif's shami kabab Atif just ordered. The food is on warmers. The food is either doused in oil to not get dry... Or The food is Dry. Atif have a sense of humour  and Atif literally have to never go back there ever again after posting this. Otherwise Atif will get curry cockroach to go ! Fail !  !  !  !  !  !  !  !  !  !  !  !  !  !", 'relationMentions': [{'em1Text': 'Silver Spoon', 'em2Text': 'Staff', 'label': 'HAS_ASPECT'}, {'em1Text': 'Nice', 'em2Text': 'Silver Spoon', 'label': 'ASPECT_ATTR_FOR'}, {'em1Text': 'Staff', 'em2Text': 'Nice', 'label': 'IS'}]}]

'''
from module.ner.menu_ner import MenuNER
menu_ner = MenuNER()

# Read the saved joint-relation records back, parsing straight from the file handle.
with open('data/output/kg/joint_relation_100.json', 'r') as json_file:
    joint_relations = json.load(json_file)

In [None]:
'''
Change title line code from MenuNER module
'''
from pprint import pprint

# Menu knowledge base: column 0 -> column 1, header row skipped.
# newline='' is what the csv module asks for so quoted fields containing line
# breaks are parsed correctly; rows with fewer than two columns (e.g. blank
# lines) are ignored instead of raising IndexError.
with open(kb_menu_file, mode='r', newline='') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # skip the header row (no-op on an empty file)
    kb_menu_dict = {row[0]: row[1] for row in reader if len(row) >= 2}

print(str(len(kb_menu_dict)))

def split_rest_user_from_text(text, sep, pos):
    """Split ``text`` on ``sep`` and return its first field plus the tail.

    Returns:
        ``(first_field, tail)`` where ``tail`` is the fields from index ``pos``
        onward re-joined with ``sep`` and stripped of surrounding whitespace.
    """
    fields = text.split(sep)
    first_field = fields[0]
    tail = sep.join(fields[pos:])
    return first_field, tail.strip()

# Title-case the menu mentions of every joint-relation record (in place) and
# make sure each recognised menu has a HAS_MENU relation from the restaurant.
# NOTE(review): the following cell re-reads joint_relations from disk with no
# save in between, so on a top-to-bottom run these in-place edits appear to be
# discarded - confirm the intended cell order.
index = 0
total_relations = len(joint_relations)

for j_relation in joint_relations:
    index += 1
    print("processing review: "+str(index)+"/"+str(total_relations))

    # rest_name: text before the first comma; split_review: everything from the
    # third comma-separated field onward (i.e. without restaurant and reviewer).
    rest_name, split_review = split_rest_user_from_text(j_relation['sentText'], ',', 2)

    # Only the keys of entity_dict (the detected menu strings) are used below.
    entity_dict = menu_ner.extract_menu_ner_single(split_review)

    # Pass 1: title-case every detected menu string inside the sentence.
    for k, v in entity_dict.items():
        j_relation['sentText'] = j_relation['sentText'].replace(k, k.title())

    # Pass 2: validate each detected menu against the KB and fix up the relations.
    for k, v in entity_dict.items():

        valid_menu = False
        # Exact (case-insensitive) KB hit: continue with the KB spelling, title-cased.
        for key, value in kb_menu_dict.items():
            if key.lower() == k.lower():
                valid_menu = True
                k = key.title()
                break

        if not valid_menu:
            for key, value in kb_menu_dict.items():
                # Check if misspelled menu word
                subject_ratio = levenshtein_ratio_and_distance(key.lower(), k.lower(), ratio_calc=True)
                if subject_ratio > 0.85:
                    # NOTE(review): pass 1 already replaced `k` with k.title() in sentText, so
                    # this replace looks like a no-op unless `k` was title-cased to begin with;
                    # `k` also keeps the misspelling for the relation handling below - confirm intent.
                    j_relation['sentText'] = j_relation['sentText'].replace(k, key.title())
                    valid_menu = True
                    break

        if valid_menu:
            menu_with_rel_found = False
            # Title-case every relation whose first entity is this menu.
            for rel in j_relation['relationMentions']:
                if rel['em1Text'].lower() == k.lower():
                    rel['em1Text'] = rel['em1Text'].title()

            # Title-case the first HAS_MENU relation that targets this menu, if any.
            for rel in j_relation['relationMentions']:
                if rel['em2Text'].lower() == k.lower() and rel['label'] == 'HAS_MENU':
                    rel['em2Text'] = rel['em2Text'].title()
                    menu_with_rel_found = True
                    break

            # No such relation yet: add restaurant -HAS_MENU-> menu.
            if not menu_with_rel_found:
                j_relation['relationMentions'].append({'em1Text': rest_name,
                                                       'em2Text': k.title(),
                                                       'label': 'HAS_MENU'})




    # pprint(j_relation)
    # break

In [None]:
# Read the saved joint-relation records back, parsing straight from the file handle.
with open('data/output/kg/joint_relation_100.json', 'r') as json_file:
    joint_relations = json.load(json_file)

In [None]:
import re
def find_word(text, search):
    """Return True if `search` occurs in `text` as a whole word, ignoring case.

    Args:
        text: The string to look in.
        search: The literal word or phrase to look for. It is matched
            literally: regex metacharacters such as ``(``, ``+`` or ``.``
            are escaped, so they neither raise ``re.error`` nor act as
            wildcards.

    Returns:
        bool: True when at least one whole-word occurrence exists.
    """
    # \b anchors keep "thai" from matching inside "thailand".
    pattern = r'\b' + re.escape(search) + r'\b'
    return re.search(pattern, text, flags=re.IGNORECASE) is not None

# Pass 1: lower-case the sentence and every field of each relation mention.
for record in joint_relations:
    record['sentText'] = record['sentText'].lower()
    for mention in record['relationMentions']:
        for field in ('em1Text', 'em2Text', 'label'):
            mention[field] = mention[field].lower()

# Pass 2: drop mentions whose entities do not both appear in the sentence as
# whole words. The list is rewritten in place so existing references to it
# see the pruned result.
for record in joint_relations:
    sentence = record['sentText']
    kept_mentions = []
    for mention in record['relationMentions']:
        has_em1 = find_word(sentence, mention['em1Text'])
        has_em2 = find_word(sentence, mention['em2Text'])
        if has_em1 and has_em2:
            kept_mentions.append(mention)
    record['relationMentions'][:] = kept_mentions

In [None]:
import json
# Serialise the current joint_relations list as a single JSON document,
# replacing whatever the file held before.
with open("data/output/kg/joint_relation_100.json", 'w') as out_file:
    json.dump(joint_relations, fp=out_file)

In [None]:
import random

# Shuffle in place with a fixed seed so that the splits written from this list
# are identical on every Restart & Run All. A private Random instance is used
# so the global random state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(joint_relations)

In [None]:
# Test split: the first 400 records, one JSON object per line.
# json.dumps replaces "%s" formatting, which wrote the Python dict repr
# (single-quoted keys) that JSON parsers reject.
with open("data/output/kg/new_test.json", 'w') as f:
    for item in joint_relations[:400]:
        f.write(json.dumps(item) + "\n")


In [None]:
# Validation split: records 401-800 (1-based), one JSON object per line.
# json.dumps replaces "%s" formatting, which wrote the Python dict repr
# (single-quoted keys) that JSON parsers reject.
with open("data/output/kg/new_valid.json", 'w') as f:
    for item in joint_relations[400:800]:
        f.write(json.dumps(item) + "\n")

In [None]:
# Training split: every record from the 801st (1-based) onward, one JSON
# object per line.
# json.dumps replaces "%s" formatting, which wrote the Python dict repr
# (single-quoted keys) that JSON parsers reject.
with open("data/output/kg/new_train.json", 'w') as f:
    for item in joint_relations[800:]:
        f.write(json.dumps(item) + "\n")

In [None]:
'''
If menus are null put empty string ('') and remove duplicate menus from the list
'''
user_menu_df = pd.read_csv("data/output/kg/user_menu.csv")
# Row count before merging.
print(str(len(user_menu_df)))

# Missing menus become '' so they can be split like any other value.
user_menu_df['menus'] = user_menu_df['menus'].apply(lambda x: x if not pd.isnull(x) else '')

user_merge_menus = []
for user_id, group in user_menu_df.groupby("user_id"):
    # Collect menus from EVERY row of this user, not only the first one.
    # A dict is used as an ordered set: duplicates are dropped while
    # first-seen order is kept, so the output is the same on every run.
    merged_menus = {}
    for menus in group["menus"]:
        for menu in str(menus).split("|"):
            if menu != '':
                merged_menus[menu] = None

    user_merge_menus.append([user_id, '|'.join(merged_menus)])

# Number of distinct users after merging.
print(str(len(user_merge_menus)))

user_menu_df = pd.DataFrame(user_merge_menus, columns=['user_id', 'menus'])
user_menu_df.to_csv("data/output/kg/user_menu.csv", index=False)

In [None]:
'''
Merge Menus from user_menu.csv to user.csv
'''
# Left join: every user row is kept; users with no menu entry get NaN.
user_ori_df = pd.read_csv(users_csv)
user_menu_temp_df = pd.read_csv("data/output/kg/user_menu.csv")
user_merged_df = user_ori_df.merge(
    user_menu_temp_df,
    how='left',
    on=['user_id'],
)
# The joined table is written back over the users file.
user_merged_df.to_csv(users_csv, index=False)

In [None]:
'''
Merge Menus & Aspects from restaurant_menu_aspect.csv to restaurant.csv
'''

# Left join: every restaurant row is kept; those without a matching
# rest_id in the menu/aspect file get NaN in the added columns.
rest_ori_df = pd.read_csv(restaurants_csv)
rest_menu_aspect_temp_df = pd.read_csv("data/output/kg/restaurant_menu_aspect.csv")
rest_merged_df = rest_ori_df.merge(
    rest_menu_aspect_temp_df,
    how='left',
    on=['rest_id'],
)
# The joined table is written back over the restaurants file.
rest_merged_df.to_csv(restaurants_csv, index=False)

In [None]:
import datetime

rating_df = pd.read_csv(ratings_csv)
restaurant_df = pd.read_csv(restaurants_csv)
user_df = pd.read_csv(users_csv)

# Convert 'YYYY-MM-DD HH:MM:SS' strings to integer epoch seconds.
# datetime.timestamp() replaces strftime("%s"), which is a platform-specific
# extension that is not available everywhere (e.g. Windows). Both treat the
# naive datetime as local time.
rating_df['date'] = rating_df['date'].apply(
    lambda x: int(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').timestamp())
)

# Lookup tables from the original key to the value of the 'id' column.
# The columns are reversed before zipping so that, when a key appears more
# than once, the FIRST row wins, as it did with row-by-row replacement.
user_id_map = dict(zip(user_df['user_id'][::-1], user_df['id'][::-1]))
rest_id_map = dict(zip(restaurant_df['rest_id'][::-1], restaurant_df['id'][::-1]))

# Single pass per column instead of one Series.replace call per user or
# restaurant. Keys with no entry in the lookup are left unchanged.
rating_df['user_id'] = [user_id_map.get(key, key) for key in rating_df['user_id']]
rating_df['rest_id'] = [rest_id_map.get(key, key) for key in rating_df['rest_id']]

rating_df.to_csv(ratings_csv, index=False)