# Perform approximate search against reference features

## Initialize Recommendation project

In [1]:
import os, sys
sys.path.insert(0, '/Users/shashank/Workspace/Orgs/Ether/ai-engine/services/recommendation/')

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import json as js
from scipy.spatial.distance import cosine
import pickle
from fuzzywuzzy import process, fuzz
from collections import Counter, OrderedDict
from nltk import pos_tag, word_tokenize
import pickle

In [4]:
from vectorize import Vectorizer
from watchers import RecWatchers
from explain import Explainability
from utils import Utils

In [5]:
def to_json(data, filename):
    """Write ``data`` as indented, UTF-8 encoded JSON to ``<filename>.json``.

    Args:
        data: Any JSON-serializable object.
        filename: Output path without the extension; ``".json"`` is appended.
    """
    target_path = filename + ".json"
    with open(target_path, mode="w", encoding="utf-8") as out_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        js.dump(data, out_file, indent=4, ensure_ascii=False)

def read_json(json_file):
    """Load and return the parsed contents of a JSON file.

    The file is decoded explicitly as UTF-8 so that files written by
    ``to_json`` (UTF-8, ``ensure_ascii=False``) round-trip on platforms whose
    default encoding is not UTF-8.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (typically a dict or list).
    """
    with open(json_file, encoding="utf-8") as f_:
        return js.load(f_)

In [6]:
# Noun part-of-speech tags (NN, NNS, NNP). Presumably meant to filter the
# output of nltk's pos_tag; not referenced elsewhere in the visible notebook.
pos_list = ["NN", "NNS", "NNP"]

## Initiate Sentence Encoder Lambda function

In [7]:
from boto3 import client, session
from botocore.client import Config

In [8]:
# Shared botocore client configuration: long connect/read timeouts, a small
# retry budget, and a pinned region.
aws_config = Config(
    region_name="us-east-1",
    connect_timeout=180,
    read_timeout=300,
    retries={"max_attempts": 2},
)

In [9]:
# boto3 Lambda client built with the timeout/retry settings in aws_config.
lambda_client = client("lambda", config=aws_config)

In [10]:
# Project Vectorizer wired to the Lambda client and the function name
# "sentence-encoder-lambda" (presumably the deployed encoder; confirm in vectorize.py).
vec = Vectorizer(lambda_client, "sentence-encoder-lambda")

In [11]:
# AWS credentials are taken from the environment rather than the notebook.
s3_sess = session.Session(
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)
conn = s3_sess.client("s3")

In [12]:
# S3 bucket name; not referenced again in the visible notebook.
s3_bucket = "io.etherlabs.staging2.contexts"

## Load reference features

In [14]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load feature pickles that come from a trusted pipeline.
with open("features/reference_user_text_vector.pickle", "rb") as f_:
    reference_user_vector_data = pickle.load(f_)

with open("features/reference_user_kw_vector.pickle", "rb") as f_:
    reference_user_kw_vector_data = pickle.load(f_)

ref_user_dict = read_json("data/reference_prod_user.json")
# Keep only the "keywords" entry of every reference user, keyed by user id.
ref_user_info_dict = {
    user_id: user_info["keywords"] for user_id, user_info in ref_user_dict.items()
}

In [15]:
# Scratch calculation: square root of 9000 (presumably sizing the hash buckets
# for roughly 9000 reference features — TODO confirm).
9000**(1/2)

94.86832980505137

In [16]:
# Scratch calculation: log2 of 9000 (presumably a guide for hash_size — TODO confirm).
np.log2(9000)

13.1357092861044

In [292]:
# Hashing parameters forwarded to RecWatchers when the watcher is constructed.
# NOTE(review): how these values were tuned is not shown here; confirm before reuse.
num_buckets=200
hash_size=16


In [335]:
# Slack incoming-webhook URLs are credentials: anyone holding one can post to
# the channel. They are read from the environment instead of being committed
# with the notebook.
# NOTE(review): the URLs that were previously hard-coded here must be treated
# as leaked and rotated in Slack.
staging_url = os.environ["SLACK_STAGING_WEBHOOK_URL"]  # required; KeyError if unset
prod_url = os.getenv("SLACK_PROD_WEBHOOK_URL")  # optional; unused in the visible notebook

# Watcher recommender built from the keyword vectors of the reference users;
# it posts to the staging webhook.
watcher_obj = RecWatchers(
    reference_user_dict=ref_user_dict,
    user_vector_data=reference_user_kw_vector_data,
    vectorizer=vec,
    s3_client=conn,
    web_hook_url=staging_url,
    num_buckets=num_buckets,
    hash_size=hash_size,
)

In [336]:
# Per-user feature counts before featurize_reference_users() has been called.
watcher_obj.us.num_features_in_input

{'0bbbfe84-c661-45af-8d0f-fcd5258bba38': 0,
 '1a215425-8449-4fca-ba95-7d768b595b80': 0,
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6': 0,
 'c66797a9-2e6d-46ad-9573-926e57f7dac3': 0,
 '2c944512-17a0-4912-9a16-6a3408da807c': 0,
 '7e7ccbba-232d-411a-a95a-d3f244a35f40': 0,
 '75bdf310-110b-4b8f-ab88-b16fafce920e': 0,
 'b4a57b25-de68-446c-ac99-0f856d3fe4d5': 0,
 'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9': 0,
 '65bb8395-2fb5-4409-a4bb-59bb707f1375': 0,
 'fb52cb66-3aec-4795-aee3-8ccfd904d315': 0,
 '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368': 0,
 'ecfeeb75-7f0a-4d47-af1e-bd513929264a': 0,
 '8d6db5f7-d9b7-4c54-ba38-fe710ffcaf3f': 0,
 '81a3e154-6937-4fce-ba1c-f972faa209b2': 0}

In [337]:
# Featurize the reference users, then display the per-user feature counts again.
watcher_obj.featurize_reference_users()
watcher_obj.us.num_features_in_input

{'0bbbfe84-c661-45af-8d0f-fcd5258bba38': 355,
 '1a215425-8449-4fca-ba95-7d768b595b80': 608,
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6': 336,
 'c66797a9-2e6d-46ad-9573-926e57f7dac3': 250,
 '2c944512-17a0-4912-9a16-6a3408da807c': 0,
 '7e7ccbba-232d-411a-a95a-d3f244a35f40': 1016,
 '75bdf310-110b-4b8f-ab88-b16fafce920e': 247,
 'b4a57b25-de68-446c-ac99-0f856d3fe4d5': 361,
 'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9': 2164,
 '65bb8395-2fb5-4409-a4bb-59bb707f1375': 238,
 'fb52cb66-3aec-4795-aee3-8ccfd904d315': 777,
 '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368': 1194,
 'ecfeeb75-7f0a-4d47-af1e-bd513929264a': 251,
 '8d6db5f7-d9b7-4c54-ba38-fe710ffcaf3f': 313,
 '81a3e154-6937-4fce-ba1c-f972faa209b2': 490}

In [338]:
# Total number of features summed over all reference users.
np.sum(list(watcher_obj.us.num_features_in_input.values()))

8600

In [339]:
segment_obj = [{
		"id": "0f1aa05f2e944142a18974369bb9b789",
		"originalText": "We go into that issue because we read the from the Json field column tables. We we have their columns available, but we are not json and value from that column which is creating a problem but then you know, bring a that solution we should start needed it from that columns and only few places we've done ready need to do that the is the case. I think this not of code is the if you want to use the column portion of the values json log then there will be George changes setting. U a D then we start fields and all not so in case of meeting meeting and recording and markers. We have both Json fields and the call proper call call let let can that like that what I telling is will not do anything with that in the Json and value of d there right. We will replace the swing field with the Hyphen iPhone that's. The we are reading the ID from the Json. I'm not from the column. So in the days very while waiting from a database we need to change oriented B a force layer. I I will from the value where it is like it a relationship right but then the column or as become the that valuation can value. So if we start reading from actual columns we should not ask that's what I thinking less less. Let's see. And also just ping up existing links which we have doing the recording link right. That we it work automatically because those will not having open do id. Yeah, maybe if we can translate like internally and finish through that again. From layer where we to transfer it okay let me do five okay. I'll make the new are you working today. Yeah. Actually yesterday was what little bit from this cloud distribution task like basically what you what we to do is that me or we want to point it in and if the API part has a slack in okay part will do it routed to API p gateway but if it doesn't have any other if you are anything with before we have go to load we will go back to the same whatever we have the proxy right? 
So so the distribution and all to so that that part from started working yesterday I talk to trishant. So there are no blockage from there are point of view so we can like today be able to read that reference script for it. So after this I want to work on a P a service solution connect and is moving back to external load. The API will directed of external. So that with the proper point id.",
		"confidence": 0.86578494,
		"startTime": "2019-01-29T06:03:02Z",
		"endTime": "2019-01-29T06:03:14Z",
		"duration": 12,
		"recordingId": "9789538caa6d4857a7d6bf47584aafd8",
		"spokenBy": "0bbbfe84-c661-45af-8d0f-fcd5258bba38",
		"languageCode": "en-US",
		"transcriber": "google_speech_api",
		"status": "completed",
		"transcriptId": "e62bba58-b959-4aab-af7e-a75737fd736c",
		"createdAt": "2019-01-29T06:03:18.049750006Z",
		"updatedAt": "2019-01-29T06:03:39.783385299Z"
	}]

In [340]:
import uuid
# Canonicalise each speaker id through uuid.UUID, then look up the display
# name of that user in the reference user table (None when no name is stored).
segment_user_id = [str(uuid.UUID(segment["spokenBy"])) for segment in segment_obj]
segment_user_names = [
    ref_user_dict[speaker_id].get("name") for speaker_id in segment_user_id
]

In [341]:
# Only the last expression of a cell is echoed, so just the names are displayed.
segment_user_id
segment_user_names

['Parshwa Nemi Jain']

## Provide query keywords or text

### Test with Keywords

In [365]:
slack_kw_input = "response to John Searle, symbols with certain inputs, symbolic inputs, question of whether computers, type of computer, Artificial intelligence, famous philosophical arguments, real time John Searles Chinese room, fields of the philosophy, computer would be intelligent"

In [366]:
query_text = "MapReduce is just a computing framework. HBase has nothing to do with it. That said, you can efficiently put or fetch data to/from HBase by writing MapReduce jobs. Alternatively you can write sequential programs using other HBase APIs, such as Java, to put or fetch the data. But we use Hadoop, HBase etc to deal with gigantic amounts of data, so that doesn't make much sense. Using normal sequential programs would be highly inefficient when your data is too huge. Coming back to the first part of your question, Hadoop is basically 2 things: a Distributed FileSystem (HDFS) + a Computation or Processing framework (MapReduce). Like all other FS, HDFS also provides us storage, but in a fault tolerant manner with high throughput and lower risk of data loss (because of the replication). But, being a FS, HDFS lacks random read and write access. This is where HBase comes into picture. It's a distributed, scalable, big data store, modelled after Google's BigTable. It stores data as key/value pairs. Hadoop is basically 3 things, a FS (Hadoop Distributed File System), a computation framework (MapReduce) and a management bridge (Yet Another Resource Negotiator). HDFS allows you store huge amounts of data in a distributed (provides faster read/write access) and redundant (provides better availability) manner. And MapReduce allows you to process this huge data in a distributed and parallel manner. But MapReduce is not limited to just HDFS. Being a FS, HDFS lacks the random read/write capability. It is good for sequential data access. And this is where HBase comes into picture. It is a NoSQL database that runs on top your Hadoop cluster and provides you random real-time read/write access to your data"

In [367]:
# Hand-written sample keyphrases.
# NOTE: this value is overwritten by the next cell, which rebuilds
# query_keywords from slack_kw_input.
query_keywords = [
    "kind of update", 
    "summary segments", 
    "Brute Force fabric", 
    "share and dislike button", 
    "open the me tap", "invite Zoom", 
    "resume during playback", 
    "email and the", "select slack Channel", "share like the screen design"
]

In [368]:
# Split the comma-separated keyphrase string pasted from Slack into a list.
query_keywords = slack_kw_input.split(", ")

In [369]:
# Display the keyphrases that will be used as the query.
query_keywords

['response to John Searle',
 'symbols with certain inputs',
 'symbolic inputs',
 'question of whether computers',
 'type of computer',
 'Artificial intelligence',
 'famous philosophical arguments',
 'real time John Searles Chinese room',
 'fields of the philosophy',
 'computer would be intelligent']

In [370]:
# watcher_obj.featurize_reference_users()
# Run the watcher's hash query for the keyphrases. The result is consumed
# later as a mapping of user id -> score.
hash_result = watcher_obj.perform_hash_query(input_list=query_keywords)

num results 5019


In [371]:
# The same keyphrase list is passed as both the general query and the keyword query.
top_user_dict, top_words, suggested_users = watcher_obj.get_recommended_watchers(
    input_query_list=query_keywords,
    input_kw_query=query_keywords,
    hash_result=hash_result,
    segment_obj=segment_obj,
    n_kw=10,
)

In [372]:
# Re-key the hash scores by the user's display name instead of the user id.
named_hash_result = {
    ref_user_dict[user_id]["name"]: user_score
    for user_id, user_score in hash_result.items()
}

In [373]:
# Display the name -> hash score mapping ordered by score.
watcher_obj.utils.sort_dict_by_value(named_hash_result)

OrderedDict([('Shashank', 0.656496062992126),
             ('Karthik Muralidharan', 0.6139028475711893),
             ('Trishanth Diwate', 0.611336032388664),
             ('Arjun Kini', 0.6040816326530613),
             ('Krishna Sai', 0.6038338658146964),
             ('mithun', 0.5982142857142857),
             ('Venkata Dikshit', 0.5928835489833642),
             ('Deep Moradia', 0.5761772853185596),
             ('Reagan Rewop', 0.5585585585585585),
             ('Nisha Yadav', 0.552),
             ('Vani', 0.5252100840336135),
             ('Vamshi Krishna', 0.5098684210526315),
             ('Parshwa Nemi Jain', 0.4732394366197183),
             ('Shubham', 0.46215139442231074)])

In [377]:
# First value returned by get_recommended_watchers; displayed as name -> score.
top_user_dict

{'Shashank': 0.9502228120635137,
 'Karthik Muralidharan': 0.7296724191256977,
 'Arjun Kini': 0.6390368491362369,
 'Trishanth Diwate': 0.6215599091513562}

In [378]:
# Second value returned by get_recommended_watchers (n_kw=10 was requested).
top_words

['small ontology',
 'algorithmic side',
 'Binary',
 'language model',
 'create life functions',
 'data simulation',
 'fundamental concepts',
 'object objective',
 'building Lambda functions',
 'examine subject implementation']

In [379]:
# Third value returned by get_recommended_watchers.
suggested_users

['Shashank', 'Karthik Muralidharan']

## Post to Slack for testing

In [312]:
import requests
import jsonlines
import logging
import json as js
import os
from pathlib import Path
from collections import OrderedDict
import requests
import numpy as np
import uuid
import hashlib

In [316]:
def post_to_slack(
    instance_id,
    segment_keyphrase_list,
    user_list,
    user_scores,
    suggested_user_list,
    segment_user_names,
    word_list,
    *,
    webhook_url=None,
    timeout=10,
):
    """Post a recommendation summary for one segment to a Slack webhook.

    Args:
        instance_id: Meeting/instance identifier shown in the message header.
        segment_keyphrase_list: Keyphrases summarising the segment.
        user_list: Names of the recommended (related) users. When empty, a
            "no recommendations" message is posted instead.
        user_scores: Confidence scores for the recommended users.
        suggested_user_list: Names of the users finally suggested.
        segment_user_names: Names of the users who spoke in the segment.
        word_list: Related words returned with the recommendation.
        webhook_url: Slack incoming-webhook URL. Defaults to the module-level
            ``staging_url`` when omitted.
        timeout: Seconds to wait for Slack before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (including a timeout).
    """
    service_name = "recommendation-service"
    msg_text = "*Recommended users for meeting: {}* \n *Segment summary*: ```{}```\n".format(
        instance_id, _reformat_list_to_text(segment_keyphrase_list)
    )

    if not user_list:
        msg_format = "[{}]: {} *NA*: `No recommendations available for this segment...`".format(
            service_name, msg_text
        )
    else:
        msg_format = "[{}]: {} *Segment Users*: ```{}```\n*Related Users*: ```{}```\n *User Confidence Scores*: ```{}```\n *Suggested Users*: ```{}```\n *Related Words*: ```{}```".format(
            service_name,
            msg_text,
            _reformat_list_to_text(segment_user_names),
            _reformat_list_to_text(user_list),
            _reformat_list_to_text(user_scores),
            _reformat_list_to_text(suggested_user_list),
            _reformat_list_to_text(word_list),
        )

    target_url = staging_url if webhook_url is None else webhook_url
    slack_payload = {"text": msg_format}
    requests.post(
        url=target_url,
        data=js.dumps(slack_payload).encode(),
        timeout=timeout,
    )

def _reformat_list_to_text(input_list):
    try:
        if type(input_list[0]) != str:
            formatted_text = ", ".join(
                ["{:.2f}".format(i) for i in input_list]
            )
        else:
            formatted_text = ", ".join([str(w) for w in input_list])
    except Exception as e:
        formatted_text = input_list
        logger.warning(e)

    return formatted_text

In [317]:
def make_validation_data(
    input_query,
    user_list,
    user_scores,
    suggested_user_list,
    word_list,
    segment_users,
    instance_id=None,
    context_id=None,
    segment_obj=None,
    upload=False
):
    """Build one validation record for a recommendation and persist it.

    The record is always written locally through ``write_to_jsonl`` and, when
    ``upload`` is true, additionally handed to ``upload_validation_data``.

    Args:
        input_query: Query keyphrases; stored as both ``text`` and
            ``meta.keyphrases``.
        user_list: Recommended user names; stored as ``labels``.
        user_scores: Scores of the recommended users.
        suggested_user_list: Users finally suggested.
        word_list: Related words returned with the recommendation.
        segment_users: Users who spoke in the segment; stored as
            ``meta.positiveLabels``.
        instance_id: Optional meeting/instance id; ``""`` when omitted.
        context_id: Optional context id; ``""`` when omitted.
        segment_obj: Accepted for interface compatibility; currently unused.
        upload: Whether to also upload the record.
    """
    # Normalise each optional id independently. ``segment_id`` used to be
    # bound only when ``instance_id`` was None, so passing an instance id
    # raised NameError while the record was being built.
    instance_id = "" if instance_id is None else instance_id
    context_id = "" if context_id is None else context_id
    # NOTE(review): no segment id is supplied by callers and ``segment_obj`` is
    # not consulted; confirm whether the id should be taken from it.
    segment_id = ""

    # The record does not depend on the individual query items, so it is
    # built once instead of being rebuilt identically for every item.
    validation_dict = {
        "text": input_query,
        "labels": user_list,
        "meta": {
            "instanceId": instance_id,
            "segmentId": segment_id,
            "suggestedUsers": suggested_user_list,
            "userScore": user_scores,
            "keyphrases": input_query,
            "relatedWords": word_list,
            "positiveLabels": segment_users,
        },
    }
    write_to_jsonl(validation_dict)

    if upload:
        upload_validation_data(
            validation_dict=validation_dict,
            instance_id=instance_id,
            context_id=context_id,
            delete=False,
        )

def write_to_jsonl(validation_dict, prefix="watchers_", file_name=None, save_dir=None):
    """Write one validation record to a JSON Lines file.

    Args:
        validation_dict: The record to write as a single line.
        prefix: Prefix of the generated file name.
        file_name: Explicit file name. When omitted, a name of the form
            ``<prefix><random sha1>.jsonl`` is generated. (This argument used
            to be accepted and then silently overwritten.)
        save_dir: Directory to write into. Defaults to the original
            hard-coded validation directory so existing calls behave the same.
    """
    if save_dir is None:
        # NOTE(review): machine-specific absolute path; pass ``save_dir`` when
        # running anywhere else.
        save_dir = "/Users/shashank/Workspace/Orgs/Ether/ai-engine/tests/recommendation_service/validation/"
    if file_name is None:
        file_name = prefix + hash_sha_object() + ".jsonl"

    with jsonlines.open(os.path.join(save_dir, file_name), mode="w") as writer:
        writer.write(validation_dict)

def upload_validation_data(
    validation_dict, instance_id, context_id, prefix="watchers_", delete=False
):
    """Write a validation record to a local JSONL file and upload it to S3.

    Args:
        validation_dict: The record to write and upload.
        instance_id: Instance id embedded in the file name.
        context_id: Accepted for interface compatibility; currently unused.
        prefix: Prefix of the generated file name.
        delete: Remove the local file after a successful upload.
    """
    validation_id = hash_sha_object()
    file_name = prefix + instance_id + "_" + validation_id + ".jsonl"
    with jsonlines.open(file_name, mode="w") as writer:
        writer.write(validation_dict)

    s3_path = "validation/recommendations/" + file_name

    try:
        # NOTE(review): ``s3_client`` is not defined anywhere in the visible
        # notebook (the boto3 client there is named ``conn``), so this call
        # presumably fails with NameError until a client exposing
        # ``upload_to_s3`` is provided — confirm.
        s3_client.upload_to_s3(
            file_name=file_name, object_name=s3_path
        )
    except Exception as e:
        # Deliberately best-effort: report the failure and keep the local
        # file so the record is not lost.
        print(e)
    else:
        if delete:
            # Only after a successful upload: remove the local JSONL copy.
            local_path = Path(file_name).absolute()
            if os.path.exists(local_path):
                os.remove(local_path)
        
def hash_sha_object() -> str:
    """Return the SHA-1 hex digest of a freshly generated random UUID4.

    Every call yields a new 40-character lowercase hexadecimal string; it is
    used as a unique suffix for validation file names.
    """
    random_uuid_text = str(uuid.uuid4())
    return hashlib.sha1(random_uuid_text.encode()).hexdigest()

def normalize(score, scores_list):
    """Mean-normalize ``score`` against ``scores_list``.

    Computes ``(score - mean) / (max - min)`` over ``scores_list``.

    Args:
        score: The value to normalize.
        scores_list: Non-empty sequence of reference scores.

    Returns:
        The normalized score, or ``0.0`` when every reference score is
        identical (zero range), instead of the nan/inf a division by zero
        would produce.

    Raises:
        ValueError: If ``scores_list`` is empty (raised by ``np.max``).
    """
    score_range = np.max(scores_list) - np.min(scores_list)
    if score_range == 0:
        return 0.0
    return (score - np.mean(scores_list)) / score_range

In [321]:
# Send the recommendation computed above to Slack as a manual smoke test;
# "s2-test" is only a label shown in the message header.
post_to_slack(
    instance_id="s2-test",
    segment_keyphrase_list=query_keywords,
    user_list=list(top_user_dict.keys()),
    user_scores=list(top_user_dict.values()),
    suggested_user_list= suggested_users,
    segment_user_names = segment_user_names,
    word_list=top_words
)

In [88]:
# NOTE(review): this rebinds the notebook-level segment_user_names (a list of
# names computed earlier) to an empty string, so the record is written with
# positiveLabels == "" and any later cell sees the clobbered value. Confirm
# whether an empty list or the real speaker names were intended.
segment_user_names = ""
# Persist the recommendation locally as a validation record (no S3 upload).
make_validation_data(
    input_query=query_keywords,
    user_list=list(top_user_dict.keys()),
    user_scores=list(top_user_dict.values()),
    suggested_user_list=suggested_users,
    word_list=top_words,
    segment_users=segment_user_names,
    upload=False
)