# Perform approximate search against reference features

## Initialize Recommendation project

In [2]:
import os, sys
# Put the recommendation service directory first on the import path
# (presumably where the project modules imported in the cells below live).
# NOTE(review): this is an absolute path on one developer's machine; the
# notebook will not run elsewhere without editing it.
sys.path.insert(0, '/Users/shashank/Workspace/Orgs/Ether/ai-engine/services/recommendation/')

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import numpy as np
import json as js
from scipy.spatial.distance import cosine
import pickle
from fuzzywuzzy import process, fuzz
from collections import Counter, OrderedDict
from nltk import pos_tag, word_tokenize
import pickle

In [5]:
from vectorize import Vectorizer
from watchers import RecWatchers
from explain import Explainability
from utils import Utils

In [6]:
def to_json(data, filename):
    """Write ``data`` as pretty-printed JSON to ``<filename>.json``.

    The ``.json`` suffix is appended to ``filename``. The file is written
    as UTF-8 with a 4-space indent, and non-ASCII characters are kept as-is
    rather than being escaped.
    """
    target_path = filename + ".json"
    with open(target_path, mode="w", encoding="utf-8") as out_file:
        js.dump(data, out_file, indent=4, ensure_ascii=False)

def read_json(json_file):
    """Load and return the JSON document stored at ``json_file``.

    The file is decoded as UTF-8 so that files produced by ``to_json``
    (which writes UTF-8 with unescaped non-ASCII characters) round-trip
    correctly even when the platform default encoding is not UTF-8.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The parsed JSON value (dict, list, str, number, bool or None).
    """
    with open(json_file, encoding="utf-8") as f_:
        return js.load(f_)

In [7]:
# Part-of-speech tags of interest: NN, NNS and NNP are the Penn Treebank tags
# for singular, plural and proper (singular) nouns.
# NOTE(review): not referenced in any of the visible cells — presumably meant
# for filtering nltk.pos_tag output; confirm or remove.
pos_list = ["NN", "NNS", "NNP"]

## Initialize the Sentence Encoder Lambda client

In [17]:
from boto3 import client, session
from botocore.client import Config

In [18]:
# botocore client configuration used for the Lambda client below:
# 180 s connect timeout, 300 s read timeout, retries capped via
# max_attempts=2, and all calls pinned to the us-east-1 region.
aws_config = Config(
        connect_timeout=180,
        read_timeout=300,
        retries={"max_attempts": 2},
        region_name="us-east-1",
    )

In [19]:
lambda_client = client("lambda", config=aws_config)

In [20]:
vec = Vectorizer(lambda_client, "sentence-encoder-lambda")

In [21]:
# Build a boto3 session from credentials in the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables (os.getenv yields None when a
# variable is unset), then create the S3 client that is handed to
# RecWatchers further down as `s3_client`.
s3_sess = session.Session(aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"))
conn = s3_sess.client("s3")

In [23]:
s3_bucket = "io.etherlabs.staging2.contexts"

## Load reference features

In [41]:
# Load the two pickled reference-feature files from the working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
# NOTE(review): only the keyword vectors (second file) are used in the
# visible cells; the text vectors are loaded but not referenced again.
with open("reference_user_text_vector.pickle", 'rb') as f_:
    reference_user_vector_data = pickle.load(f_)

with open("reference_user_kw_vector.pickle", 'rb') as f_:
    reference_user_kw_vector_data = pickle.load(f_)
    
# Reference user records, keyed by user id (the ids index into this dict and
# its "name" field in a later cell).
ref_user_dict = read_json("reference_prod_user.json")
# Same keys, but each value reduced to that record's "keywords" entry.
ref_user_info_dict = {k: ref_user_dict[k]["keywords"] for k in ref_user_dict.keys()}

In [98]:
9000**(1/2)

94.86832980505137

In [99]:
np.log2(9000)

13.1357092861044

In [100]:
# Values forwarded to RecWatchers below as `num_buckets` and `hash_size`.
# NOTE(review): presumably locality-sensitive-hashing settings (the watcher
# later exposes `us.lsh.tables`); the sqrt / log2 scratch cells above look
# like the sizing heuristics considered — confirm how these were chosen.
num_buckets=200
hash_size=16


In [101]:
# NOTE(review): this Slack webhook URL is a credential embedded in the
# notebook source. Prefer reading it from an environment variable, and
# consider rotating this one since it has been committed in plain text.
staging_url = "https://hooks.slack.com/services/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"
# Build the watcher recommender from the reference users, their pickled
# keyword vectors (not the text vectors), the sentence-encoder vectorizer,
# the S3 client, the Slack webhook and the num_buckets / hash_size settings.
watcher_obj = RecWatchers(
            reference_user_dict=ref_user_dict,
            user_vector_data=reference_user_kw_vector_data,
            vectorizer=vec,
            s3_client=conn,
            web_hook_url=staging_url,
            num_buckets=num_buckets,
            hash_size=hash_size
            )

In [102]:
watcher_obj.us.num_features_in_input

{'0bbbfe84-c661-45af-8d0f-fcd5258bba38': 0,
 '1a215425-8449-4fca-ba95-7d768b595b80': 0,
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6': 0,
 'c66797a9-2e6d-46ad-9573-926e57f7dac3': 0,
 '2c944512-17a0-4912-9a16-6a3408da807c': 0,
 '7e7ccbba-232d-411a-a95a-d3f244a35f40': 0,
 '75bdf310-110b-4b8f-ab88-b16fafce920e': 0,
 'b4a57b25-de68-446c-ac99-0f856d3fe4d5': 0,
 'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9': 0,
 '65bb8395-2fb5-4409-a4bb-59bb707f1375': 0,
 'fb52cb66-3aec-4795-aee3-8ccfd904d315': 0,
 '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368': 0,
 'ecfeeb75-7f0a-4d47-af1e-bd513929264a': 0,
 '8d6db5f7-d9b7-4c54-ba38-fe710ffcaf3f': 0,
 '81a3e154-6937-4fce-ba1c-f972faa209b2': 0}

In [104]:
watcher_obj.featurize_reference_users()
watcher_obj.us.num_features_in_input

{'0bbbfe84-c661-45af-8d0f-fcd5258bba38': 355,
 '1a215425-8449-4fca-ba95-7d768b595b80': 608,
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6': 336,
 'c66797a9-2e6d-46ad-9573-926e57f7dac3': 250,
 '2c944512-17a0-4912-9a16-6a3408da807c': 0,
 '7e7ccbba-232d-411a-a95a-d3f244a35f40': 1016,
 '75bdf310-110b-4b8f-ab88-b16fafce920e': 247,
 'b4a57b25-de68-446c-ac99-0f856d3fe4d5': 361,
 'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9': 2164,
 '65bb8395-2fb5-4409-a4bb-59bb707f1375': 238,
 'fb52cb66-3aec-4795-aee3-8ccfd904d315': 777,
 '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368': 1194,
 'ecfeeb75-7f0a-4d47-af1e-bd513929264a': 251,
 '8d6db5f7-d9b7-4c54-ba38-fe710ffcaf3f': 313,
 '81a3e154-6937-4fce-ba1c-f972faa209b2': 490}

In [105]:
np.sum(list(watcher_obj.us.num_features_in_input.values()))

8600

## Provide query keywords or text

### Test with Keywords

In [106]:
slack_kw_input = "allow user, integration with zoom the Standalone app, slack installation, marketplace in Zoom, install like add to zoom"

In [107]:
query_text = "MapReduce is just a computing framework. HBase has nothing to do with it. That said, you can efficiently put or fetch data to/from HBase by writing MapReduce jobs. Alternatively you can write sequential programs using other HBase APIs, such as Java, to put or fetch the data. But we use Hadoop, HBase etc to deal with gigantic amounts of data, so that doesn't make much sense. Using normal sequential programs would be highly inefficient when your data is too huge. Coming back to the first part of your question, Hadoop is basically 2 things: a Distributed FileSystem (HDFS) + a Computation or Processing framework (MapReduce). Like all other FS, HDFS also provides us storage, but in a fault tolerant manner with high throughput and lower risk of data loss (because of the replication). But, being a FS, HDFS lacks random read and write access. This is where HBase comes into picture. It's a distributed, scalable, big data store, modelled after Google's BigTable. It stores data as key/value pairs. Hadoop is basically 3 things, a FS (Hadoop Distributed File System), a computation framework (MapReduce) and a management bridge (Yet Another Resource Negotiator). HDFS allows you store huge amounts of data in a distributed (provides faster read/write access) and redundant (provides better availability) manner. And MapReduce allows you to process this huge data in a distributed and parallel manner. But MapReduce is not limited to just HDFS. Being a FS, HDFS lacks the random read/write capability. It is good for sequential data access. And this is where HBase comes into picture. It is a NoSQL database that runs on top your Hadoop cluster and provides you random real-time read/write access to your data"

In [108]:
query_keywords = [
    "kind of update", 
    "summary segments", 
    "Brute Force fabric", 
    "share and dislike button", 
    "open the me tap", "invite Zoom", 
    "resume during playback", 
    "email and the", "select slack Channel", "share like the screen design"
]

In [109]:
# Override the hand-written keyphrase list with the phrases contained in
# slack_kw_input, which are separated by a comma followed by a space.
query_keywords = slack_kw_input.split(", ")

In [110]:
query_keywords

['allow user',
 'integration with zoom the Standalone app',
 'slack installation',
 'marketplace in Zoom',
 'install like add to zoom']

In [111]:
# Reference users were already featurized in an earlier cell, so the call
# is left disabled here:
# watcher_obj.featurize_reference_users()
# Run the hash-based query for the keyphrases. The result is consumed below
# as a mapping of user id -> score (it is iterated with .items()).
hash_result = watcher_obj.perform_hash_query(input_list=query_keywords)

(5, 512)
num results 4101


In [112]:
watcher_obj.us.lsh.tables[0].table

{2425: [{'label': '0bbbfe84-c661-45af-8d0f-fcd5258bba38'},
  {'label': 'c66797a9-2e6d-46ad-9573-926e57f7dac3'},
  {'label': 'b4a57b25-de68-446c-ac99-0f856d3fe4d5'},
  {'label': 'fb52cb66-3aec-4795-aee3-8ccfd904d315'},
  {'label': '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368'},
  {'label': '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368'}],
 52709: [{'label': '0bbbfe84-c661-45af-8d0f-fcd5258bba38'},
  {'label': '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6'},
  {'label': 'fb52cb66-3aec-4795-aee3-8ccfd904d315'},
  {'label': '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368'}],
 2153: [{'label': '0bbbfe84-c661-45af-8d0f-fcd5258bba38'},
  {'label': '0bbbfe84-c661-45af-8d0f-fcd5258bba38'},
  {'label': 'c66797a9-2e6d-46ad-9573-926e57f7dac3'},
  {'label': 'b4a57b25-de68-446c-ac99-0f856d3fe4d5'},
  {'label': '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368'},
  {'label': '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368'}],
 52581: [{'label': '0bbbfe84-c661-45af-8d0f-fcd5258bba38'},
  {'label': 'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9'},
  {'label': '62b6

In [113]:
# Ask the watcher for recommendations. The same keyphrase list is passed as
# both the query list and the keyword query, together with the hash-query
# result computed above and n_kw=10. Later cells use the three results as:
# top_user_dict -> user/score mapping, top_words -> list of related
# phrases, suggested_users -> list of user names.
top_user_dict, top_words, suggested_users = watcher_obj.get_recommended_watchers(input_query_list=query_keywords, 
                                                                                 input_kw_query=query_keywords,
                                                                                hash_result=hash_result,
                                                                                n_kw=10)

(5, 512)
num results 8202
(5, 512)
num results 12303
(5, 512)
num results 16404


In [114]:
# Re-key the hash-query scores by each user's display name instead of id.
# NOTE(review): users that share a name would collapse into one entry.
named_hash_result = {ref_user_dict[u]["name"]: score for u, score in hash_result.items()}

In [115]:
watcher_obj.utils.sort_dict_by_value(named_hash_result)

OrderedDict([('Vamshi Krishna', 0.7088815789473685),
             ('Parshwa Nemi Jain', 0.5943661971830986),
             ('Nisha Yadav', 0.592),
             ('Trishanth Diwate', 0.5748987854251012),
             ('Deep Moradia', 0.5678670360110804),
             ('Vani', 0.5294117647058824),
             ('mithun', 0.5208333333333334),
             ('Karthik Muralidharan', 0.49581239530988275),
             ('Shashank', 0.45570866141732286),
             ('Krishna Sai', 0.43769968051118213),
             ('Reagan Rewop', 0.41827541827541825),
             ('Venkata Dikshit', 0.3987985212569316),
             ('Arjun Kini', 0.3836734693877551),
             ('Shubham', 0.3784860557768924)])

In [116]:
top_words

['Zoom app',
 'latest sample web API',
 'golang client Library',
 'host SDK',
 'react app',
 'dashboard page user',
 'SDK launch',
 'external API close',
 'desktop apps update',
 'Docker file']

In [117]:
suggested_users

['Vamshi Krishna', 'Parshwa Nemi Jain']

## Post to Slack for testing

In [118]:
import requests
import jsonlines
import logging
import json as js
import os
from pathlib import Path
from collections import OrderedDict
import requests
import numpy as np
import uuid
import hashlib

In [119]:
def post_to_slack(
    instance_id, segment_keyphrase_list, user_list, user_scores, suggested_user_list, word_list
):
    """Post a recommendation summary to the Slack webhook in ``staging_url``.

    Every list argument is rendered with ``_reformat_list_to_text`` and
    embedded in a single Slack-markdown message.

    Args:
        instance_id: Identifier shown in the message header.
        segment_keyphrase_list: Keyphrases shown as the segment summary.
        user_list: Related users.
        user_scores: Confidence scores for the related users.
        suggested_user_list: Users finally suggested.
        word_list: Related words/phrases.

    Raises:
        requests.RequestException: If the request cannot be completed,
            including when the timeout below expires.
    """
    service_name = "recommendation-service"
    msg_text = "*Recommended users for meeting: {}* \n *Segment summary*: ```{}```\n".format(
        instance_id, _reformat_list_to_text(segment_keyphrase_list)
    )

    msg_format = "[{}]: {} *Related Users*: ```{}```\n *User Confidence Scores*: ```{}```\n *Suggested Users*: ```{}```\n *Related Words*: ```{}```".format(
        service_name,
        msg_text,
        _reformat_list_to_text(user_list),
        _reformat_list_to_text(user_scores),
        _reformat_list_to_text(suggested_user_list),
        _reformat_list_to_text(word_list),
    )

    slack_payload = {"text": msg_format}
    # Without a timeout, requests waits indefinitely on an unresponsive
    # endpoint, which would hang the notebook cell.
    response = requests.post(
        url=staging_url,
        data=js.dumps(slack_payload).encode(),
        timeout=30,
    )
    # Surface rejected posts instead of silently ignoring them; posting
    # stays best-effort, so this only logs.
    if not response.ok:
        logging.getLogger(__name__).warning(
            "Slack webhook returned HTTP %s: %s", response.status_code, response.text
        )

def _reformat_list_to_text(input_list):
    try:
        if type(input_list[0]) != str:
            formatted_text = ", ".join(
                ["{:.2f}".format(i) for i in input_list]
            )
        else:
            formatted_text = ", ".join([str(w) for w in input_list])
    except Exception as e:
        formatted_text = input_list
        logger.warning(e)

    return formatted_text

In [132]:
def make_validation_data(
    input_query,
    user_list,
    user_scores,
    suggested_user_list,
    word_list,
    segment_users,
    instance_id=None,
    context_id=None,
    segment_obj=None,
    upload=False
):
    """Build one validation record, write it locally and optionally upload it.

    The record is written with ``write_to_jsonl`` and, when ``upload`` is
    true, also passed to ``upload_validation_data``.

    Args:
        input_query: Query keyphrases; stored as both "text" and
            "meta.keyphrases".
        user_list: Recommended users, stored as "labels".
        user_scores: Scores for the recommended users.
        suggested_user_list: Users finally suggested.
        word_list: Related words.
        segment_users: Stored as "meta.positiveLabels".
        instance_id: Optional instance identifier. When None, both
            ``instance_id`` and ``context_id`` are replaced by "".
        context_id: Optional context identifier, forwarded to the upload.
        segment_obj: Currently unused.
        upload: Whether to also upload the record.
    """
    # `segment_id` used to be bound only when `instance_id` was None, so
    # passing a real instance id raised NameError. It is always "" for now.
    # NOTE(review): presumably this should be derived from `segment_obj`;
    # its schema is not visible here — confirm.
    segment_id = ""
    if instance_id is None:
        instance_id = ""
        context_id = ""

    # The original loop rebuilt this identical record once per keyphrase.
    # Build it once, and keep an empty record for an empty query as before.
    validation_dict = {}
    if len(input_query) > 0:
        validation_dict = {
            "text": input_query,
            "labels": user_list,
            "meta": {
                "instanceId": instance_id,
                "segmentId": segment_id,
                "suggestedUsers": suggested_user_list,
                "userScore": user_scores,
                "keyphrases": input_query,
                "relatedWords": word_list,
                "positiveLabels": segment_users,
            },
        }
    write_to_jsonl(validation_dict)

    if upload:
        upload_validation_data(
            validation_dict=validation_dict, instance_id=instance_id, context_id=context_id, delete=False
        )

def write_to_jsonl(validation_dict, prefix="watchers_", file_name=None, save_dir=None):
    """Write ``validation_dict`` as a single-record JSON Lines file.

    Args:
        validation_dict: Record to write.
        prefix: Prefix for the generated file name (used only when
            ``file_name`` is not given).
        file_name: Name of the output file. When None, a name of the form
            ``<prefix><random sha1>.jsonl`` is generated. This argument was
            previously accepted but always overwritten.
        save_dir: Directory to write into. When None, the original
            hard-coded validation directory is used.
    """
    if file_name is None:
        file_name = prefix + hash_sha_object() + ".jsonl"
    if save_dir is None:
        # NOTE(review): machine-specific default kept for backward
        # compatibility; pass `save_dir` when running elsewhere.
        save_dir = "/Users/shashank/Workspace/Orgs/Ether/ai-engine/tests/recommendation_service/validation/"
    with jsonlines.open(os.path.join(save_dir, file_name), mode="w") as writer:
        writer.write(validation_dict)

def upload_validation_data(
    validation_dict, instance_id, context_id, prefix="watchers_", delete=False
):
    """Write a validation record to a local JSONL file and upload it.

    The file is created in the current working directory as
    ``<prefix><instance_id>_<random sha1>.jsonl`` and uploaded under the
    key ``validation/recommendations/<file name>``.

    Args:
        validation_dict: Record to write and upload.
        instance_id: Instance identifier embedded in the file name.
        context_id: Currently unused.
        prefix: File-name prefix.
        delete: When true, remove the local file after a successful upload.
    """
    validation_id = hash_sha_object()
    file_name = prefix + instance_id + "_" + validation_id + ".jsonl"
    with jsonlines.open(file_name, mode="w") as writer:
        writer.write(validation_dict)

    s3_path = "validation/recommendations/" + file_name

    try:
        # NOTE(review): `s3_client` is not defined in the visible part of
        # this file (the boto3 client created earlier is named `conn`, and
        # `upload_to_s3` is not a boto3 client method). Confirm which
        # object is meant; until then this call fails and is reported below.
        s3_client.upload_to_s3(
            file_name=file_name, object_name=s3_path
        )
    except Exception as e:
        # Deliberately best-effort: report the failure and keep the local
        # copy so the record is not lost.
        print(e)
        return

    if delete:
        # Upload succeeded: remove the local JSONL file if it is on disk.
        local_path = Path(file_name).absolute()
        if os.path.exists(local_path):
            os.remove(local_path)
        
def hash_sha_object() -> str:
    """Return the SHA-1 hex digest of a freshly generated random UUID4.

    Each call yields a new 40-character lowercase hexadecimal string,
    used here as a unique suffix for file names.
    """
    random_id = str(uuid.uuid4())
    return hashlib.sha1(random_id.encode()).hexdigest()

def normalize(score, scores_list):
    """Mean-centre ``score`` and scale it by the range of ``scores_list``.

    Computes ``(score - mean(scores_list)) / (max(scores_list) - min(scores_list))``.

    Args:
        score: The value to normalize.
        scores_list: Non-empty collection of scores defining mean and range.

    Returns:
        The normalized score, or 0.0 when all values in ``scores_list`` are
        equal (including a single-element list). In that case the range is
        zero and the plain formula would yield nan or inf.
    """
    spread = np.max(scores_list) - np.min(scores_list)
    if spread == 0:
        return 0.0
    return (score - np.mean(scores_list)) / spread

In [121]:
# Send the recommendation computed above to the staging Slack webhook,
# labelled with the placeholder instance id "s2-test". The keys and values
# of top_user_dict supply the user list and their scores respectively.
post_to_slack(
    instance_id="s2-test",
    segment_keyphrase_list=query_keywords,
    user_list=list(top_user_dict.keys()),
    user_scores=list(top_user_dict.values()),
    suggested_user_list= suggested_users,
    word_list=top_words
)

In [133]:
# Store the same recommendation as a local validation record. No instance
# or context id is supplied and upload is disabled, so nothing is uploaded.
# The positive labels are an empty string here, i.e. no ground-truth users
# are recorded for this query.
segment_user_names = ""
make_validation_data(
    input_query=query_keywords,
    user_list=list(top_user_dict.keys()),
    user_scores=list(top_user_dict.values()),
    suggested_user_list=suggested_users,
    word_list=top_words,
    segment_users=segment_user_names,
    upload=False
)