In [1]:
import os, sys
sys.path.insert(0, '/Users/shashank/Workspace/Orgs/Ether/ai-engine/services/recommendations')

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import json as js
from scipy.spatial.distance import cosine

In [227]:
from vectorize import Vectorizer
from lsh import Table, LSH, WordSearch, UserSearch, MinHash
from graph_query import DgraphClient

In [6]:
url = "111.93.155.194:9080"

In [7]:
vectorizer = Vectorizer()
dg = DgraphClient(url=url)

In [8]:
def to_json(data, filename):
    with open(filename + ".json", "w", encoding="utf-8") as f_:
        js.dump(data, f_, ensure_ascii=False, indent=4)

def read_json(json_file):
    with open(json_file) as f_:
        meeting = js.load(f_)
    return meeting

In [9]:
user_kw_query = """
query mlChannelUserKw($n: string, $t: int) {
    MLchannelShashankKw(func: type("Channel"))
  @filter(eq(name, "ml-ai")) @cascade{
        uid
        xid
    	hasContext {
          xid
          attribute
          associatedMind {
            name
            type
          }
          hasMeeting (first: $t){
            xid
            hasSegment {
              authoredBy @filter(anyofterms(name, $n)) {
                name
                xid
              }
              hasKeywords {
                values
              }
            }
          }
        }
   }
}
"""

In [10]:
variables = {"$n": "shashank@etherlabs.io", "$t": "60"}
user_kw_resp = dg.perform_query(query=user_kw_query, variables=variables)

In [29]:
user_kw_resp

{'MLchannelShashankKw': [{'uid': '0x403c3',
   'xid': '898068b8-3c18-44c0-9aa0-f6135edd8f44',
   'hasContext': {'xid': '01DBB3SN99AVJ8ZWJDQ57X9TGX',
    'attribute': 'contextId',
    'associatedMind': [{'name': 'AI', 'type': 'domain'}],
    'hasMeeting': [{'xid': 'fdabb4c7-78ad-4ccd-af7b-17b89ce46534',
      'hasSegment': [{'authoredBy': {'name': 'Shashank',
         'xid': '7e7ccbba-232d-411a-a95a-d3f244a35f40'},
        'hasKeywords': {'values': ['fall back on segment wise',
          'domains groups',
          'algorithmic side',
          'mechanism or remove',
          'edge case percentage']}},
       {'authoredBy': {'name': 'Shashank',
         'xid': '7e7ccbba-232d-411a-a95a-d3f244a35f40'},
        'hasKeywords': {'values': ['noisy and stuff',
          'Keys',
          'Keys his level',
          'problem is like meet call entities',
          'kind of intersection',
          'candida graphs']}},
       {'authoredBy': {'name': 'Shashank',
         'xid': '7e7ccbba-232d-411

In [55]:
def format_response(resp, function_name):
    user_id_dict = {}
    user_kw_dict = {}
    user_kw = []

    for info in resp[function_name]:
        context_obj = info["hasContext"]
        meeting_obj = context_obj["hasMeeting"]
        for m_info in meeting_obj:
            segment_obj = m_info["hasSegment"]
            for segment_info in segment_obj:         
                try:
                    user_id = segment_info.get("authoredBy")["xid"]
                    user_name = segment_info.get("authoredBy")["name"]
                    user_id_dict.update({
                        user_id: user_name
                    })

                    keyword_object = segment_info["hasKeywords"]
                    user_kw.extend(list(set(keyword_object["values"])))
                except Exception as e:
                    print(e)
                    continue
            
                user_kw_dict.update({
                            user_id: user_kw
                        })
    
    return user_kw_dict, user_id_dict

In [56]:
user_kw_dict, user_id_dict = format_response(resp=user_kw_resp, function_name='MLchannelShashankKw')

In [57]:
user_kw_dict, user_id_dict

({'7e7ccbba-232d-411a-a95a-d3f244a35f40': ['fall back on segment wise',
   'domains groups',
   'mechanism or remove',
   'algorithmic side',
   'edge case percentage',
   'noisy and stuff',
   'Keys his level',
   'problem is like meet call entities',
   'candida graphs',
   'kind of intersection',
   'Keys',
   'copy paste',
   'free download',
   'big HR',
   'edge case',
   'big engineering community',
   'Collins audio',
   'Collins',
   'Grass will distort the location',
   'entire stage',
   'Segment ideas',
   'Segment',
   'meeting we get key feature vectors',
   'good approach',
   'key phrases',
   'key phrase service',
   'serialize them and store',
   'data indygraf',
   'recommended Watchers',
   'the entire stage',
   'key feature vectors',
   'Watchers',
   'lot more flexible',
   'context study',
   'query on existing data',
   'Freddy',
   'context instance',
   'figure out a proper',
   'Neptune',
   'mapping uuid',
   'problems with new ideas',
   'uids map',
   'da

In [334]:
ref_user_kw_query = """
query mlChannelUserKw($t: int) {
    mlChannelUserKw(func: type("Channel"))
  @filter(eq(name, "ml-ai")) @cascade{
        uid
        xid
        belongsTo @filter(eq(name, "ether-labs")) {
            name
            attribute
        }
    	hasContext {
          xid
          attribute
          associatedMind {
            name
            type
          }
          hasMeeting (first: $t){
            xid
            hasSegment {
              authoredBy {
                name
                xid
              }
              hasKeywords {
                values
              }
            }
          }
        }
   }
}
"""

In [335]:
user_name_query = """
{
  test(func: eq(xid, $n)){
    name
    xid
    attribute
  }
}
"""

In [337]:
var = {"$t": "20"}
ref_user_kw_resp = dg.perform_query(query=ref_user_kw_query, variables=var)

In [338]:
ref_user_kw_resp

{'mlChannelUserKw': [{'uid': '0x403c3',
   'xid': '898068b8-3c18-44c0-9aa0-f6135edd8f44',
   'belongsTo': {'name': 'ether-labs', 'attribute': 'workspaceId'},
   'hasContext': {'xid': '01DBB3SN99AVJ8ZWJDQ57X9TGX',
    'attribute': 'contextId',
    'associatedMind': [{'name': 'AI', 'type': 'domain'}],
    'hasMeeting': [{'xid': '5841513d-20bd-4745-8bcc-8626e4602477',
      'hasSegment': [{'authoredBy': {'name': 'Venkata Dikshit',
         'xid': 'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9'},
        'hasKeywords': {'values': ['step process',
          'scoring mechanism',
          'small step']}},
       {'authoredBy': {'name': 'Venkata Dikshit',
         'xid': 'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9'},
        'hasKeywords': {'values': ['data set',
          'AG pick',
          'engineering bioengineering',
          'cement SE model']}},
       {'authoredBy': {'name': 'Venkata Dikshit',
         'xid': 'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9'},
        'hasKeywords': {'values': ['Production O

In [223]:
def format_reference_response(resp, function_name):
    user_dict = {}
    user_kw = []

    for info in resp[function_name]:
        context_obj = info["hasContext"]
        meeting_obj = context_obj["hasMeeting"]
        for m_info in meeting_obj:
            segment_obj = m_info["hasSegment"]
            for segment_info in segment_obj: 
                segment_kw = []
                try:
                    user_id = segment_info.get("authoredBy")["xid"]
                    user_name = segment_info.get("authoredBy")["name"]
                    
                    try:
                        u_id = user_dict[user_id]
                    except KeyError:
                        user_dict.update({
                            user_id: {
                                    "name": user_name,
                                    "keywords": None
                                }
                            })        

                    keyword_object = segment_info["hasKeywords"]
                    segment_kw.extend(list(set(keyword_object["values"])))
                    
                    user_kw_list = user_dict[user_id].get("keywords")
                    if user_kw_list is not None:
                        user_kw_list.extend(segment_kw)
                        user_dict[user_id].update({
                                "keywords": list(set(user_kw_list))
                            })
                    else:
                        user_dict[user_id].update({
                                "keywords": segment_kw
                            })
                except Exception as e:
                    print(e)
                    continue
    
    return user_dict

In [224]:
ref_user_dict = format_reference_response(resp=ref_user_kw_resp, function_name='mlChannelUserKw')

In [225]:
len(ref_user_dict.keys())

13

In [226]:
ref_user_dict

{'0bbbfe84-c661-45af-8d0f-fcd5258bba38': {'name': 'Parshwa Nemi Jain',
  'keywords': ['Karthik',
   'single participant',
   'call in that case',
   'Protest at button',
   'unable for this participant',
   'send the issue',
   'meeting is terminate for Andre',
   'slack right call',
   'class for the integration',
   'Gypsy',
   'start like basic display',
   'work to pay',
   'decide to open the app store',
   'interface and a parent class',
   'common sections',
   'lend The Columns',
   'instance from a control',
   'host SDK',
   'records request',
   'apply loaded on top',
   'Dashboard',
   'yesterday I gave bill',
   'call from web',
   'Name',
   'Angle that video',
   'joined as true something real will investigate',
   'product manager',
   'Html',
   'discussion data',
   'release for bill',
   'decides whether that speaker component',
   'dashboard page user',
   'Button',
   'reproduce that like video',
   'audio tag',
   'Docker file',
   'Chinese inside provider time',


In [355]:
ml_ref_user_dict = format_reference_response(resp=ref_user_kw_resp, function_name='mlChannelUserKw')

In [356]:
ml_ref_user_dict

{'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9': {'name': 'Venkata Dikshit',
  'keywords': ['error app basis',
   'key phases',
   'tomorrow on the trolls',
   'Richard',
   'Production Office',
   'long update',
   'validation set',
   'weird case',
   'retain that model',
   'wait a queue',
   'way this month',
   'fifteen minutes',
   'Roc stories',
   'full sir',
   'false negatives',
   'segment as an input',
   'placeholder need to work',
   'increase it to twelve seconds',
   'graph files',
   'entity list',
   'second step',
   'segment feature Vector',
   'Acetone Production Data',
   'segment keep party',
   'category rock',
   'a Next Step',
   'people could keep test',
   'key places',
   'snapshot of the model',
   'stick the topics',
   'current scope charcoal Services',
   'correct form',
   'keywords in a description field',
   'second number',
   'handle the exception',
   'wild customer data',
   'law of large numbers',
   'chapter communities',
   'deployment of the key plac

In [357]:
new_ref_user_dict = {**ref_user_dict, **ml_ref_user_dict}

In [377]:
from copy import deepcopy
new_ref_user_dict = deepcopy(ref_user_dict)
for user_id, it in ml_ref_user_dict.items():
    name = it["name"]
    ml_keyword_list = it["keywords"]
    if user_id in ref_user_dict.keys():
        keyword_list = ref_user_dict[user_id]["keywords"]
        keyword_list.extend(ml_keyword_list)
        new_ref_user_dict[user_id]["keywords"] = keyword_list
    else:
        new_ref_user_dict.update({
            user_id: {
                "name": name,
                "keywords": ml_keyword_list
            }
        })

In [378]:
len(new_ref_user_dict)

15

In [379]:
for k, v in new_ref_user_dict.items():
    print(v["name"])

Parshwa Nemi Jain
Vamshi Krishna
mithun
Franklin Ferdinand
Nisha Yadav
Shashank
Trishanth Diwate
Deep Moradia
Venkata Dikshit
Vani
Reagan Rewop
Karthik Muralidharan
Krishna Sai
Shubham
Arjun Kini


In [513]:
len(new_ref_user_dict)

14

In [514]:
to_json(new_ref_user_dict, filename="reference_prod_user")

## Remove users (if any) that are being compared against

In [None]:
for k in modified_ref_user_id_dict.keys():
    if k in user_kw_dict.keys():
        del ref_user_kw_dict[k]

In [409]:
ref_user_kw_dict = {k: new_ref_user_dict[k]["keywords"] for k in new_ref_user_dict.keys()}

In [402]:
len(new_ref_user_dict)

14

In [401]:
del new_ref_user_dict['2c944512-17a0-4912-9a16-6a3408da807c']

In [410]:
len(ref_user_kw_dict)

14

In [509]:
import pickle
with open('reference_user_kw_vector.pickle', 'rb') as f_:
    user_vectors = pickle.load(f_)

In [411]:
us = UserSearch(input_dict=ref_user_kw_dict, vectorizer=Vectorizer(), user_vector_data=None, num_buckets=8, hash_size=4)

In [412]:
us.featurize(write=True)

In [414]:
us.num_features_in_input

{'0bbbfe84-c661-45af-8d0f-fcd5258bba38': 337,
 '1a215425-8449-4fca-ba95-7d768b595b80': 473,
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6': 304,
 'c66797a9-2e6d-46ad-9573-926e57f7dac3': 217,
 '7e7ccbba-232d-411a-a95a-d3f244a35f40': 1707,
 '75bdf310-110b-4b8f-ab88-b16fafce920e': 204,
 'b4a57b25-de68-446c-ac99-0f856d3fe4d5': 292,
 'b1e8787a-9a1f-4859-ac11-cbb6a8124fd9': 3507,
 '65bb8395-2fb5-4409-a4bb-59bb707f1375': 162,
 'fb52cb66-3aec-4795-aee3-8ccfd904d315': 1011,
 '62b6ae1d-7f83-4b0b-b205-5f7c72bc3368': 1048,
 '8d6db5f7-d9b7-4c54-ba38-fe710ffcaf3f': 298,
 'ecfeeb75-7f0a-4d47-af1e-bd513929264a': 173,
 '81a3e154-6937-4fce-ba1c-f972faa209b2': 439}

In [415]:
alt_keywords = ['Google Calendar', 
                'Platform Mitten', 
                'GPU', 
                'API changes yesterday', 
                'ICS', 
                'topics in a structured format', 
                'Golang', 
                'parse this ICS message', 
                'store structured data', 
                'occur then the meeting URL']

In [392]:
user_keywords = ref_user_dict['b1e8787a-9a1f-4859-ac11-cbb6a8124fd9'].get('keywords')

In [416]:
result = us.query(kw_list=alt_keywords)

num results 19933


In [417]:
for r in sorted(result, key=result.get, reverse=True):
    print(r, new_ref_user_dict[r]["name"], result[r])

84fbaa66-a247-4ea2-9ae0-53f3a2e519d6 mithun 2.3651315789473686
75bdf310-110b-4b8f-ab88-b16fafce920e Trishanth Diwate 2.3284313725490198
7e7ccbba-232d-411a-a95a-d3f244a35f40 Shashank 2.0615114235500878
81a3e154-6937-4fce-ba1c-f972faa209b2 Arjun Kini 2.050113895216401
65bb8395-2fb5-4409-a4bb-59bb707f1375 Vani 2.04320987654321
0bbbfe84-c661-45af-8d0f-fcd5258bba38 Parshwa Nemi Jain 2.026706231454006
fb52cb66-3aec-4795-aee3-8ccfd904d315 Reagan Rewop 2.0217606330365974
b4a57b25-de68-446c-ac99-0f856d3fe4d5 Deep Moradia 2.0136986301369864
c66797a9-2e6d-46ad-9573-926e57f7dac3 Nisha Yadav 1.967741935483871
62b6ae1d-7f83-4b0b-b205-5f7c72bc3368 Karthik Muralidharan 1.9561068702290076
8d6db5f7-d9b7-4c54-ba38-fe710ffcaf3f Krishna Sai 1.895973154362416
b1e8787a-9a1f-4859-ac11-cbb6a8124fd9 Venkata Dikshit 1.8388936412888508
1a215425-8449-4fca-ba95-7d768b595b80 Vamshi Krishna 1.8350951374207187
ecfeeb75-7f0a-4d47-af1e-bd513929264a Shubham 1.8208092485549132


In [351]:
m_result = [result[r] for r in result]

In [353]:
np.mean(m_result)

0.7579684035201257

## Ablation tests

In [508]:
num_buckets_list = [2, 4, 6, 8, 10, 12, 14]
hash_size_list = [3, 4, 6, 8, 10, 12, 15, 20]

In [511]:
for h in hash_size_list:
    for num in num_buckets_list:
        print()
        print("Bucket size: {}, Hash size: {}".format(num, h))
        print()
        us = UserSearch(input_dict=ref_user_kw_dict, vectorizer=Vectorizer(), user_vector_data=user_vectors, num_buckets=num, hash_size=h)
        us.featurize(write=False)
        result = us.query(kw_list=alt_keywords)
        for r in sorted(result, key=result.get, reverse=True):
            print(r, new_ref_user_dict[r]["name"], result[r])


Bucket size: 2, Hash size: 3

num results 49803
0bbbfe84-c661-45af-8d0f-fcd5258bba38 Parshwa Nemi Jain 5.231454005934718
b4a57b25-de68-446c-ac99-0f856d3fe4d5 Deep Moradia 5.154109589041096
84fbaa66-a247-4ea2-9ae0-53f3a2e519d6 mithun 5.1019736842105265
65bb8395-2fb5-4409-a4bb-59bb707f1375 Vani 5.061728395061729
62b6ae1d-7f83-4b0b-b205-5f7c72bc3368 Karthik Muralidharan 4.983778625954199
7e7ccbba-232d-411a-a95a-d3f244a35f40 Shashank 4.975395430579965
fb52cb66-3aec-4795-aee3-8ccfd904d315 Reagan Rewop 4.954500494559841
81a3e154-6937-4fce-ba1c-f972faa209b2 Arjun Kini 4.954441913439635
c66797a9-2e6d-46ad-9573-926e57f7dac3 Nisha Yadav 4.944700460829493
ecfeeb75-7f0a-4d47-af1e-bd513929264a Shubham 4.8497109826589595
75bdf310-110b-4b8f-ab88-b16fafce920e Trishanth Diwate 4.8431372549019605
b1e8787a-9a1f-4859-ac11-cbb6a8124fd9 Venkata Dikshit 4.775021385799829
8d6db5f7-d9b7-4c54-ba38-fe710ffcaf3f Krishna Sai 4.701342281879195
1a215425-8449-4fca-ba95-7d768b595b80 Vamshi Krishna 4.687103594080338



## Filteration

In [418]:
mean_result_score = np.mean([result[r] for r in result])

In [420]:
similar_users = {k: result[k] for k in sorted(result, key=result.get, reverse=True) if result[k] >= mean_result_score}

In [421]:
similar_users

{'84fbaa66-a247-4ea2-9ae0-53f3a2e519d6': 2.3651315789473686,
 '75bdf310-110b-4b8f-ab88-b16fafce920e': 2.3284313725490198,
 '7e7ccbba-232d-411a-a95a-d3f244a35f40': 2.0615114235500878,
 '81a3e154-6937-4fce-ba1c-f972faa209b2': 2.050113895216401,
 '65bb8395-2fb5-4409-a4bb-59bb707f1375': 2.04320987654321,
 '0bbbfe84-c661-45af-8d0f-fcd5258bba38': 2.026706231454006,
 'fb52cb66-3aec-4795-aee3-8ccfd904d315': 2.0217606330365974}

In [425]:
similar_users_kw_dict = {k: new_ref_user_dict[k]["keywords"] for k in similar_users.keys()}

In [432]:
similar_users_kw_list = list(set([words for user_kw in similar_users_kw_dict.values() for words in user_kw]))

In [433]:
similar_users_kw_list

['Gypsy meet code',
 'URL',
 'key phases',
 'tomorrow on the trolls',
 'Library',
 'call in that case',
 'Server',
 'check from code',
 'today and sink',
 'GPT',
 'hyphens with video ideas',
 'aggregation of sentences',
 'Diablo Cody love instant',
 'long update',
 'Computing feature vectors',
 'retain that model',
 'God is a joke',
 'twelve so they offer virtual',
 'bills and lot',
 'fifteen minutes',
 'Roc stories',
 'search engine',
 'Notebook pass',
 'slack right call',
 'proximate search kind',
 'start like basic display',
 'segment idea',
 'engineering aspect',
 'decide to open the app store',
 'graph files',
 'set initial schema',
 'entity list',
 'second step',
 'sing this calls data',
 'meeting slices cover',
 'segment is a permanent',
 'category rock',
 'Fenton',
 'bit confused',
 'ICS',
 'key places',
 'apply loaded on top',
 'correct form',
 'keywords in a description field',
 'good day',
 'glass used in production mirror',
 'ideal comparison',
 'call from web',
 'properly 

In [435]:
ws = WordSearch(input_list=similar_users_kw_list, vectorizer=Vectorizer(), num_buckets=10)

In [436]:
ws.featurize()

In [437]:
sim_keyword_result = ws.query(kw_list=alt_keywords)

In [447]:
for sw in sorted(sim_keyword_result, key=sim_keyword_result.get, reverse=True):
    if sw not in alt_keywords:
        for user_id, kw_list in similar_users_kw_dict.items():
            if sw in kw_list:
                print(sw, new_ref_user_dict[user_id]["name"], sim_keyword_result[sw])

golang parser Trishanth Diwate 0.0234375
offline queries Reagan Rewop 0.021484375
API service Reagan Rewop 0.021484375
current schema Shashank 0.021484375
slack with two sections mithun 0.01953125
publish an event saying group Shashank 0.01953125
start calling Lambda Shashank 0.01953125
work different Lambda Shashank 0.01953125
expose apis mithun 0.017578125
websocket message mithun 0.017578125
second the API mithun 0.017578125
preference to organization Shashank 0.017578125
send segments Shashank 0.017578125
language code Arjun Kini 0.017578125
set initial schema Shashank 0.017578125
channel user keywords Shashank 0.017578125
rest API Parshwa Nemi Jain 0.017578125
text over the tokens Arjun Kini 0.017578125
combination frameworks Shashank 0.017578125
DeGraff client Shashank 0.017578125
API Parshwa Nemi Jain 0.017578125
like high level topics Shashank 0.015625
schema is fine weather Shashank 0.015625
configuration and production Reagan Rewop 0.015625
group write info Reagan Rewop 0.015

In [470]:
all_similar_user_freq_list = []
similar_user_freq_list = []
mean_word_sim = np.mean([sim_keyword_result[r] for r in sim_keyword_result])
for sw in sorted(sim_keyword_result, key=sim_keyword_result.get, reverse=True):
    if sw not in alt_keywords:
        for user_id, kw_list in similar_users_kw_dict.items():
            if sw in kw_list:
                all_similar_user_freq_list.append(user_id)
                if sim_keyword_result[sw] >= mean_word_sim:
                    similar_user_freq_list.append(user_id)
                # print(sw, new_ref_user_dict[user_id]["name"], sim_keyword_result[sw])

In [449]:
similar_user_freq_list

['75bdf310-110b-4b8f-ab88-b16fafce920e',
 'fb52cb66-3aec-4795-aee3-8ccfd904d315',
 'fb52cb66-3aec-4795-aee3-8ccfd904d315',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6',
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6',
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '81a3e154-6937-4fce-ba1c-f972faa209b2',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '0bbbfe84-c661-45af-8d0f-fcd5258bba38',
 '81a3e154-6937-4fce-ba1c-f972faa209b2',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '0bbbfe84-c661-45af-8d0f-fcd5258bba38',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 '7e7ccbba-232d-411a-a95a-d3f244a35f40',
 'fb52cb66-3aec-4795-aee3-8ccfd904d315',
 'fb52cb66-3aec-

In [471]:
from collections import Counter

In [472]:
af = Counter(all_similar_user_freq_list)

In [473]:
af

Counter({'75bdf310-110b-4b8f-ab88-b16fafce920e': 150,
         'fb52cb66-3aec-4795-aee3-8ccfd904d315': 426,
         '7e7ccbba-232d-411a-a95a-d3f244a35f40': 743,
         '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6': 230,
         '81a3e154-6937-4fce-ba1c-f972faa209b2': 338,
         '0bbbfe84-c661-45af-8d0f-fcd5258bba38': 264,
         '65bb8395-2fb5-4409-a4bb-59bb707f1375': 126})

In [474]:
f = Counter(similar_user_freq_list)

In [475]:
f

Counter({'75bdf310-110b-4b8f-ab88-b16fafce920e': 60,
         'fb52cb66-3aec-4795-aee3-8ccfd904d315': 189,
         '7e7ccbba-232d-411a-a95a-d3f244a35f40': 343,
         '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6': 106,
         '81a3e154-6937-4fce-ba1c-f972faa209b2': 134,
         '0bbbfe84-c661-45af-8d0f-fcd5258bba38': 101,
         '65bb8395-2fb5-4409-a4bb-59bb707f1375': 51})

In [490]:
total_res = np.sum([s for s in af.values()])
total_res = 1
final_counter = {u: (f[u]/af[u])*total_res for u in af.keys()}

    

In [491]:
final_counter

{'75bdf310-110b-4b8f-ab88-b16fafce920e': 0.4,
 'fb52cb66-3aec-4795-aee3-8ccfd904d315': 0.44366197183098594,
 '7e7ccbba-232d-411a-a95a-d3f244a35f40': 0.4616419919246299,
 '84fbaa66-a247-4ea2-9ae0-53f3a2e519d6': 0.4608695652173913,
 '81a3e154-6937-4fce-ba1c-f972faa209b2': 0.39644970414201186,
 '0bbbfe84-c661-45af-8d0f-fcd5258bba38': 0.38257575757575757,
 '65bb8395-2fb5-4409-a4bb-59bb707f1375': 0.40476190476190477}

In [482]:
new_ref_user_dict['7e7ccbba-232d-411a-a95a-d3f244a35f40']["name"]

'Shashank'

In [492]:
for u_id, score in final_counter.items():
    print(new_ref_user_dict[u_id]["name"], score)

Trishanth Diwate 0.4
Reagan Rewop 0.44366197183098594
Shashank 0.4616419919246299
mithun 0.4608695652173913
Arjun Kini 0.39644970414201186
Parshwa Nemi Jain 0.38257575757575757
Vani 0.40476190476190477


## Explainability

In [494]:
from nltk import word_tokenize, pos_tag, sent_tokenize

In [500]:
pos_list= ['NN', 'NNS', 'NNP']

In [505]:
pos_ = ['NN', 'NNS']

In [506]:
pos_ in pos_list

False

In [507]:
for sw in sorted(sim_keyword_result, key=sim_keyword_result.get, reverse=True):
    if sw not in alt_keywords:
        for user_id, kw_list in similar_users_kw_dict.items():
            if sw in kw_list:
                print(sw, new_ref_user_dict[user_id]["name"], sim_keyword_result[sw], pos_tag(word_tokenize(sw)))

golang parser Trishanth Diwate 0.0234375 [('golang', 'NN'), ('parser', 'NN')]
offline queries Reagan Rewop 0.021484375 [('offline', 'NN'), ('queries', 'NNS')]
API service Reagan Rewop 0.021484375 [('API', 'NNP'), ('service', 'NN')]
current schema Shashank 0.021484375 [('current', 'JJ'), ('schema', 'NN')]
slack with two sections mithun 0.01953125 [('slack', 'NN'), ('with', 'IN'), ('two', 'CD'), ('sections', 'NNS')]
publish an event saying group Shashank 0.01953125 [('publish', 'VB'), ('an', 'DT'), ('event', 'NN'), ('saying', 'VBG'), ('group', 'NN')]
start calling Lambda Shashank 0.01953125 [('start', 'NN'), ('calling', 'VBG'), ('Lambda', 'NNP')]
work different Lambda Shashank 0.01953125 [('work', 'NN'), ('different', 'JJ'), ('Lambda', 'NNP')]
expose apis mithun 0.017578125 [('expose', 'RB'), ('apis', 'NN')]
websocket message mithun 0.017578125 [('websocket', 'NN'), ('message', 'NN')]
second the API mithun 0.017578125 [('second', 'JJ'), ('the', 'DT'), ('API', 'NNP')]
preference to organi

In [517]:
top_words = []
for sw in sorted(sim_keyword_result, key=sim_keyword_result.get, reverse=True):
    if sw not in alt_keywords:
        top_words.append(sw)

In [518]:
top_words

['golang parser',
 'offline queries',
 'API service',
 'current schema',
 'slack with two sections',
 'publish an event saying group',
 'start calling Lambda',
 'work different Lambda',
 'expose apis',
 'websocket message',
 'second the API',
 'preference to organization',
 'send segments',
 'language code',
 'set initial schema',
 'channel user keywords',
 'rest API',
 'text over the tokens',
 'combination frameworks',
 'DeGraff client',
 'API',
 'like high level topics',
 'schema is fine weather',
 'configuration and production',
 'group write info',
 'schema to platform side',
 'segment keywords',
 'handle all these grouping segments',
 'topic extraction service',
 'fallback mechanism',
 'set a defined schema',
 'API to just connect',
 'slack Channel keywords',
 'check from code',
 'extract entities key phrases',
 'hierarchical communities',
 'second one create Lambda service',
 'assign a different identifiers',
 'current script',
 'official of features',
 'lookup CSV',
 'groups and

In [519]:
top_10 = top_words[:10]

In [522]:
def filter_pos(word_list):
    filtered_word = []
    
    for word in word_list:
        pos_word = pos_tag(word_tokenize(word))
        counter = 0
        for tags in pos_word:
            p = tags[1]
            if p in pos_list:
                counter += 1

        if counter == len(word_tokenize(word)):
            filtered_word.append(word)
    return filtered_word

In [523]:
filter_pos(top_10)

['golang parser', 'offline queries', 'API service', 'websocket message']