### Section 1: Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import warnings
from transformers import pipeline
import nltk
import re
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from rake_nltk import Rake
import spacy
from typing import List
import json
from pydantic import BaseModel,Field
from groq import Groq
from typing import List
import pytextrank
from dotenv import load_dotenv
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download("punkt_tab")
warnings.filterwarnings('ignore')
load_dotenv('.env')


  from .autonotebook import tqdm as notebook_tqdm



g:\SNU\csc\Intern\Intern - HCL\Assignment\hcl\lib\site-packages


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\edith\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\edith\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\edith\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Section 2: Data Loading

In [2]:
df = pd.read_excel("Assessment_Stories.xlsx")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Story ID  20 non-null     int64 
 1   Story     20 non-null     object
dtypes: int64(1), object(1)
memory usage: 448.0+ bytes


In [3]:
df.head()

Unnamed: 0,Story ID,Story
0,597,He is talented and always ahead of the game He...
1,1254,Over the past seven years I have had the privi...
2,414,Its been only few months working with her but ...
3,2,My manager is a great leader. She is always su...
4,999,My manager consistently demonstrates remarkabl...


### Section 3: Model Loadings

In [4]:
# Keyword generator from text
pipe = pipeline("text2text-generation", model="ilsilfverskiold/bart-keyword-extractor")

Device set to use cpu


In [5]:
# Sentence Transformer - generates embeddings for the sentence/stories
model = SentenceTransformer('all-MiniLM-L6-v2')

### Section 4: Text Cleaning Function

In [6]:
stop_words = set(stopwords.words('english'))
def clean_text(text):
    """
    Cleans text by:
    1. Converting to lowercase
    2. Removing special characters
    3. Removing stopwords
    4. Lemmatization
    5. Removing short words
    """
    
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    excluded_words = r'\b(manager)\b'
    text = re.sub(excluded_words, '', text)
    
    tokens = word_tokenize(text)
    
    tokens = [token for token in tokens if token not in stop_words]
    #word lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Remove short words (length < 3)
    tokens = [token for token in tokens if len(token) > 2]
    cleaned_text = ' '.join(tokens)
    return cleaned_text, tokens

In [7]:
df["cleaned_text"], df["tokens"] = zip(*df["Story"].apply(clean_text))

### Section 5: Keyword Extraction

#### TF-IDF

In [8]:
# Initialize TF-IDF
tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=(3,3),  # Consider both unigrams and bigrams
        max_features=1000
    )
tfidf_matrix = tfidf.fit_transform(df['cleaned_text'])
feature_names = tfidf.get_feature_names_out()

In [9]:
keywords_per_doc = []
for doc_index in range(tfidf_matrix.shape[0]):
    tfidf_scores = tfidf_matrix[doc_index].toarray()[0]
    # Get top keywords
    top_indices = np.argsort(tfidf_scores)[-5:][::-1]
    top_keywords = [feature_names[i] for i in top_indices]
    
    keywords_per_doc.append(top_keywords)

In [10]:
df["keywords_tf-idf"]=keywords_per_doc
df["keywords_tf-idf"].head(5)

0    [requirement pit hole, people expertise value,...
1    [working various manager, ability guidance cru...
2    [better direct reporting, working look like, b...
3    [time high standard, success showed important,...
4    [shoe level empathy, truly grasp perspective, ...
Name: keywords_tf-idf, dtype: object

#### RAKE : Rapid Automatic Keyword Extraction

In [11]:
def keyword_rake(rakee:Rake,text):
    rakee.extract_keywords_from_text(text)
    return rakee.get_ranked_phrases()

In [12]:
r = Rake()
df["keywords_rake"] = [keyword_rake(r,text) for text in df["Story"]]

In [13]:
df["keywords_rake"].head(5)

0    [plans deliverables reasonably great, recognis...
1    [providing valuable input, past seven years, o...
2    [hcl people transform people empowers, pings h...
3    [feeling really overwhelmed, team members succ...
4    [manager consistently demonstrates remarkable ...
Name: keywords_rake, dtype: object

#### TextRank

In [14]:
def keyword_textrank(nlp,text):
    doc = nlp(text)
    key_words = [i.text for i in doc._.phrases[:6]]
    return key_words

In [15]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")
df["keywords_text_rank"] = [keyword_textrank(nlp,text) for text in df["Story"]]

In [16]:
df["keywords_text_rank"].head(5)

0    [new things, pit holes, enough time, Recognise...
1    [valuable suggestions, valuable input, various...
2    [HCL people, people, ease, confidence, first, ...
3    [time, help, a supportive manager, ways, guida...
4    [fosters collaboration trust, remarkable compo...
Name: keywords_text_rank, dtype: object

#### BART - Keyword Generator 
Text-to-Text Gen model

In [17]:
def extract_keywords(text):
    model_output = pipe(text)
    output = []
    for i in model_output[0]["generated_text"].split(","):
        output.append(i.strip())
    return output
df["keywords_uncleaned"] = df['Story'].apply(extract_keywords)
df["keywords_cleaned_text"] = df['cleaned_text'].apply(extract_keywords)

In [18]:
def unique_keywords(key1:list,key2:list):
    combined = list(set(key1 + key2))
    cleaned = [keyword for keyword in combined if not re.search(r'\b(manager|HCL)\b', keyword, re.IGNORECASE)]    
    return cleaned
df["keywords"] = [unique_keywords(i,j) for i,j in zip(df["keywords_uncleaned"],df["keywords_cleaned_text"])]
    

In [19]:
print("Index 1")
print(df['Story'].iloc[1])
print('Keywords : ',df['keywords'].iloc[1])

Index 1
Over the past seven years I have had the privilege of working with various managers who have been instrumental in supporting my career growth and providing valuable input for my professional development. During challenging times they offered valuable suggestions and assistance enabling me to enhance both my technical skills and social abilities. Their guidance has been crucial to my overall growth as a professional.
Keywords :  ['technical skill', 'social ability guidance', 'career growth', 'professional development']


### Section 6: Elasticsearch Setup and Indexing
- Elasticsearch is a powerful distributed search engine designed to handle keyword search efficiently.
- Supports exact matches, wildcards, customizable relevance scoring, etc.
- Offers semantic searche in beta version

In [20]:
# test_connection.py
from elasticsearch import Elasticsearch

es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", "ExuU*TZhQEjtT_uFm9pY"),  
    verify_certs=False  
)

In [36]:
es.indices.delete(index="stories")

ObjectApiResponse({'acknowledged': True})

In [37]:
mapping = {
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "keywords": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,  # Dimension of the model's embeddings
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

In [38]:
elastic_ind = "stories"
es.indices.create(index=elastic_ind, body=mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'stories'})

In [39]:
for index, row in df.iterrows():
    
    # generating embeddings
    text_embeddings =  model.encode(row['Story'])
    
    doc = {
        "story_id": row['Story ID'],
        "text": row['Story'],
        "keywords":row["keywords"],
        "text_vector":text_embeddings.tolist()
    }
    es.index(index=elastic_ind, document=doc)

In [41]:
resp = es.count(index=elastic_ind)
print(f"Total documents indexed: {resp['count']}")

Total documents indexed: 20


#### Search Implementation

##### Simple Search Implementation

In [42]:
# Simple search : first 3
resp = es.search(index=elastic_ind, query={"match": {"text": "helped me personally"}},size=3)
for hits in resp.body['hits']['hits']:
    print(hits['_source']['text'])
    print(hits['_source']['keywords'])
    print('-'*10)

We here at BD Sparks Campus have an exceptional manager. Gary is always willing to help us guide us when needed and offer advice to help us better ourselves for both HCL and of course BD. This is a unique campus environment and Gary has helped us foster an exceptional team of Techs encouraged learning and has been a tremendous manager not only to us here at Sparks Campus but me personally. I am grateful for the leadership from Gary and Steve Matt. We are Blessed to have such good Men leading the way for HCL and our teams.
['mentorship', 'grateful leadership', 'exceptional team tech', 'campus environment', 'leadership', 'spark campus']
----------
Its been only few months working with her but looks like I had to spend more than 4 years to find a manager like that. She made me comfortable feel at ease from the very first interaction. She transformed me motivated me replied to all my pings howsoever annoying they may have been helped me regain confidence which was lost somewhere during a l

##### Search using both text and keywords

In [43]:
# Search in both text and keywords : first 3
query = {
        "multi_match": {
            "query": "helped me personally",
            "fields": ["text", "keywords"]
        }
}
resp = es.search(index=elastic_ind, query=query,size=3)
for hits in resp.body['hits']['hits']:
    print(hits['_source']['text'])
    print(hits['_source']['keywords'])
    print('-'*10)

We here at BD Sparks Campus have an exceptional manager. Gary is always willing to help us guide us when needed and offer advice to help us better ourselves for both HCL and of course BD. This is a unique campus environment and Gary has helped us foster an exceptional team of Techs encouraged learning and has been a tremendous manager not only to us here at Sparks Campus but me personally. I am grateful for the leadership from Gary and Steve Matt. We are Blessed to have such good Men leading the way for HCL and our teams.
['mentorship', 'grateful leadership', 'exceptional team tech', 'campus environment', 'leadership', 'spark campus']
----------
Its been only few months working with her but looks like I had to spend more than 4 years to find a manager like that. She made me comfortable feel at ease from the very first interaction. She transformed me motivated me replied to all my pings howsoever annoying they may have been helped me regain confidence which was lost somewhere during a l

##### KNN Search with SentenceTransformer vectors

In [44]:
# KNN search with embeddings/vectors
def knn_search(query_text, top_k=5):
    # Generate embedding for the query
    query_vector = model.encode(query_text)
    
    # Construct the query
    query = {
        "field": "text_vector",
        "query_vector": query_vector,
        "k": top_k
    }
    
    response = es.search(index=elastic_ind, knn=query)
    return response

In [45]:
resp = knn_search("helped me personally")
for hits in resp.body['hits']['hits']:
    print(hits['_source']['text'], hits['_source']['keywords'])
    print('-'*10)

I genuinely want to take a moment to thank you for all the support you have given to me also for the whole team. your willingness to help and understanding are truly amazing. you have been patient with all of us you have guided us never made us feel low always spoken gentle to us in any kind of circumstances. one of your Inspiring habits which I have started following now save water and never waste food. ['inspiration', 'genuine gratitude', 'patience', 'patient support', 'support', 'team willingness']
----------
My reporting manager is very knowledgeable person who can reached out for any support in work and who also supports their team members in every aspect. Supporting the statements above am new member in the team my manager guided me for the system access with daily follow up email with customer and wherein I gained the knowledge by which now am able to handle the work individually. Your hands on approach helped us the meet ahead of the deadline for the Incidents and maintain a po

### Section 7: GenAI
- Useful for applications once sensitive employee information, such as names, is redacted.
- PII with tools like Microsoft Presidio or NLP techniques.

In [46]:
groq = Groq(api_key=os.getenv('GROQ'))

In [31]:
prompt = f"""
Given an employee's story about a WOW moment with their manager, extract meaningful keywords/hashtags that capture:
1. The situation/context (e.g., project deadline, personal challenge)
2. Specific impactful actions by the manager
3. The positive outcome or transformation
4. Emotional impact or personal growth
5. Notable support aspects

The keywords should:
- Capture inspiring/transformative moments
- Be 1-3 words long
- Exclude generic terms like "manager" or "HCL"
- Focus on the unique aspects of the experience
- Be suitable for finding similar inspiring stories

Story: {df['Story'][0]}
"""

In [32]:
class KeyWords(BaseModel):
    keywords:List[str] = Field(...,description="Extract the keywords from the story")

In [33]:

chat_completion = groq.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a Keyword/Hastag generator\n"
            f" The JSON object must use the schema: {json.dumps(KeyWords.model_json_schema(), indent=2)}",
        },
        {
            "role": "user",
            "content": prompt,
        },
    ],
    model="llama3-8b-8192",
    temperature=0,
    # Streaming is not supported in JSON mode
    stream=False,
    # Enable JSON mode by setting the response format
    response_format={"type": "json_object"},
)
KeyWords.model_validate_json(chat_completion.choices[0].message.content)


KeyWords(keywords=['Visionary Leadership', 'Proactive Approach', 'Expertise Recognition', 'Solution-Focused', 'Reasonable Planning', 'Collaborative Culture', 'Continuous Learning', 'Innovative Momentum', 'Acknowledging Others'])

In [47]:
df

Unnamed: 0,Story ID,Story,cleaned_text,tokens,keywords_tf-idf,keywords_rake,keywords_text_rank,keywords_uncleaned,keywords_cleaned_text,keywords
0,597,He is talented and always ahead of the game He...,talented always ahead game envisions requireme...,"[talented, always, ahead, game, envisions, req...","[requirement pit hole, people expertise value,...","[plans deliverables reasonably great, recognis...","[new things, pit holes, enough time, Recognise...","[talented, ahead of the game, team collaboration]","[talented, requirement, solution]","[solution, requirement, talented, ahead of the..."
1,1254,Over the past seven years I have had the privi...,past seven year privilege working various mana...,"[past, seven, year, privilege, working, variou...","[working various manager, ability guidance cru...","[providing valuable input, past seven years, o...","[valuable suggestions, valuable input, various...","[manager support, career growth, professional ...","[professional development, technical skill, so...","[technical skill, social ability guidance, car..."
2,414,Its been only few months working with her but ...,month working look like spend year find like m...,"[month, working, look, like, spend, year, find...","[better direct reporting, working look like, b...","[hcl people transform people empowers, pings h...","[HCL people, people, ease, confidence, first, ...","[HCL, direct reporting manager, transformation]","[month working, comfort, motivation]","[motivation, month working, comfort, transform..."
3,2,My manager is a great leader. She is always su...,great leader always supportive encouraging alw...,"[great, leader, always, supportive, encouragin...","[time high standard, success showed important,...","[feeling really overwhelmed, team members succ...","[time, help, a supportive manager, ways, guida...","[supportive manager, team members, leadership ...","[supportive leadership, team member support, l...","[team member support, leadership skills, suppo..."
4,999,My manager consistently demonstrates remarkabl...,consistently demonstrates remarkable composure...,"[consistently, demonstrates, remarkable, compo...","[shoe level empathy, truly grasp perspective, ...",[manager consistently demonstrates remarkable ...,"[fosters collaboration trust, remarkable compo...","[manager, composure, collaboration trust]","[compassiveness, patience, empathy understanding]","[composure, compassiveness, patience, empathy ..."
5,1113,As a manager myself I find myself faltering at...,find faltering time patience staff provided co...,"[find, faltering, time, patience, staff, provi...","[unbiased fairness scenario, needed continuall...","[cool calm demeanor allows, provided consisten...","[consistent guidance, times, mentor, patience,...","[manager, patience, leadership]","[confidence, consistency, leadership mentorship]","[confidence, leadership, patience, consistency..."
6,1219,for the past three months had been very depres...,past three month depressed unable manage work ...,"[past, three, month, depressed, unable, manage...","[positive solution created, work working style...","[created positive approach towards work, manag...","[work life balance, positive approach, work, a...","[Depression, Work Life Balance, Positive Appro...","[Depression, Work Life Balance, Positive Appro...","[Work Life Balance, Depression, Positive Appro..."
7,51,One standout WOW moment I experienced was duri...,one standout wow moment experienced particular...,"[one, standout, wow, moment, experienced, part...","[mounting unsure meet, tight timeline facing, ...","[team working alongside us offering guidance, ...","[numerous obstacles, mounting unsure, tight ti...","[team dynamics, dedication, teamwork]","[team dynamic, deadline pressure, team support]","[dedication, team dynamic, team support, deadl..."
8,746,I am grateful for the invaluable support and g...,grateful invaluable support guidance provided ...,"[grateful, invaluable, support, guidance, prov...","[necessary step assisted, project positive mot...","[professional growth offering opportunities, m...","[opportunities, guidance, success, the deleted...","[gratitude, support, professional growth]","[grateful, support guidance, professional growth]","[professional growth, grateful, support guidan..."
9,367,We here at BD Sparks Campus have an exceptiona...,spark campus exceptional gary always willing h...,"[spark, campus, exceptional, gary, always, wil...","[personally grateful leadership, steve matt bl...","[help us guide us, help us better, techs encou...","[BD Sparks Campus, Gary, such good Men, Sparks...","[leadership, mentorship, campus environment]","[spark campus, exceptional team tech, grateful...","[mentorship, grateful leadership, exceptional ..."



### Inference  

 **Problem Statement Limitations**:  
  - Keywords are not pre-defined.  
  - The number of keywords per story varies depending on its size and content.  

**If Keywords are Pre-defined**:  
  - **Keyword Extraction**: Use of zero-shot classification models like [Bart-large](https://huggingface.co/facebook/bart-large-mnli) to classify up to 10 predefined classes for keyword assignment.  
  
  - **Search**: Exact keyword search becomes feasible.  

 **If Keywords are Not Defined**:  
  - **Keyword Extraction** from stories:  
    - TF-IDF  
    - RAKE  
    - TextRank  
    - BART for keyword generation (Text-to-Text).  
  - **Search**: Use similarity-based searching with respect to the context and extracted keywords.  

 **Why Elasticsearch Over RAG?**<br>
  *Considering the vast employee count of HCLTech (~218,000), Elasticsearch offers distributed scalability to handle extensive data, ensuring efficient search and actionable insights from unstructured employee stories.*
  - **Performance and Scalability**:
    - the document retrieval is more flexible compared to other vectorDB
    - the scalability is much better in ElasticSearch
  - **Flexibility**: It handles both structured (pre-defined keywords) and unstructured data (extracted keywords or embeddings) seamlessly.  
  - Simple search methods to KNN based search is offered

 **Use of GenAI:**<br>
  *Employee confidentiallity needs to be protected first - redacting the sensitive information from the stories.*
  - Open source models like **Llama3.3** etc can be used to extract keywords.
  - can be used to extract context-based Keywords even from large user-stories.
---
