In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import torch
import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
df = pd.read_csv('../data/raw/idcloud-content.csv')

In [5]:
passage_embeddings = list(model.encode(df['content'].to_list(), show_progress_bar=True))


Batches: 100%|██████████| 22/22 [00:24<00:00,  1.13s/it]


In [14]:
# Define a function to find relevant news articles based on a given query
def find_relevant_info(query):
    # Encode the query using the sentence transformer model
    query_embedding = model.encode(query)
    # Print the shape of the query embedding
    query_embedding.shape

    # Calculate the cosine similarity between the query embedding and the passage embeddings
    similarities = util.cos_sim(query_embedding, passage_embeddings)

    # Find the indices of the top 3 most similar passages
    top_indicies = torch.topk(similarities.flatten(), 3).indices

    # Get the top 3 relevant passages by slicing the summaries at 200 characters and adding an ellipsis
    top_relevant_passages = [df.iloc[x.item()]['content'][:200] + "..." for x in top_indicies]

    # Return the top 3 relevant passages
    return top_relevant_passages

In [15]:
find_relevant_info("Journeys")

['Success node :: ForgeRock Identity Cloud Docs The Success node is a required element indicating the journey ended successfully.\nForgeRock Identity Cloud\nYes\nForgeRock Access Management (self-managed)\n...',
 'Authentication nodes and journeys :: ForgeRock Identity Cloud Docs Authentication journeys provide fine-grained authentication by allowing multiple paths and decision points throughout\nthe authenticat...',
 'Self-service :: ForgeRock Identity Cloud Docs While many self-service activities take place during authentication in journeys and\nalso relate to creating or updating identities,\nthe use cases in this ...']

In [27]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cristian.sanchezp\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cristian.sanchezp\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [34]:
from transformers import pipeline
from rake_nltk import Rake

# Initialize the text summarization pipeline
summarization_pipeline = pipeline("summarization")

# Sample input text
text = df['content'][0]


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [35]:
# Generate summary using the transformer-based model
summary = summarization_pipeline(text, max_length=100, min_length=30, do_sample=False)

In [36]:
summary

[{'summary_text': ' Identity Cloud docs home :: ForgeRock Identity Cloud Docs . Part 1: Integrating Azure with ForgeRock ID Cloud using SAML2 using ESVs . Part 2: Integating Azure with Identity Cloud using OAuth2 client config in OIDC claim script . Getting Started for Identity Cloud: Access Management .'}]

In [37]:
summary[0]["summary_text"]

' Identity Cloud docs home :: ForgeRock Identity Cloud Docs . Part 1: Integrating Azure with ForgeRock ID Cloud using SAML2 using ESVs . Part 2: Integating Azure with Identity Cloud using OAuth2 client config in OIDC claim script . Getting Started for Identity Cloud: Access Management .'

In [38]:
# Extract keywords from the summary using RAKE
r = Rake()
r.extract_keywords_from_text(summary[0]["summary_text"])
keywords = r.get_ranked_phrases()


In [39]:

# Print the extracted keywords
print("Keywords:", keywords)

Keywords: ['identity cloud docs home :: forgerock identity cloud docs', 'forgerock id cloud using saml2 using esvs', 'identity cloud using oauth2 client config', 'identity cloud', 'oidc claim script', 'part 2', 'part 1', 'integrating azure', 'integating azure', 'getting started', 'access management']
