In [1]:
import os

- **`Ingestion`**
- **`Retrieval`**
- **`Generation`**

In [2]:
corpus_of_documents = ["The sun sets in the west, casting a warm glow over the horizon.",
  "Birds chirp melodiously at dawn, welcoming the new day.",
  "A gentle breeze rustles the leaves, creating a soothing sound.",
  "The aroma of freshly brewed coffee fills the kitchen every morning.",
  "Raindrops patter softly against the window, a comforting lullaby.",
  "Children laugh and play in the park, their joy infectious.",
  "The distant hum of traffic blends into the background noise of the city.",
  "A cat purrs contentedly while curled up on the couch.",
  "The smell of rain on dry earth is refreshing and nostalgic.",
  "A colorful butterfly flutters gracefully from flower to flower."]
corpus_of_documents

['The sun sets in the west, casting a warm glow over the horizon.',
 'Birds chirp melodiously at dawn, welcoming the new day.',
 'A gentle breeze rustles the leaves, creating a soothing sound.',
 'The aroma of freshly brewed coffee fills the kitchen every morning.',
 'Raindrops patter softly against the window, a comforting lullaby.',
 'Children laugh and play in the park, their joy infectious.',
 'The distant hum of traffic blends into the background noise of the city.',
 'A cat purrs contentedly while curled up on the couch.',
 'The smell of rain on dry earth is refreshing and nostalgic.',
 'A colorful butterfly flutters gracefully from flower to flower.']

### `Embeddings`

In [3]:
# Using simple frequency based method.
user_query = "I am an Indian and I live in India"
document = "India is a country for Indians and for everyone"

In [4]:
from collections import Counter
import math

In [5]:
# Query Tokens.
query_tokens = user_query.lower().split(" ")
query_tokens

['i', 'am', 'an', 'indian', 'and', 'i', 'live', 'in', 'india']

In [6]:
# Document Tokens.
document_tokens = document.lower().split(" ")
document_tokens


['india', 'is', 'a', 'country', 'for', 'indians', 'and', 'for', 'everyone']

In [7]:
# Creating Simple Embeddings.
query_counter = Counter(query_tokens) # Frequency of Tokens.
query_counter

Counter({'i': 2,
         'am': 1,
         'an': 1,
         'indian': 1,
         'and': 1,
         'live': 1,
         'in': 1,
         'india': 1})

In [8]:
document_counter = Counter(document_tokens) # Frequency of Tokens.
document_counter

Counter({'india': 1,
         'is': 1,
         'a': 1,
         'country': 1,
         'for': 2,
         'indians': 1,
         'and': 1,
         'everyone': 1})

In [9]:
document_counter.keys()

dict_keys(['india', 'is', 'a', 'country', 'for', 'indians', 'and', 'everyone'])

In [10]:
embed = [] # Creating an empty list and storing in it. This will be numerical representation of the query.
for i in query_counter.keys():
  embed.append(query_counter[i])
 # print(query_counter[i])
embed

[2, 1, 1, 1, 1, 1, 1, 1]

In [11]:
embed2 = [] # Creating an empty list and storing in it. This will be numerical representation of the document.
for i in document_counter.keys():
  embed2.append(document_counter[i])
embed2

[1, 1, 1, 1, 2, 1, 1, 1]

### `Similarity Scores Using Cosine Similarity`
- Similarity Score will be calculated in between `User Query` and the available `Data`.

In [12]:
print(user_query)
print(document)

I am an Indian and I live in India
India is a country for Indians and for everyone


In [13]:
# Similar words between user_query and the document.
for tokens in query_counter.keys() & document_counter.keys():
  print(tokens)

and
india


In [14]:
user_query2 = 'I am an Indian and I live in India and I love Indian Food'
document2 = "India is a country for Indians and for everyone and for those who loves Indian Food"

In [15]:
query_tokens2 = user_query2.lower().split(" ")
query_tokens2 = Counter(query_tokens2)
query_tokens2

Counter({'i': 3,
         'am': 1,
         'an': 1,
         'indian': 2,
         'and': 2,
         'live': 1,
         'in': 1,
         'india': 1,
         'love': 1,
         'food': 1})

In [16]:
document_tokens2 = document2.lower().split(" ")
document_tokens2 = Counter(document_tokens2)
document_tokens2

Counter({'india': 1,
         'is': 1,
         'a': 1,
         'country': 1,
         'for': 3,
         'indians': 1,
         'and': 2,
         'everyone': 1,
         'those': 1,
         'who': 1,
         'loves': 1,
         'indian': 1,
         'food': 1})

In [17]:
# Similar words between user_query and the document.
for tokens in query_tokens2.keys() & document_tokens2.keys():
  print(tokens)

india
and
food
indian


In [18]:
print(query_counter)
print('*'*100)
print(document_counter)

Counter({'i': 2, 'am': 1, 'an': 1, 'indian': 1, 'and': 1, 'live': 1, 'in': 1, 'india': 1})
****************************************************************************************************
Counter({'for': 2, 'india': 1, 'is': 1, 'a': 1, 'country': 1, 'indians': 1, 'and': 1, 'everyone': 1})


In [19]:
mylist = []
for tokens in query_counter.keys() & document_counter.keys():
  mylist.append(query_counter[tokens]*document_counter[tokens]) # Provides product of 2 common words i.e. 'and' & 'india' from two sentences.
mylist # [1,1] means product of 2 common words.
# {and:1, india:1 = [1,1]}

[1, 1]

In [20]:
dot_product = sum(mylist)
dot_product

2

In [21]:
query_magnitude = math.sqrt(sum(query_counter[token] ** 2 for token in query_counter))
query_magnitude

3.3166247903554

In [22]:
document_magnitude = math.sqrt(sum(document_counter[token] ** 2 for token in document_counter))
document_magnitude

3.3166247903554

In [23]:
# Similarity Score between user_query and the document sentence.
similarity = (dot_product)/(query_magnitude*document_magnitude)
similarity


0.18181818181818182

In [24]:
# Similarity Score among the common words between user_query and the document sentences using Function.
def cosine_similarity(query, document):
  query_tokens = query.lower().split(" ")
  document_tokens = document.lower().split(" ")

  query_counter = Counter(query_tokens)
  document_counter = Counter(document_tokens)

  dot_product = sum(query_counter[token] * document_counter[token] for token in query_counter.keys() & document_counter.keys())

  query_magnitude = math.sqrt(sum(query_counter[token] ** 2 for token in query_counter))
  document_magnitude = math.sqrt(sum(document_counter[token] ** 2 for token in document_counter))
  similarity = (dot_product)/(query_magnitude* document_magnitude) if query_magnitude * document_magnitude !=0 else 0
  return similarity

In [25]:
user_query = 'Is farooq good data scientist and genai engineer?'
document = 'Farooq is a genai engineer and he is very good in data science and machine learning'
cosine_similarity(user_query,document)

0.6324555320336759

In [26]:
user_query2 = 'Is farooq good with cooking skills?'
document2 = 'Farooq is a genai engineer and he is very good in data science and machine learning'
cosine_similarity(user_query2,document2)

0.3651483716701107

### `LLM`

In [35]:
#!pip install openai

In [38]:
from openai import OpenAI

In [39]:
client = OpenAI(api_key = OPENAI_API_KEY)
completion = client.chat.completions.create(
    model = 'gpt-3.5-turbo',
    messages = [
        {'role' : "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative approaches."},
        {'role': "system", "content": "Compose a poem that explains the concept of recursion in programing."}
    ]
)
print(completion.choices[0].message)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [27]:
# Fetching the Sentence which is most relevant to the query and the corpus based on Cosine Similarity.
# This is Part of 'RANKED RESULT'.
def return_response(query, corpus):
  similarities = []
  for doc in corpus:
    similarity = cosine_similarity(query, doc)
    similarities.append(similarity)
  return corpus_of_documents[similarities.index(max(similarities))] # Fetching the maximum similarity i.e. most relevant result

In [28]:
corpus_of_documents = ["The sun sets in the west, casting a warm glow over the horizon.",
  "Birds chirp melodiously at dawn, welcoming the new day.",
  "A gentle breeze rustles the leaves, creating a soothing sound.",
  "The aroma of freshly brewed coffee fills the kitchen every morning.",
  "Raindrops patter softly against the window, a comforting lullaby.",
  "Children laugh and play in the park, their joy infectious.",
  "The distant hum of traffic blends into the background noise of the city.",
  "A cat purrs contentedly while curled up on the couch.",
  "The smell of rain on dry earth is refreshing and nostalgic.",
  "A colorful butterfly flutters gracefully from flower to flower."]

In [29]:
# Ask query which has relevance to above corpus
user_query = 'Is smell of rain will be refreshing?'

In [30]:
return_response(user_query, corpus_of_documents)

'The smell of rain on dry earth is refreshing and nostalgic.'

In [31]:
user_query = 'what Children do in the park?'
return_response(user_query, corpus_of_documents)

'Children laugh and play in the park, their joy infectious.'

In [32]:
### 'GENERATION'
full_response = []

In [42]:
user_input = ""

In [41]:
relevant_document = return_response(user_query, corpus_of_documents)

In [45]:
prompt = f"""
This is the given information: {relevant_document}
The user input is: {user_input}
Compile the information to the user based on the given information and the user input.
"""
print(prompt)


This is the given information: Children laugh and play in the park, their joy infectious.
The user input is: 
Compile the information to the user based on the given information and the user input.



In [None]:
# Passing our Prompt to the OpenAI Model and generating the Output.
client = OpenAI(api_key = OPENAI_API_KEY)
completion = client.chat.completions.create(
    model = 'gpt-3.5-turbo',
    messages = [
        {'role' : "system", "content": "You are a friendly bot. You answer in very short sentences and do not include extra information."},
        {'role': "system", "content": prompt}
    ]
)
print(completion.choices[0].message)
print(completion.choices[0].message.content)