In [1]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample sentences used to probe how the embedding model scores semantic closeness.
sentences = [
    "The dog is playing in the park",
    "A puppy is running outside",
    "The cat is sleeping on the couch",
    "Python is a programming language",
    "Machine learning models need data",
    "I love coding in Python",
]

# Load the SentenceTransformer model identified by "all-MiniLM-L6-v2".
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the whole list in one call; later cells index `embeddings`
# by sentence position (embeddings[0] is Sentence 1, and so on).
embeddings = model.encode(sentences)

In [3]:
import numpy as np
def semantic_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    The similarity is the dot product of the vectors divided by the product
    of their Euclidean (L2) norms.

    Args:
        vec1: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity as a scalar. If either vector has zero norm the
        cosine is undefined, so ``0.0`` is returned instead of dividing by zero.
    """
    dot_product = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against zero-length vectors: dividing by 0 would yield NaN/inf
    # together with a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

# Compare Sentence 1 (index 0) with every other sentence and print each score.
for i, embedding in enumerate(embeddings):
    if i == 0:
        continue  # skip comparing the query sentence with itself
    print(f"Similarity between Sentence 1 and Sentence {i + 1} is {semantic_similarity(embeddings[0], embedding)}")



Similarity between Sentence 1 and Sentence 2 is 0.3984106481075287
Similarity between Sentence 1 and Sentence 3 is 0.07135515660047531
Similarity between Sentence 1 and Sentence 4 is 0.0986606553196907
Similarity between Sentence 1 and Sentence 5 is -0.005180873442441225
Similarity between Sentence 1 and Sentence 6 is 0.09017310291528702


In [4]:
# Compare Sentence 4 (index 3) with every other sentence and print each score.
for i, embedding in enumerate(embeddings):
    if i == 3:
        continue  # skip comparing the query sentence with itself
    print(f"Similarity between Sentence 4 and Sentence {i + 1} is {semantic_similarity(embeddings[3], embedding)}")

Similarity between Sentence 4 and Sentence 1 is 0.0986606553196907
Similarity between Sentence 4 and Sentence 2 is 0.03952628746628761
Similarity between Sentence 4 and Sentence 3 is 0.019887205213308334
Similarity between Sentence 4 and Sentence 5 is 0.11328087002038956
Similarity between Sentence 4 and Sentence 6 is 0.7303749322891235


## Question 4


### Similarity Analysis

Query: "The dog is playing in the park" <br>
Most Similar: "A puppy is running outside" <br>
Least similar: "Machine learning models need data" <br>
Observations: With a cosine-similarity threshold of 0.1, only one sentence ("A puppy is running outside", 0.398) scores above it, so the ideal number of results to return for this query is 1.

### Similarity Analysis

Query: "Python is a programming language" <br>
Most Similar: "I love coding in Python" <br>
Least similar: "The cat is sleeping on the couch" <br>
Observations: With a cosine-similarity threshold of 0.1, two sentences score above it ("I love coding in Python", 0.730, and "Machine learning models need data", 0.113), so the ideal number of results to return for this query is 2.


## Exercise 2



In [30]:
import re
def chunking(text, chunk_size = 100):
    """Split text into consecutive fixed-size character chunks.

    Every run of whitespace (spaces, tabs, newlines) is first collapsed into a
    single space, then the normalized text is cut every ``chunk_size``
    characters. Cuts are purely positional, so a chunk may begin or end in the
    middle of a word, and the final chunk may be shorter than ``chunk_size``.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunk strings in document order (empty for empty text).

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    # A negative step would make range() yield nothing (silently returning no
    # chunks) and a zero step raises an opaque range() error; fail clearly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    text = re.sub(r"\s+", " ", text)
    return [text[i: i + chunk_size] for i in range(0, len(text), chunk_size)]


In [31]:
# Sample corpus for the chunking / similarity-search cells below: four short
# paragraphs covering artificial intelligence, machine learning, deep learning
# and natural language processing. The text starts and ends with a newline and
# is hard-wrapped, so it contains line breaks inside sentences.
document = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence that focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional neural networks have been applied
to fields including computer vision, speech recognition, natural language processing,
machine translation, and bioinformatics.

Natural language processing is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural
language data. Challenges in natural language processing frequently involve speech
recognition, natural language understanding, and natural language generation.
"""


In [32]:
import numpy as np
# NOTE(review): an identical helper is also defined earlier in this notebook;
# consider keeping a single definition so the two cannot drift apart.
def semantic_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    The similarity is the dot product of the vectors divided by the product
    of their Euclidean (L2) norms.

    Args:
        vec1: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity as a scalar. If either vector has zero norm the
        cosine is undefined, so ``0.0`` is returned instead of dividing by zero.
    """
    dot_product = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against zero-length vectors: dividing by 0 would yield NaN/inf
    # together with a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

def similarity_search(query, text, chunk_size, top_k):
    """Return the chunks of ``text`` most similar to ``query``.

    The text is split with ``chunking``; the chunks and the query are encoded
    with the module-level ``model``; each chunk is scored against the query
    with ``semantic_similarity``.

    Args:
        query: The query to search for.
        text: The document to split and search.
        chunk_size: Chunk length in characters, forwarded to ``chunking``.
        top_k: Upper bound on the number of results (applied as a list slice).

    Returns:
        A list of ``(chunk, score)`` tuples ordered from highest to lowest
        score, truncated to the first ``top_k`` entries.
    """
    pieces = chunking(text, chunk_size)
    piece_vectors = model.encode(pieces)
    query_vector = model.encode(query)
    # Pair every chunk with its score, then rank best-first.
    ranked = sorted(
        (
            (piece, semantic_similarity(vector, query_vector))
            for piece, vector in zip(pieces, piece_vectors)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:top_k]


### Small Chunks

In [36]:
# Query reused by the medium- and large-chunk searches as well.
query = "What is machine learning?"
# Small chunks: 100 characters each, keep the 3 best matches.
similarity_search(query, document, chunk_size=100, top_k=3)

[('g its accuracy. Machine learning is an important component of the growing field of data science. Dee',
  np.float32(0.6762383)),
 ('nce of successfully achieving its goals. Machine learning is a subset of artificial intelligence tha',
  np.float32(0.66835594)),
 ('p learning is part of a broader family of machine learning methods based on artificial neural networ',
  np.float32(0.54791474))]

### Medium Chunks

In [34]:
# Medium chunks: 200 characters each, keep the 3 best matches.
similarity_search(query, document, chunk_size=200, top_k=3)

[('t focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy. Machine learning is an important component of the growing field of data science. Dee',
  np.float32(0.7019916)),
 ('ntelligent agents: any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Machine learning is a subset of artificial intelligence tha',
  np.float32(0.6086768)),
 ('p learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. Deep l',
  np.float32(0.5849977))]

### Large Chunks

In [35]:
# Large chunks: 400 characters each, keep the 3 best matches.
similarity_search(query, document, chunk_size=400, top_k=3)

[(' Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of intelligent agents: any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Machine learning is a subset of artificial intelligence tha',
  np.float32(0.65157336)),
 ('t focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy. Machine learning is an important component of the growing field of data science. Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. Deep l',
  np.float32(0.64503664)),
 ('earning architectures such as deep neural networks, deep belief networks, recurrent neural networks and convolutional neural networ

Small Chunks (100 chars):
- Number of chunks returned (top_k): 3
- Top result: 'g its accuracy. Machine learning is an important component of the growing field of data science. Dee'
- Score: 0.676
- Analysis: Performs fairly well but can still be improved

Medium Chunks (200 chars):
- Number of chunks returned (top_k): 3
- Top result: "t focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy. Machine learning is an important component of the growing field of data science. Dee"
- Score: 0.702
- Analysis: Gave a higher score than small chunks and as can be seen the chunk with the higher cosine similarity is a definition of machine learning.

Large Chunks (400 chars):
- Number of chunks returned (top_k): 3
- Top result: " Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of intelligent agents: any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Machine learning is a subset of artificial intelligence tha"
- Score: 0.651
- Analysis: Gave the lowest similarity score but the top 3 chunks were very close in semantic meaning to the query

Best chunk size for this use case: Large, because although its top score was the lowest of the three, all of its top 3 chunks were very close in semantic meaning to the given query.
```