In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.3.0-cp312-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.4.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.13.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting huggingface-hub>=0.15.1 (from sentence-transformers)
  Using cached huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Collecting filelock (from huggingface-hub>=0.15.1->sentence-transformer

In [2]:
import pandas as pd

pd.set_option("display.max_colwidth", 100)

In [4]:
df = pd.read_csv("sample_text.csv")
df.shape

(8, 2)

In [5]:
df

Unnamed: 0,text,category
0,Meditation and yoga can improve mental health,Health
1,"Fruits, whole grains and vegetables helps control blood pressure",Health
2,These are the latest fashion trends for this week,Fashion
3,Vibrant color jeans for male are becoming a trend,Fashion
4,The concert starts at 7 PM tonight,Event
5,Navaratri dandiya program at Expo center in Mumbai this october,Event
6,Exciting vacation destinations for your next trip,Travel
7,Maldives and Srilanka are gaining popularity in terms of low budget vacation places,Travel


In [6]:
df.text

0                                          Meditation and yoga can improve mental health
1                       Fruits, whole grains and vegetables helps control blood pressure
2                                      These are the latest fashion trends for this week
3                                      Vibrant color jeans for male are becoming a trend
4                                                     The concert starts at 7 PM tonight
5                        Navaratri dandiya program at Expo center in Mumbai this october
6                                      Exciting vacation destinations for your next trip
7    Maldives and Srilanka are gaining popularity in terms of low budget vacation places
Name: text, dtype: object

In [8]:
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-mpnet-base-v2")
vectors = encoder.encode(df.text)
vectors.shape

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(8, 768)

In [9]:
dim = vectors.shape[1]
dim

768

In [10]:
import faiss

index = faiss.IndexFlatL2(dim)
index

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x3a94871b0> >

In [11]:
index.add(vectors)

In [32]:
search_query = "Looking for a place to visit during christmas"
search_vector = encoder.encode(search_query)
search_vector.shape

(768,)

In [33]:
import numpy as np

search_vector = np.array(search_vector).reshape(1, -1)
search_vector.shape

(1, 768)

In [34]:
distances, indices = index.search(search_vector, k=2)

In [35]:
indices.tolist()[0]

[6, 7]

In [36]:
df.loc[indices.tolist()[0]]

Unnamed: 0,text,category
6,Exciting vacation destinations for your next trip,Travel
7,Maldives and Srilanka are gaining popularity in terms of low budget vacation places,Travel


In [37]:
import os
from transformers import pipeline
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())
pipe = pipeline("text-generation", model="meta-llama/Meta-Llama-Guard-2-8B",
                token=os.getenv("HUGGINGFACEHUB_API_TOKEN"))




Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:  53%|#####3    | 2.65G/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/126 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
pipe("Tell me a joke")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'Tell me a joke, please."\n"Okay. What\'s the difference between a hippopotamus and'}]