In [74]:

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into a single list of FAQ records, tagging
# each record with the course it belongs to. A new dict is built per record so
# the parsed payload in `documents_raw` is left unmodified.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [3]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head(10)

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
6,data-engineering-zoomcamp,General course-related questions,Course - Is the current cohort going to be dif...,Yes. For the 2024 edition we are using Mage AI...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
8,data-engineering-zoomcamp,General course-related questions,Course - Can I get support if I take the cours...,"Yes, the slack channel remains open and you ca..."
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   course    948 non-null    object
 1   section   948 non-null    object
 2   question  948 non-null    object
 3   text      948 non-null    object
dtypes: object(4)
memory usage: 29.8+ KB


In [6]:
# List the distinct values in the course column. The output shows three courses,
# not only "data-engineering-zoomcamp", so the frame is filtered in the next cell.
df["course"].unique()

array(['data-engineering-zoomcamp', 'machine-learning-zoomcamp',
       'mlops-zoomcamp'], dtype=object)

In [7]:
df_data_engr = df[df["course"] == 'data-engineering-zoomcamp']
df_data_engr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 435 entries, 0 to 434
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   course    435 non-null    object
 1   section   435 non-null    object
 2   question  435 non-null    object
 3   text      435 non-null    object
dtypes: object(4)
memory usage: 17.0+ KB


In [8]:
df_data_engr["course"].unique()

array(['data-engineering-zoomcamp'], dtype=object)

## Basics of Text Search
### Information Retrieval 
The process of obtaining relevant information from large datasets based on user queries.
- **Inverted Index:** Understand how to build and query an index that maps each term to the list of documents (posting lists) in which it appears.
- **Set‐theoretic & Boolean Retrieval:** Basic boolean operators (AND/OR/NOT) over posting lists.
  
### Text Pre‐processing & NLP
- **Tokenization:** Splitting raw text into meaningful “tokens” (words, n-grams, etc.).
- **Normalization:** Case‐folding, punctuation stripping, Unicode normalization.
- **Stop-word Removal:** Filtering out extremely common words (“the”, “and”) that carry little semantic weight.
- **Stemming/Lemmatization:** Reducing tokens to their root form (e.g., “running” → “run”) to improve recall.

### Scoring & Ranking Models
- **Vector Spaces -** A mathematical representation where text is converted into vectors (points in space), allowing for quantitative comparison.
- **Bag of Words -** A simple text representation model treating each document as a collection of words, disregarding grammar and word order but keeping multiplicity.
- **TF-IDF (Term Frequency-Inverse Document Frequency) -** A statistical measure used to evaluate how important a word is to a document in a collection or corpus. It increases with the number of times a word appears in the document, but is offset by the frequency of the word in the corpus. In summary, it weights terms so that rare but important ones stand out.
- **BM25:** A probabilistic retrieval model that often outperforms vanilla TF–IDF in ranking relevance.
- **Learning-to-Rank (basic):** Supervised ranking methods; worth knowing if you extend beyond simple scoring formulas.



## Sklearn CountVectorizer (Bag of Words)
Transforms text into a sparse matrix of n-gram counts.<br>
<a href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html" target="_blank">Sklearn CountVectorizer Website</a>


In [9]:
# Converts the text into vectors
from sklearn.feature_extraction.text import CountVectorizer


## Example
Fit the count vectorizer on the documents below and use **get_feature_names_out()** to see the vocabulary (feature names) it learned.

docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [10]:
docs_example = [ 
    "Course starts on 15th Jan 2024", 
    "Prerequisites listed on GitHub", 
    "Submit homeworks after start date", 
    "Registration not required for participation", 
    "Setup Google Cloud and Python before course",
]
cv_example = CountVectorizer(stop_words='english') # stop_words='english' excludes common English words (e.g. "on", "and", "for") from the vocabulary
# A count vector ignores word order, so each document is represented as a "bag of words";
# removing stop words only drops the very common words from that bag.
cv_example.fit(docs_example)

In [11]:
cv_example.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [12]:
cv_example.get_feature_names_out().shape

(19,)

## Transform a raw document
We now use the fitted vectorizer's `transform` method to convert raw documents into count vectors (here we simply reuse the example documents).<br>
`transform(raw_documents)`: Transform documents into a document-term matrix.

Extract token counts out of raw text documents using the vocabulary fitted with fit or the one provided to the constructor.

In [13]:
X = cv_example.transform(docs_example)

In [14]:
X.todense()

matrix([[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]])

In [15]:
names = cv_example.get_feature_names_out() # Vocabulary tokens, one per column of X
# After the transpose below, each row is a token and each column is a document:
# a 0 means the token does not occur in that document, a positive value is its count.

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


**max_df:** float in range [0.0, 1.0] or int, default=1.0<br>
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.<br>

**min_df:** float in range [0.0, 1.0] or int, default=1<br>
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

In [20]:
cv = CountVectorizer(stop_words='english', min_df=5)
# min_df=5 keeps only terms that appear in at least 5 documents
# (a document-frequency threshold, not a count of occurrences inside one document)
X_doc = cv.fit_transform(df_data_engr['text'])

In [23]:
names_doc = cv.get_feature_names_out()
df_docs = pd.DataFrame(X_doc.toarray(), columns=names_doc)
df_docs

Unnamed: 0,01,04,05,10,100,11,12,13,16,17,...,www,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
431,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
432,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
433,0,0,0,2,0,0,1,0,0,0,...,0,1,1,0,0,0,3,0,0,1


## Sklearn TfidfVectorizer: TF-IDF (Term Frequency-Inverse Document Frequency)
Convert a collection of raw documents to a matrix of TF-IDF features.<br>
<a href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html" target="_blank">Sklearn TfidfVectorizer </a>

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(stop_words='english', min_df=5) # TF-IDF vectorizer for the FAQ answer text
# min_df=5 keeps only terms that appear in at least 5 documents
# (a document-frequency threshold, not a count of occurrences inside one document)
X_doc_tv = tv.fit_transform(df_data_engr['text'])  # TF-IDF document-term matrix, one row per document

In [46]:
names_doc_tv = tv.get_feature_names_out()
df_docs_tv = pd.DataFrame(X_doc_tv.toarray(), columns=names_doc_tv)
df_docs_tv

Unnamed: 0,01,04,05,10,100,11,12,13,16,17,...,www,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.460365
2,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.344378,0.000000,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
431,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
432,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
433,0.0,0.0,0.0,0.088511,0.0,0.00000,0.050737,0.0,0.0,0.0,...,0.000000,0.048926,0.050737,0.0,0.0,0.000000,0.132766,0.0,0.0,0.038826


## Query-Document Similarity
## Query Handling
**Goal:** Process and vectorize search query.<br>
We represent the query in the same vector space - i.e., using the same vectorizer:

In [50]:
query = "Do I need to know python to sign up for the January course?"

q = tv.transform([query])
##q.toarray()

We can see the words of the query and the words of some document:

In [32]:
def vectorize_query(query, vectorizer):
    """Project a single query string into the vector space of a vectorizer.

    Args:
        query: The raw query text.
        vectorizer: An object exposing ``transform``, e.g. a fitted
            ``TfidfVectorizer``.

    Returns:
        Whatever ``vectorizer.transform`` returns for a one-element list
        holding ``query``.
    """
    single_query_batch = [query]
    query_vector = vectorizer.transform(single_query_batch)
    return query_vector


In [31]:
query_dict = dict(zip(names_doc_tv, q.toarray()[0]))
#query_dict

## Scoring & Ranking
**Goal:** Find documents most similar to the query.

Use cosine similarity between the query vector and document matrix.

In [48]:
def search(vectorizer, tfidf_matrix, query, top_k=5, documents=None):
    """Rank documents by cosine similarity to a query.

    Args:
        vectorizer: Vectorizer used to build ``tfidf_matrix``; the query is
            transformed with it so both live in the same vector space.
        tfidf_matrix: Document-term matrix, one row per document.
        query: Raw query text.
        top_k: Maximum number of results to return.
        documents: Optional sequence of texts aligned row-for-row with
            ``tfidf_matrix``. When omitted, the notebook-level
            ``df_data_engr['text']`` column is used.

    Returns:
        A list of ``(text, score)`` tuples ordered from highest to lowest
        similarity.
    """
    if documents is None:
        documents = df_data_engr['text']
    # Materialize once so hits are looked up by position (row i of the matrix
    # is the i-th text) without building a full row object per result.
    texts = list(documents)

    query_vec = vectorize_query(query, vectorizer)
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # argsort is ascending; reverse it so the best matches come first.
    top_indices = similarity_scores.argsort()[::-1][:top_k]

    return [(texts[i], similarity_scores[i]) for i in top_indices]

**RECALL**<br>
We only created the vectorizer(tv) and the Matrix(X_doc_tv) for the text column in the df_data_engr DataFrame

In [49]:
search(vectorizer=tv, tfidf_matrix=X_doc_tv, query=query, top_k=5)

[("The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  np.float64(0.43891950393654094)),
 ('You can do most of the course without a cloud. Almost everything we use (excluding BigQuery) can be run locally. We won’t be able to provide guidelines for some things, but most of the materials are runnable without GCP.\nFor everything in the course, there’s a local alternative. You could even do the whole course locally.',
  np.float64(0.3940173136975931)),
 ('Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes

## Creating Vectorizers and Matrices for all the columns in the DataFrame


In [64]:
columns = ['section', 'question', 'text']

In [106]:
# One TF-IDF vectorizer and one document-term matrix per text column,
# both keyed by the column name.
vectorizers ={}
matrices = {}
# Alternative kept for reference: index every column of the frame (this would also include 'course').
#columns = list(df_data_engr.columns)

for column in columns:
    # Alternative kept for reference: the stop-word / min_df settings used for `tv` earlier.
    #vectorizer = TfidfVectorizer(stop_words='english', min_df=5)
    # Here the vectorizer is created without any arguments.
    vectorizer = TfidfVectorizer()
    # Each column is fitted separately, so each gets its own vocabulary.
    matrix_X = vectorizer.fit_transform(df_data_engr[column])
    vectorizers[column] = vectorizer
    matrices[column] = matrix_X

# Display the matrices to inspect their shapes.
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 1893 stored elements and shape (435, 53)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 4634 stored elements and shape (435, 1324)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 22733 stored elements and shape (435, 4074)>}

In [107]:
query = "I just discovered the course, is it too late to join?"
# Running total of the boosted similarity, one entry per row of df_data_engr.
score = np.zeros(len(df_data_engr))

# Per-column weights; columns not listed here (e.g. 'section') fall back to 1.0 below.
boosts = {
    'question':3,
    'text': 0.5
}
for column in columns:
    # Vectorize the query with the vectorizer fitted on this column so that
    # the query and the column's documents share the same vector space.
    q = vectorizers[column].transform([query])
    matrix = matrices[column]

    # Similarity of the query against every document for this column.
    f_score = cosine_similarity(q, matrix).flatten()

    boost = boosts.get(column, 1.0)

    # Accumulate the weighted per-column similarity into the overall score.
    score += f_score*boost

In [108]:
score

array([1.21852198, 0.98006473, 1.57062893, 0.77705967, 0.96525669,
       0.63682226, 0.83131938, 1.28883379, 0.90602263, 0.67833601,
       0.80365766, 0.65211561, 0.68114959, 0.57085675, 0.32472337,
       1.03780419, 0.63258002, 0.83968796, 0.45735368, 0.64750598,
       0.32569185, 0.62325239, 0.48350807, 0.3532358 , 0.43206945,
       0.31996684, 0.31208582, 0.81126643, 0.45448124, 0.49177486,
       0.4575335 , 0.64880481, 0.31208582, 1.01081711, 0.92546355,
       0.91741474, 0.32099204, 0.31610267, 0.51184596, 0.48639232,
       0.46755121, 0.68345433, 0.31579076, 0.33144927, 0.01577147,
       0.14774236, 0.        , 0.02409374, 0.17996841, 0.00902849,
       0.01107965, 0.10114581, 0.24775452, 0.2040217 , 0.02701125,
       0.09562065, 0.16207435, 0.09702075, 0.11957885, 0.26103883,
       0.00559325, 0.27496247, 0.03548984, 0.01961935, 0.01590088,
       0.00926431, 0.00959311, 0.0280996 , 0.00849314, 0.03528273,
       0.12505165, 0.00522055, 0.24981773, 0.01111349, 0.05663

In [109]:
idx_of_top_5_score = np.argsort(score)[::-1][:5]
idx_of_top_5_score

array([ 2,  7,  0, 15, 33])

In [110]:
df_data_engr.iloc[idx_of_top_5_score]

Unnamed: 0,course,section,question,text
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."
33,data-engineering-zoomcamp,General course-related questions,Is it possible to use tool “X” instead of the ...,"Yes, this applies if you want to use Airflow o..."


In [83]:
df_data_engr.iloc[idx_of_top_5_score]['question']

7     Course - Can I follow the course after it fini...
0                  Course - When will the course start?
3     Course - I have registered for the Data Engine...
34                 How can we contribute to the course?
1     Course - What are the prerequisites for this c...
Name: question, dtype: object

### Putting it all together
Let's create a class for us to use:

In [97]:
class TextSearch:
    """TF-IDF search index over several text fields of a record collection.

    Every field named in ``text_fields`` gets its own ``TfidfVectorizer`` and
    document-term matrix. A query is scored against each field with cosine
    similarity, and the per-field scores are combined as a weighted sum.
    """

    def __init__(self, text_fields):
        """Create an empty index.

        Args:
            text_fields: Names of the record fields to index and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Build the per-field vectorizers and matrices from ``records``.

        Args:
            records: Input accepted by ``pd.DataFrame`` (e.g. a list of
                dicts); it must provide every field in ``text_fields``.
            vectorizer_params: Optional keyword arguments forwarded to every
                ``TfidfVectorizer``. ``None`` means no extra arguments.
        """
        # None instead of a mutable ``{}`` default, which would be shared
        # between calls.
        params = vectorizer_params or {}
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the best-matching records for ``query``.

        Args:
            query: Raw query text.
            n_results: Maximum number of records to return.
            boost: Optional mapping ``field -> weight`` applied to that
                field's similarity; fields not listed use ``1.0``.
            filters: Optional mapping ``field -> value``; only records whose
                ``field`` equals ``value`` for every entry are eligible.

        Returns:
            A list of record dicts ordered from highest to lowest score.
            Records rejected by ``filters`` are never returned, so the list
            may hold fewer than ``n_results`` entries.
        """
        boost = boost or {}
        filters = filters or {}

        score = np.zeros(len(self.df))
        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Track eligibility separately from the score. Zeroing the score of
        # rejected records would make them tie with legitimate zero-score
        # matches (and outrank negative ones), letting them leak into results.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= np.asarray(self.df[field] == value, dtype=bool)

        candidates = np.flatnonzero(keep)
        # Stable sort keeps the original record order among equal scores.
        order = np.argsort(-score[candidates], kind='stable')[:n_results]
        results = self.df.iloc[candidates[order]]
        return results.to_dict(orient='records')

In [98]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

## Transformers

In [115]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Downloading hf_xet-1.1.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [116]:
import torch
from transformers import BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [117]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

### We need:
- **tokenizer** - for turning text into token IDs (plus an attention mask) that the model can consume<br>
- **model** - for turning those token IDs into embeddings<br>
First, we tokenize the text

In [118]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

In [119]:
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

### compute the embeddings:

In [120]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

## Now we need to compress the embeddings: