In [1]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
!pip install transformers
!pip install -U sentence-transformers
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 960 kB/s eta 0:00:01
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.8 MB/s eta 0:00:01
Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 3.0 MB/s eta 0:00:011
[?25hCollecting scipy
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 3.2 MB/s eta 0:00:01
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 2.4 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
 

In [5]:
#!wget https://learn.microsoft.com/pdf?url=https%3A%2F%2Flearn.microsoft.com%2Fen-us%2Fazure%2Fsynapse-analytics%2Ftoc.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--2023-03-08 20:04:30--  https://learn.microsoft.com/pdf?url=https%3A%2F%2Flearn.microsoft.com%2Fen-us%2Fazure%2Fsynapse-analytics%2Ftoc.json
Resolving learn.microsoft.com (learn.microsoft.com)... 95.100.65.213, 2a02:26f0:3400:18f::3544, 2a02:26f0:3400:182::3544
Connecting to learn.microsoft.com (learn.microsoft.com)|95.100.65.213|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 397216637 (379M) [application/pdf]
Saving to: ‘pdf?url=https:%2F%2Flearn.microsoft.com%2Fen-us%2Fazure%2Fsynapse-analytics%2Ftoc.json’


2023-03-08 20:05:11 (9.67 MB/s) - ‘pdf?url=https:%2F%2Flearn.microsoft.com%2Fen-us%2Fazure%2Fsynapse-analytics%2Ftoc.json’ saved [397216637/397216637]



In [3]:
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer, util
from itertools import islice
import torch
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class PdfObject:
    """Thin wrapper around a PyPDF2 ``PdfReader`` that extracts page text.

    Attributes:
        file_path: Path handed to ``PdfReader``.
        reader: The ``PdfReader`` opened on ``file_path``.
        pages: Number of pages reported by the reader.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        self.reader = PdfReader(self.file_path)
        self.pages = len(self.reader.pages)

    def extract_pages_data(self, limit=None, verbose=True):
        """Extract the text of the first ``limit`` pages.

        Args:
            limit: Maximum number of pages to read. ``None`` (the default)
                reads every page; ``0`` reads none.
            verbose: When true (the default) print one progress line per
                extracted page. Pass ``False`` to keep large documents from
                flooding the cell output.

        Returns:
            A list of ``{"page": <0-based index>, "text": <extracted text>}``
            dicts in page order.
        """
        # Only None means "no limit". A truthiness check would silently turn
        # an explicit limit of 0 into "all pages".
        if limit is None:
            limit = self.pages

        extracted_data = []
        for i, page in enumerate(islice(self.reader.pages, limit)):
            extracted_data.append({"page": i, "text": page.extract_text()})
            if verbose:
                print(f"{i}-th page extracted")
        return extracted_data


class Embeddings:
    """Builds, stores and reloads sentence embeddings for a corpus of pages.

    The corpus is a sequence of dicts that each carry a ``"text"`` entry.
    Embeddings are produced with ``model.encode`` and kept as one stacked
    tensor with one row per corpus entry.

    Attributes:
        model: Object exposing ``encode(text, convert_to_tensor=True)``.
        corpus: The page dicts, or ``None`` until supplied or loaded.
        embeddings: Stacked embedding tensor, or ``None`` until created or loaded.
    """

    def __init__(self, model, corpus=None, corpus_file_path=None, embeddings_file_path=None):
        """Store the model and optionally a corpus.

        Args:
            model: Encoder used by ``create``.
            corpus: In-memory corpus; takes precedence over the file paths.
            corpus_file_path: Pickle file holding a previously saved corpus.
            embeddings_file_path: File holding previously saved embeddings.
                Both paths must be given for anything to be loaded.
        """
        self.model = model
        # Define both attributes up front so a half-configured instance fails
        # with a clear message instead of an AttributeError later on.
        self.corpus = None
        self.embeddings = None

        if corpus:
            self.corpus = corpus
        elif corpus_file_path and embeddings_file_path:
            self.load_from_files(corpus_file_path, embeddings_file_path)

    def _stack(self):
        """Collapse the list of per-page tensors into a single stacked tensor."""
        self.embeddings = torch.stack(self.embeddings)

    def create(self):
        """Encode every corpus entry and store the stacked result.

        Prints a progress line each time a new multiple of 10% is reached.

        Returns:
            The stacked embeddings tensor, also stored on ``self.embeddings``.

        Raises:
            ValueError: If no corpus (or an empty one) is available.
        """
        if not self.corpus:
            raise ValueError("Embeddings.create() requires a non-empty corpus")

        corpus_size = len(self.corpus)
        reported = set()
        page_embeddings = []
        # Iterate the instance's own corpus, not a notebook-level global.
        for i, page in enumerate(self.corpus):
            page_embeddings.append(self.model.encode(page["text"], convert_to_tensor=True))

            percent_processed = int((i / corpus_size) * 100)
            if percent_processed and percent_processed % 10 == 0 and percent_processed not in reported:
                print(f"{percent_processed}% processed")
                reported.add(percent_processed)

        self.embeddings = page_embeddings
        self._stack()
        return self.embeddings

    def save_to_files(self, corpus_store_path, embeddings_store_path):
        """Persist the embeddings with ``torch.save`` and the corpus with pickle.

        Raises:
            ValueError: If there is no corpus or no embeddings to save.
        """
        if self.corpus is None or self.embeddings is None:
            raise ValueError("Nothing to save: corpus and embeddings must both be set")

        torch.save(self.embeddings, embeddings_store_path)

        with open(corpus_store_path, 'wb') as handle:
            pickle.dump(self.corpus, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load_from_files(self, corpus_store_path, embeddings_store_path):
        """Load a corpus and its embeddings written by ``save_to_files``.

        SECURITY: unpickling can execute arbitrary code. Only load files that
        were produced by a trusted run of this notebook.
        """
        with open(corpus_store_path, 'rb') as handle:
            self.corpus = pickle.load(handle)

        self.embeddings = torch.load(embeddings_store_path)

        
class Prediction:
    """Semantic search over a corpus using precomputed embeddings.

    Attributes:
        corpus: Sequence of corpus entries, indexed in step with ``embeddings``.
        model: Object exposing ``encode(query, convert_to_tensor=True)``.
        embeddings: One embedding per corpus entry.
    """

    def __init__(self, model, corpus, embeddings):
        self.corpus = corpus
        self.model = model
        self.embeddings = embeddings

    def predict(self, query, limit):
        """Print the corpus entries most similar to ``query``.

        Args:
            query: Text to search for.
            limit: Maximum number of results to print. Values larger than the
                number of corpus embeddings are capped.

        Returns:
            None. Results are written to stdout.
        """
        query_embedding = self.model.encode(query, convert_to_tensor=True)
        # Cosine similarity of the query against every corpus embedding.
        # There is a single query, so take row 0 of the similarity matrix.
        cos_scores = util.cos_sim(query_embedding, self.embeddings)[0]

        # torch.topk raises when k exceeds the number of scores, so cap it.
        top_k = min(limit, cos_scores.shape[0])
        top_results = torch.topk(cos_scores, k=top_k)

        print("\n\n======================\n\n")
        print("Query:", query)
        print(f"\nTop {top_k} most similar sentences in corpus:")

        for score, idx in zip(top_results[0], top_results[1]):
            # Convert the 0-d tensors to plain Python numbers for indexing and formatting.
            print(f"\n{self.corpus[int(idx)]}\n", "(Score: {:.4f})".format(float(score)))

In [82]:
# entrypoint
# Open the PDF and pull the text of every page (no limit is passed, so all pages are read).
synapse_book = PdfObject('azure_synapse_analytics.pdf')
# One {"page", "text"} dict per page; a progress line is printed for each page extracted.
corpus = synapse_book.extract_pages_data()

0-th page extracted
1-th page extracted
2-th page extracted
3-th page extracted
4-th page extracted
5-th page extracted
6-th page extracted
7-th page extracted
8-th page extracted
9-th page extracted
10-th page extracted
11-th page extracted
12-th page extracted
13-th page extracted
14-th page extracted
15-th page extracted
16-th page extracted
17-th page extracted
18-th page extracted
19-th page extracted
20-th page extracted
21-th page extracted
22-th page extracted
23-th page extracted
24-th page extracted
25-th page extracted
26-th page extracted
27-th page extracted
28-th page extracted
29-th page extracted
30-th page extracted
31-th page extracted
32-th page extracted
33-th page extracted
34-th page extracted
35-th page extracted
36-th page extracted
37-th page extracted
38-th page extracted
39-th page extracted
40-th page extracted
41-th page extracted
42-th page extracted
43-th page extracted
44-th page extracted
45-th page extracted
46-th page extracted
47-th page extracted
48

385-th page extracted
386-th page extracted
387-th page extracted
388-th page extracted
389-th page extracted
390-th page extracted
391-th page extracted
392-th page extracted
393-th page extracted
394-th page extracted
395-th page extracted
396-th page extracted
397-th page extracted
398-th page extracted
399-th page extracted
400-th page extracted
401-th page extracted
402-th page extracted
403-th page extracted
404-th page extracted
405-th page extracted
406-th page extracted
407-th page extracted
408-th page extracted
409-th page extracted
410-th page extracted
411-th page extracted
412-th page extracted
413-th page extracted
414-th page extracted
415-th page extracted
416-th page extracted
417-th page extracted
418-th page extracted
419-th page extracted
420-th page extracted
421-th page extracted
422-th page extracted
423-th page extracted
424-th page extracted
425-th page extracted
426-th page extracted
427-th page extracted
428-th page extracted
429-th page extracted
430-th pag

764-th page extracted
765-th page extracted
766-th page extracted
767-th page extracted
768-th page extracted
769-th page extracted
770-th page extracted
771-th page extracted
772-th page extracted
773-th page extracted
774-th page extracted
775-th page extracted
776-th page extracted
777-th page extracted
778-th page extracted
779-th page extracted
780-th page extracted
781-th page extracted
782-th page extracted
783-th page extracted
784-th page extracted
785-th page extracted
786-th page extracted
787-th page extracted
788-th page extracted
789-th page extracted
790-th page extracted
791-th page extracted
792-th page extracted
793-th page extracted
794-th page extracted
795-th page extracted
796-th page extracted
797-th page extracted
798-th page extracted
799-th page extracted
800-th page extracted
801-th page extracted
802-th page extracted
803-th page extracted
804-th page extracted
805-th page extracted
806-th page extracted
807-th page extracted
808-th page extracted
809-th pag

1154-th page extracted
1155-th page extracted
1156-th page extracted
1157-th page extracted
1158-th page extracted
1159-th page extracted
1160-th page extracted
1161-th page extracted
1162-th page extracted
1163-th page extracted
1164-th page extracted
1165-th page extracted
1166-th page extracted
1167-th page extracted
1168-th page extracted
1169-th page extracted
1170-th page extracted
1171-th page extracted
1172-th page extracted
1173-th page extracted
1174-th page extracted
1175-th page extracted
1176-th page extracted
1177-th page extracted
1178-th page extracted
1179-th page extracted
1180-th page extracted
1181-th page extracted
1182-th page extracted
1183-th page extracted
1184-th page extracted
1185-th page extracted
1186-th page extracted
1187-th page extracted
1188-th page extracted
1189-th page extracted
1190-th page extracted
1191-th page extracted
1192-th page extracted
1193-th page extracted
1194-th page extracted
1195-th page extracted
1196-th page extracted
1197-th pag

1525-th page extracted
1526-th page extracted
1527-th page extracted
1528-th page extracted
1529-th page extracted
1530-th page extracted
1531-th page extracted
1532-th page extracted
1533-th page extracted
1534-th page extracted
1535-th page extracted
1536-th page extracted
1537-th page extracted
1538-th page extracted
1539-th page extracted
1540-th page extracted
1541-th page extracted
1542-th page extracted
1543-th page extracted
1544-th page extracted
1545-th page extracted
1546-th page extracted
1547-th page extracted
1548-th page extracted
1549-th page extracted
1550-th page extracted
1551-th page extracted
1552-th page extracted
1553-th page extracted
1554-th page extracted
1555-th page extracted
1556-th page extracted
1557-th page extracted
1558-th page extracted
1559-th page extracted
1560-th page extracted
1561-th page extracted
1562-th page extracted
1563-th page extracted
1564-th page extracted
1565-th page extracted
1566-th page extracted
1567-th page extracted
1568-th pag

1884-th page extracted
1885-th page extracted
1886-th page extracted
1887-th page extracted
1888-th page extracted
1889-th page extracted
1890-th page extracted
1891-th page extracted
1892-th page extracted
1893-th page extracted
1894-th page extracted
1895-th page extracted
1896-th page extracted
1897-th page extracted
1898-th page extracted
1899-th page extracted
1900-th page extracted
1901-th page extracted
1902-th page extracted
1903-th page extracted
1904-th page extracted
1905-th page extracted
1906-th page extracted
1907-th page extracted
1908-th page extracted
1909-th page extracted
1910-th page extracted
1911-th page extracted
1912-th page extracted
1913-th page extracted
1914-th page extracted
1915-th page extracted
1916-th page extracted
1917-th page extracted
1918-th page extracted
1919-th page extracted
1920-th page extracted
1921-th page extracted
1922-th page extracted
1923-th page extracted
1924-th page extracted
1925-th page extracted
1926-th page extracted
1927-th pag

2249-th page extracted
2250-th page extracted
2251-th page extracted
2252-th page extracted
2253-th page extracted
2254-th page extracted
2255-th page extracted
2256-th page extracted
2257-th page extracted
2258-th page extracted
2259-th page extracted
2260-th page extracted
2261-th page extracted
2262-th page extracted
2263-th page extracted
2264-th page extracted
2265-th page extracted
2266-th page extracted
2267-th page extracted
2268-th page extracted
2269-th page extracted
2270-th page extracted
2271-th page extracted
2272-th page extracted
2273-th page extracted
2274-th page extracted
2275-th page extracted
2276-th page extracted
2277-th page extracted
2278-th page extracted
2279-th page extracted
2280-th page extracted
2281-th page extracted
2282-th page extracted
2283-th page extracted
2284-th page extracted
2285-th page extracted
2286-th page extracted
2287-th page extracted
2288-th page extracted
2289-th page extracted
2290-th page extracted
2291-th page extracted
2292-th pag

2611-th page extracted
2612-th page extracted
2613-th page extracted
2614-th page extracted
2615-th page extracted
2616-th page extracted
2617-th page extracted
2618-th page extracted
2619-th page extracted
2620-th page extracted
2621-th page extracted
2622-th page extracted
2623-th page extracted
2624-th page extracted
2625-th page extracted
2626-th page extracted
2627-th page extracted
2628-th page extracted
2629-th page extracted
2630-th page extracted
2631-th page extracted
2632-th page extracted
2633-th page extracted
2634-th page extracted
2635-th page extracted
2636-th page extracted
2637-th page extracted
2638-th page extracted
2639-th page extracted
2640-th page extracted
2641-th page extracted
2642-th page extracted
2643-th page extracted
2644-th page extracted
2645-th page extracted
2646-th page extracted
2647-th page extracted
2648-th page extracted
2649-th page extracted
2650-th page extracted
2651-th page extracted
2652-th page extracted
2653-th page extracted
2654-th pag

2976-th page extracted
2977-th page extracted
2978-th page extracted
2979-th page extracted
2980-th page extracted
2981-th page extracted
2982-th page extracted
2983-th page extracted
2984-th page extracted
2985-th page extracted
2986-th page extracted
2987-th page extracted
2988-th page extracted
2989-th page extracted
2990-th page extracted
2991-th page extracted
2992-th page extracted
2993-th page extracted
2994-th page extracted
2995-th page extracted
2996-th page extracted
2997-th page extracted
2998-th page extracted
2999-th page extracted
3000-th page extracted
3001-th page extracted
3002-th page extracted
3003-th page extracted
3004-th page extracted
3005-th page extracted
3006-th page extracted
3007-th page extracted
3008-th page extracted
3009-th page extracted
3010-th page extracted
3011-th page extracted
3012-th page extracted
3013-th page extracted
3014-th page extracted
3015-th page extracted
3016-th page extracted
3017-th page extracted
3018-th page extracted
3019-th pag

3343-th page extracted
3344-th page extracted
3345-th page extracted
3346-th page extracted
3347-th page extracted
3348-th page extracted
3349-th page extracted
3350-th page extracted
3351-th page extracted
3352-th page extracted
3353-th page extracted
3354-th page extracted
3355-th page extracted
3356-th page extracted
3357-th page extracted
3358-th page extracted
3359-th page extracted
3360-th page extracted
3361-th page extracted
3362-th page extracted
3363-th page extracted
3364-th page extracted
3365-th page extracted
3366-th page extracted
3367-th page extracted
3368-th page extracted
3369-th page extracted
3370-th page extracted
3371-th page extracted
3372-th page extracted
3373-th page extracted
3374-th page extracted
3375-th page extracted
3376-th page extracted
3377-th page extracted
3378-th page extracted
3379-th page extracted
3380-th page extracted
3381-th page extracted
3382-th page extracted
3383-th page extracted
3384-th page extracted
3385-th page extracted
3386-th pag

3705-th page extracted
3706-th page extracted
3707-th page extracted
3708-th page extracted
3709-th page extracted
3710-th page extracted
3711-th page extracted
3712-th page extracted
3713-th page extracted
3714-th page extracted
3715-th page extracted
3716-th page extracted
3717-th page extracted
3718-th page extracted
3719-th page extracted
3720-th page extracted
3721-th page extracted
3722-th page extracted
3723-th page extracted
3724-th page extracted
3725-th page extracted
3726-th page extracted
3727-th page extracted
3728-th page extracted
3729-th page extracted
3730-th page extracted
3731-th page extracted
3732-th page extracted
3733-th page extracted
3734-th page extracted
3735-th page extracted
3736-th page extracted
3737-th page extracted
3738-th page extracted
3739-th page extracted
3740-th page extracted
3741-th page extracted
3742-th page extracted
3743-th page extracted
3744-th page extracted
3745-th page extracted
3746-th page extracted
3747-th page extracted
3748-th pag

4062-th page extracted
4063-th page extracted
4064-th page extracted
4065-th page extracted
4066-th page extracted
4067-th page extracted
4068-th page extracted
4069-th page extracted
4070-th page extracted
4071-th page extracted
4072-th page extracted
4073-th page extracted
4074-th page extracted
4075-th page extracted
4076-th page extracted
4077-th page extracted
4078-th page extracted
4079-th page extracted
4080-th page extracted
4081-th page extracted
4082-th page extracted
4083-th page extracted
4084-th page extracted
4085-th page extracted
4086-th page extracted
4087-th page extracted
4088-th page extracted
4089-th page extracted
4090-th page extracted
4091-th page extracted
4092-th page extracted
4093-th page extracted
4094-th page extracted
4095-th page extracted
4096-th page extracted
4097-th page extracted
4098-th page extracted
4099-th page extracted
4100-th page extracted
4101-th page extracted
4102-th page extracted
4103-th page extracted
4104-th page extracted
4105-th pag

4427-th page extracted
4428-th page extracted
4429-th page extracted
4430-th page extracted
4431-th page extracted
4432-th page extracted
4433-th page extracted
4434-th page extracted
4435-th page extracted
4436-th page extracted
4437-th page extracted
4438-th page extracted
4439-th page extracted
4440-th page extracted
4441-th page extracted
4442-th page extracted
4443-th page extracted
4444-th page extracted
4445-th page extracted
4446-th page extracted
4447-th page extracted
4448-th page extracted
4449-th page extracted
4450-th page extracted
4451-th page extracted
4452-th page extracted
4453-th page extracted
4454-th page extracted
4455-th page extracted
4456-th page extracted
4457-th page extracted
4458-th page extracted
4459-th page extracted
4460-th page extracted
4461-th page extracted
4462-th page extracted
4463-th page extracted
4464-th page extracted
4465-th page extracted
4466-th page extracted
4467-th page extracted
4468-th page extracted
4469-th page extracted
4470-th pag

4787-th page extracted
4788-th page extracted
4789-th page extracted
4790-th page extracted
4791-th page extracted
4792-th page extracted
4793-th page extracted
4794-th page extracted
4795-th page extracted
4796-th page extracted
4797-th page extracted
4798-th page extracted
4799-th page extracted
4800-th page extracted
4801-th page extracted
4802-th page extracted
4803-th page extracted
4804-th page extracted
4805-th page extracted
4806-th page extracted
4807-th page extracted
4808-th page extracted
4809-th page extracted
4810-th page extracted
4811-th page extracted
4812-th page extracted
4813-th page extracted
4814-th page extracted
4815-th page extracted
4816-th page extracted
4817-th page extracted
4818-th page extracted
4819-th page extracted
4820-th page extracted
4821-th page extracted
4822-th page extracted
4823-th page extracted
4824-th page extracted
4825-th page extracted
4826-th page extracted
4827-th page extracted
4828-th page extracted
4829-th page extracted
4830-th pag

5151-th page extracted
5152-th page extracted
5153-th page extracted
5154-th page extracted
5155-th page extracted
5156-th page extracted
5157-th page extracted
5158-th page extracted
5159-th page extracted
5160-th page extracted
5161-th page extracted
5162-th page extracted
5163-th page extracted
5164-th page extracted
5165-th page extracted
5166-th page extracted
5167-th page extracted
5168-th page extracted
5169-th page extracted
5170-th page extracted
5171-th page extracted
5172-th page extracted
5173-th page extracted
5174-th page extracted
5175-th page extracted
5176-th page extracted
5177-th page extracted
5178-th page extracted
5179-th page extracted
5180-th page extracted
5181-th page extracted
5182-th page extracted
5183-th page extracted
5184-th page extracted
5185-th page extracted
5186-th page extracted
5187-th page extracted
5188-th page extracted
5189-th page extracted
5190-th page extracted
5191-th page extracted
5192-th page extracted
5193-th page extracted
5194-th pag

5509-th page extracted
5510-th page extracted
5511-th page extracted
5512-th page extracted
5513-th page extracted
5514-th page extracted
5515-th page extracted
5516-th page extracted
5517-th page extracted
5518-th page extracted
5519-th page extracted
5520-th page extracted
5521-th page extracted
5522-th page extracted
5523-th page extracted
5524-th page extracted
5525-th page extracted
5526-th page extracted
5527-th page extracted
5528-th page extracted
5529-th page extracted
5530-th page extracted
5531-th page extracted
5532-th page extracted
5533-th page extracted
5534-th page extracted
5535-th page extracted
5536-th page extracted
5537-th page extracted
5538-th page extracted
5539-th page extracted
5540-th page extracted
5541-th page extracted
5542-th page extracted
5543-th page extracted
5544-th page extracted
5545-th page extracted
5546-th page extracted
5547-th page extracted
5548-th page extracted
5549-th page extracted
5550-th page extracted
5551-th page extracted
5552-th pag

5874-th page extracted
5875-th page extracted
5876-th page extracted
5877-th page extracted
5878-th page extracted
5879-th page extracted
5880-th page extracted
5881-th page extracted
5882-th page extracted
5883-th page extracted
5884-th page extracted
5885-th page extracted
5886-th page extracted
5887-th page extracted
5888-th page extracted
5889-th page extracted
5890-th page extracted
5891-th page extracted
5892-th page extracted
5893-th page extracted
5894-th page extracted
5895-th page extracted
5896-th page extracted
5897-th page extracted
5898-th page extracted
5899-th page extracted
5900-th page extracted
5901-th page extracted
5902-th page extracted
5903-th page extracted
5904-th page extracted
5905-th page extracted
5906-th page extracted
5907-th page extracted
5908-th page extracted
5909-th page extracted
5910-th page extracted
5911-th page extracted
5912-th page extracted
5913-th page extracted
5914-th page extracted
5915-th page extracted
5916-th page extracted
5917-th pag

6239-th page extracted
6240-th page extracted
6241-th page extracted
6242-th page extracted
6243-th page extracted
6244-th page extracted
6245-th page extracted
6246-th page extracted
6247-th page extracted
6248-th page extracted
6249-th page extracted
6250-th page extracted
6251-th page extracted
6252-th page extracted
6253-th page extracted
6254-th page extracted
6255-th page extracted
6256-th page extracted
6257-th page extracted
6258-th page extracted
6259-th page extracted
6260-th page extracted
6261-th page extracted
6262-th page extracted
6263-th page extracted
6264-th page extracted
6265-th page extracted
6266-th page extracted
6267-th page extracted
6268-th page extracted
6269-th page extracted
6270-th page extracted
6271-th page extracted
6272-th page extracted
6273-th page extracted
6274-th page extracted
6275-th page extracted
6276-th page extracted
6277-th page extracted
6278-th page extracted
6279-th page extracted
6280-th page extracted
6281-th page extracted
6282-th pag

6603-th page extracted
6604-th page extracted
6605-th page extracted
6606-th page extracted
6607-th page extracted
6608-th page extracted
6609-th page extracted
6610-th page extracted
6611-th page extracted
6612-th page extracted
6613-th page extracted
6614-th page extracted
6615-th page extracted
6616-th page extracted
6617-th page extracted
6618-th page extracted
6619-th page extracted
6620-th page extracted
6621-th page extracted
6622-th page extracted
6623-th page extracted
6624-th page extracted
6625-th page extracted
6626-th page extracted
6627-th page extracted
6628-th page extracted
6629-th page extracted
6630-th page extracted
6631-th page extracted
6632-th page extracted
6633-th page extracted
6634-th page extracted
6635-th page extracted
6636-th page extracted
6637-th page extracted
6638-th page extracted
6639-th page extracted
6640-th page extracted
6641-th page extracted
6642-th page extracted
6643-th page extracted
6644-th page extracted
6645-th page extracted
6646-th pag

6966-th page extracted
6967-th page extracted
6968-th page extracted
6969-th page extracted
6970-th page extracted
6971-th page extracted
6972-th page extracted
6973-th page extracted
6974-th page extracted
6975-th page extracted
6976-th page extracted
6977-th page extracted
6978-th page extracted
6979-th page extracted
6980-th page extracted
6981-th page extracted
6982-th page extracted
6983-th page extracted
6984-th page extracted
6985-th page extracted
6986-th page extracted
6987-th page extracted
6988-th page extracted
6989-th page extracted
6990-th page extracted
6991-th page extracted
6992-th page extracted
6993-th page extracted
6994-th page extracted
6995-th page extracted
6996-th page extracted
6997-th page extracted
6998-th page extracted
6999-th page extracted
7000-th page extracted
7001-th page extracted
7002-th page extracted
7003-th page extracted
7004-th page extracted
7005-th page extracted
7006-th page extracted
7007-th page extracted
7008-th page extracted
7009-th pag

7325-th page extracted
7326-th page extracted
7327-th page extracted
7328-th page extracted
7329-th page extracted
7330-th page extracted
7331-th page extracted
7332-th page extracted
7333-th page extracted
7334-th page extracted
7335-th page extracted
7336-th page extracted
7337-th page extracted
7338-th page extracted
7339-th page extracted
7340-th page extracted
7341-th page extracted
7342-th page extracted
7343-th page extracted
7344-th page extracted
7345-th page extracted
7346-th page extracted
7347-th page extracted
7348-th page extracted
7349-th page extracted
7350-th page extracted
7351-th page extracted
7352-th page extracted
7353-th page extracted
7354-th page extracted
7355-th page extracted
7356-th page extracted
7357-th page extracted
7358-th page extracted
7359-th page extracted
7360-th page extracted
7361-th page extracted
7362-th page extracted
7363-th page extracted
7364-th page extracted
7365-th page extracted
7366-th page extracted
7367-th page extracted
7368-th pag

7692-th page extracted
7693-th page extracted
7694-th page extracted
7695-th page extracted
7696-th page extracted
7697-th page extracted
7698-th page extracted
7699-th page extracted
7700-th page extracted
7701-th page extracted
7702-th page extracted
7703-th page extracted
7704-th page extracted
7705-th page extracted
7706-th page extracted
7707-th page extracted
7708-th page extracted
7709-th page extracted
7710-th page extracted
7711-th page extracted
7712-th page extracted
7713-th page extracted
7714-th page extracted
7715-th page extracted
7716-th page extracted
7717-th page extracted
7718-th page extracted
7719-th page extracted
7720-th page extracted
7721-th page extracted
7722-th page extracted
7723-th page extracted
7724-th page extracted
7725-th page extracted
7726-th page extracted
7727-th page extracted
7728-th page extracted
7729-th page extracted
7730-th page extracted
7731-th page extracted
7732-th page extracted
7733-th page extracted
7734-th page extracted
7735-th pag

In [5]:
# Sentence-embedding model; the same instance is later passed to the search helpers to encode queries.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Passing both file paths makes the constructor load the pickled corpus and the saved embeddings from disk.
embeddings = Embeddings(model, corpus_file_path='data/corpus.pickle', embeddings_file_path='data/embeddings.pt')
# Uncomment the two lines below to rebuild the embeddings and persist them.
# NOTE(review): these save paths lack the 'data/' prefix used by the load above — confirm which location is intended.
#embeddings.create()
#embeddings.save_to_files('corpus.pickle', 'embeddings.pt')

In [6]:
# Local (in-memory) semantic search over the loaded corpus and embeddings.
chat = Prediction(model, embeddings.corpus, embeddings.embeddings)

# Both values are read interactively; int() raises ValueError if the count is not a whole number.
query = input("Type your query: ")
count_results = int(input("How much results: "))
# Prints the best-matching corpus entries with their similarity scores.
chat.predict(query, count_results)

Type your query: View
How much results: 1




Query: View

Top 1 most similar sentences in corpus:

{'page': 691, 'text': 'Example for SQL Login - T ype your User name and password.\n5. Select the view usPopulationView, and then select Load.\n'}
 (Score: 0.4219)


In [7]:
# add pinecone
!pip install pinecone-client
!pip install python-dotenv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [34]:
import pinecone
import os

%reload_ext dotenv
%dotenv

# API key is read from the environment so it never appears in the notebook;
# presumably it is populated from a .env file by the %dotenv magic above — TODO confirm.
# os.environ.get returns None when the variable is missing.
PINECONE_API_TOKEN = os.environ.get("PINECONE_API_TOKEN")
# Pinecone environment (region) and the name of the index used by the helpers below.
PINECONE_ENVIRONMENT = 'us-west1-gcp'
PINECONE_INDEX = 'super-brain'

# Configure the Pinecone client once for the whole session.
pinecone.init(api_key=PINECONE_API_TOKEN, environment=PINECONE_ENVIRONMENT)

In [113]:
import uuid
import itertools

def chunks(iterable, batch_size=100):
    """Lazily yield consecutive lists of at most ``batch_size`` items from ``iterable``.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)

    def next_batch():
        return list(itertools.islice(source, batch_size))

    # iter(callable, sentinel) keeps calling next_batch until it returns an empty list.
    yield from iter(next_batch, [])

def prepare_embeddings(embeddings, file_name='azure_synapse_analytics_documentation'):
    """Lazily build ``(id, vector, metadata)`` tuples for upserting.

    Args:
        embeddings: Iterable of objects exposing ``tolist()``, one per page.
        file_name: Value stored under the ``'file'`` metadata key. Defaults to
            the document this notebook was written for.

    Returns:
        A generator of ``(uuid4 string, vector as list, metadata dict)``
        tuples. ``metadata['page']`` is the position of the embedding in
        ``embeddings``.
    """
    return (
        (str(uuid.uuid4()), embedding.tolist(), {'file': file_name, 'page': page})
        for page, embedding in enumerate(embeddings)
    )
        

def upsert_to_pinecone(index, embeddings, pool_threads=30, batch_size=100):
    """Upsert every embedding into the Pinecone index named ``index``.

    Vectors are sent in batches of ``batch_size`` as asynchronous requests on
    a client created with ``pool_threads``. The collected responses are
    printed; nothing is returned.
    """
    with pinecone.Index(index, pool_threads=pool_threads) as client:
        batches = chunks(prepare_embeddings(embeddings), batch_size=batch_size)
        # Dispatch all batches first so the requests can run in parallel.
        pending = []
        for batch in batches:
            pending.append(client.upsert(batch, async_req=True))
        # get() blocks until the request completes and raises if it failed.
        responses = []
        for request in pending:
            responses.append(request.get())
        print(responses)


def _first_component(embedding):
    """Return the first component of ``embedding``, rounded to 10 decimals for comparison."""
    return round(embedding.tolist()[0], 10)


def _metadata_page(match):
    """Return the ``page`` stored in a match's metadata as an int, or None if absent or unusable."""
    try:
        # int() also accepts a float such as 691.0, in case the service returns the number that way.
        return int(match["metadata"]["page"])
    except (KeyError, AttributeError, TypeError, ValueError):
        return None


def query_pinecone(index_name, query, model, embeddings, corpus, limit=5):
    """Query a Pinecone index and map each match back to its corpus entry.

    Args:
        index_name: Name of the Pinecone index to query.
        query: Text to search for; encoded with ``model.encode``.
        model: Object whose ``encode(query)`` result exposes ``tolist()``.
        embeddings: Local embeddings, position-aligned with ``corpus``.
        corpus: Local corpus entries.
        limit: Maximum number of matches requested from the index.

    Returns:
        A list of ``{"match": <score>, "text": <corpus entry>}`` dicts in the
        order the matches were returned.

    Each match is resolved through the ``page`` number stored in its metadata
    when the vector was upserted, after checking it against the local
    embedding at that position. Matches without usable metadata fall back to
    scanning all local embeddings for an equal (rounded) first component.
    """
    index = pinecone.Index(index_name)
    query_embedding = model.encode(query).tolist()
    query_response = index.query(
        vector=query_embedding,
        top_k=limit,
        include_values=True,
        include_metadata=True
    )['matches']

    usable = min(len(corpus), len(embeddings))
    scan_keys = None  # rounded first components, built only if a fallback scan is needed
    results = []
    for match in query_response:
        values = match["values"]

        # Fast path: the stored page number points straight at the corpus entry.
        page = _metadata_page(match)
        if (
            page is not None
            and 0 <= page < usable
            and _first_component(embeddings[page]) == values[0]
        ):
            results.append({"match": match["score"], "text": corpus[page]})
            continue

        # Fallback for vectors stored without a usable page number.
        if scan_keys is None:
            scan_keys = [_first_component(embedding) for embedding in embeddings]
        for ind, key in enumerate(scan_keys):
            if key == values[0]:
                results.append(
                    {
                        "match": match["score"],
                        "text": corpus[ind]
                    }
                )
    return results

In [114]:
#upsert_to_pinecone(PINECONE_INDEX, embeddings.embeddings)

In [117]:
import pprint

# Search the remote index, then map each match back to its page via the local embeddings and corpus.
results = query_pinecone(PINECONE_INDEX, 'Create External View', model, embeddings.embeddings, embeddings.corpus)
# pprint keeps the nested {'match', 'text'} dicts readable.
pprint.pprint(results)

[{'match': 0.419137,
  'text': {'page': 691,
           'text': 'Example for SQL Login - T ype your User name and '
                   'password.\n'
                   '5. Select the view usPopulationView, and then select '
                   'Load.\n'}},
 {'match': 0.393314809,
  'text': {'page': 685,
           'text': 'the following script you can see how to add a new user '
                   'that will be authenticated using\n'
                   'Azure AD identity:\n'
                   'SQL\n'
                   'Instead of Azure AD principals, you can create SQL '
                   'principals that authenticate with the\n'
                   'login name and password.\n'
                   'SQL\n'
                   'In both cases, you can assign permissions to the users.\n'
                   'SQL\n'
                   'The security rules depend on your security policies. Some '
                   'generic guidelines are:\n'
                   'You should deny ADMINISTER DATAB