# Intro 

This notebook shows the steps for loading information into a Qdrant database:

- **Create a client** for the Qdrant database
- **Create a collection**: the database that will store two types of information

    - Vectors: Embeddings -> Numerical vectors of the pieces of text
    
    - Payload: Additional information to consider in the search

- **Create the embeddings**, in other words the vectors that represent the information
- **Load the information into the Qdrant collection**: this collection will drive the model

In [1]:
import json

import numpy as np
import pandas as pd
import torch

from qdrant_client import QdrantClient
from qdrant_client.http import models 
from sentence_transformers import SentenceTransformer
import os

# Create a client

In [2]:

# Connection settings come from the process environment so that credentials
# never live inside the notebook. Export QDRANT_API_KEY (and optionally
# QDRANT_HOST) before starting the kernel.
# SECURITY: an API key used to be hard-coded in this cell. Treat that key as
# compromised and rotate it in the Qdrant Cloud console.
os.environ.setdefault(
    'QDRANT_HOST',
    'https://c670fc3d-8a23-4b19-8b93-23e8f92cc2fe.europe-west3-0.gcp.cloud.qdrant.io:6333',
)

# Fail early with a clear message instead of building a client without a key.
if not os.getenv('QDRANT_API_KEY'):
    raise RuntimeError(
        'QDRANT_API_KEY is not set; export it in the environment before running this notebook.'
    )

client = QdrantClient(
    os.environ['QDRANT_HOST'],
    api_key=os.environ['QDRANT_API_KEY'],
)

# Create the collection 

In [3]:


# Publish the collection name through the environment; the cells further down
# read it back from QDRANT_COLLECTION_NAME.
os.environ['QDRANT_COLLECTION_NAME'] = 'care-collection'

# Vector settings for the collection. The embeddings are produced with
# sentence-transformers/msmarco-MiniLM-L-6-v3.
vectors_config = models.VectorParams(
    distance=models.Distance.COSINE,
    size=384,  # dimension of vectors returned by the embeddings model
)

# NOTE(review): going by its name, recreate_collection replaces any existing
# collection with this name - confirm before running against real data.
client.recreate_collection(
    vectors_config=vectors_config,
    collection_name=os.environ['QDRANT_COLLECTION_NAME'],
)

True

# Create the vectors

### Load the dataset where there is text data and the payload

In [16]:



# Location of the reviews CSV. Set the REVIEWS_CSV_PATH environment variable to
# run the notebook on another machine; the fallback is the original local path.
reviews_csv_path = os.getenv(
    'REVIEWS_CSV_PATH',
    '/Users/personas/Dropbox/care_homes_university/sentiment/ltc-reviews/reviews_analysis/data/reviews_greater_manchester_complete.csv',
)
df = pd.read_csv(reviews_csv_path)

# 'text' is a copy of the review body stored in 'rev'.
df['text'] = df['rev']
metadata_df = df[['rev', 'text', 'date_review', 'care_home', 'provider_name', 'code']]
# Keep only reviews dated after 2020-01-01 to meet the batch size limit.
# NOTE(review): date_review is compared against a string literal here; this
# assumes the column holds ISO 'YYYY-MM-DD' strings - confirm.
metadata_df = metadata_df[(metadata_df['date_review'] > "2020-01-01")]
display(metadata_df.head(5))
display(len(metadata_df))



Unnamed: 0,rev,text,date_review,care_home,provider_name,code
0,My mother-in-law has settled really well at Au...,My mother-in-law has settled really well at Au...,2021-01-08,Auden House care home,Auden House Care Limited,M34 5PS
1,Auden House is a breath of fresh air. It provi...,Auden House is a breath of fresh air. It provi...,2020-12-15,Auden House care home,Auden House Care Limited,M34 5PS
2,My father moved into the care home in May 2020...,My father moved into the care home in May 2020...,2020-12-15,Auden House care home,Auden House Care Limited,M34 5PS
3,My Mum has been a resident in Auden House for ...,My Mum has been a resident in Auden House for ...,2020-12-15,Auden House care home,Auden House Care Limited,M34 5PS
4,Our mum originally went into Auden House in Ju...,Our mum originally went into Auden House in Ju...,2020-12-15,Auden House care home,Auden House Care Limited,M34 5PS


1436

### Load the model, create the embeddings and an index

In [6]:
# Embedding model: msmarco-MiniLM-L-6-v3.
# Device preference: CUDA when available, otherwise MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

model = SentenceTransformer("msmarco-MiniLM-L-6-v3", device=device)

# Encode the review texts in consecutive chunks of `batch_size` documents and
# join the per-chunk results into a single array (one row per review).
batch_size = 512
docs = metadata_df["rev"].to_list()

vectors = np.concatenate(
    [
        model.encode(docs[start:start + batch_size])
        for start in range(0, len(docs), batch_size)
    ]
)


In [24]:
# Sanity check on the embeddings: show how many there are, then the array
# itself. (A leading tuple expression was removed: its value was never shown
# because only a cell's last expression is displayed.)
display(len(vectors))
vectors

1436

array([[ 0.23375982, -0.1051191 , -0.09238673, ...,  0.2002763 ,
        -0.02410727, -0.21097781],
       [-0.07133141, -0.17226437,  0.05197802, ...,  0.3818317 ,
        -0.29250535, -0.25625592],
       [ 0.1434832 , -0.05654013,  0.06152335, ...,  0.3676012 ,
        -0.12712899,  0.0238701 ],
       ...,
       [-0.02019383,  0.06343129,  0.49209467, ...,  0.08434742,
        -0.03095722,  0.06525056],
       [ 0.528846  , -0.16134685, -0.16229698, ..., -0.05249184,
         0.04291157,  0.24380703],
       [ 0.27887225, -0.20670782,  0.02902296, ...,  0.03530879,
         0.01489652, -0.36518615]], dtype=float32)

In [23]:
# Cast the embeddings to 32-bit floats, then report count, shape and dtype.
vectors = vectors.astype(np.float32)
n_vectors = len(vectors)
n_vectors, vectors.shape, vectors.dtype

(1436, (1436, 384), dtype('float32'))

In [13]:
# Create an index: one sequential integer id (0, 1, 2, ...) per row of
# metadata_df, used later as the point ids.
n_points = len(metadata_df)
index_mod = list(range(n_points))
len(index_mod)

1436

In [17]:
# Create the payloads: one dict per review (a list of records) holding the
# text plus the metadata columns stored next to each vector.
payload = metadata_df[['text','date_review', 'care_home', 'provider_name', 'code']].to_dict(orient="records")

# Preview the first two payloads. A trailing `payload[0]['text'].astype(str)`
# was removed: each 'text' value is a plain Python str, which has no `.astype`,
# so the call raised AttributeError and hid this preview.
payload[:2]


[{'text': 'My mother-in-law has settled really well at Auden House. All contact with the home although usually not face to face because of the Covid crisis has been extremely positive nothing is too much trouble. We are extremely satisfied.',
  'date_review': '2021-01-08',
  'care_home': 'Auden House care home',
  'provider_name': 'Auden House Care Limited',
  'code': 'M34 5PS'},
 {'text': "Auden House is a breath of fresh air. It provides a safe, caring and friendly home for a treasured Aunt and allows her family and friends to rest easy in the knowledge that she is happy and well cared for. Her individual quirks and foibles are accepted gracefully and with good humour and my aunt has flourished since she has been a resident. I couldn't recommend Auden House more highly. The staff are always helpful and clearly prioritise the wellbeing of their residents above all else. The property and garden are all we could wish for in this handy and pleasant location. Thank you everyone working at

# Adding points to the database

The points are the central entity that Qdrant operates with. A point is a record consisting of a vector and an optional payload.

- Vector: numerical representation of text
- Payload: additional information to the vector

In [18]:
# Bundle ids, vectors and payloads into a single batch. The three sequences
# are built from the same metadata_df rows, so position i of each one
# describes the same review.
points_batch = models.Batch(
    ids=index_mod,
    vectors=vectors.tolist(),  # convert the NumPy array to nested Python lists
    payloads=payload,
)

# Upload the batch to the collection; the call's result is the cell output.
client.upsert(
    collection_name=os.environ['QDRANT_COLLECTION_NAME'],
    points=points_batch,
)


UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [21]:
# Spot-check the upload: fetch a few points by id and include their vectors.
sample_ids = [0, 100, 1000]
client.retrieve(
    ids=sample_ids,
    with_vectors=True,  # the default is False
    collection_name=os.environ['QDRANT_COLLECTION_NAME'],
)


[Record(id=100, payload={'care_home': 'Avonleigh Gardens care home', 'code': 'OL1 4HT', 'date_review': '2020-06-12', 'provider_name': 'Methodist Homes', 'text': "I think you all are doing a brilliant job with my wife and all the residents. You all deserve medals. Reading the reviews, I think what other people have written is what I'm thinking too. So thanks again."}, vector=[0.015682852, 0.00072113075, 0.054558348, -0.022413597, -0.0018751859, 0.024194222, 0.01820496, -0.06553036, -0.07232475, -0.0065110438, -0.06660734, -0.0035907894, -0.025015058, 0.025189197, -0.010422286, 0.018539064, 0.014190554, 0.03456997, -0.122880414, 0.024938138, -0.137664, 0.02718843, 0.0023287907, 0.09651994, -0.034982976, 0.013059309, -0.0042842147, 0.063319996, -0.073378436, -0.01486554, -0.07441012, -0.04840662, 0.06612397, -0.008279531, -0.06639515, 0.09603335, -0.024393342, 0.003222545, -0.038819738, -0.029137202, 0.021889769, 0.0017680416, 0.05184927, -0.035655774, -0.040645584, -0.07200059, 0.0573090

In [20]:
# Browse the collection: request up to 10 points with their payloads and
# without the vectors.
scroll_options = dict(limit=10, with_payload=True, with_vectors=False)
client.scroll(
    collection_name=os.environ['QDRANT_COLLECTION_NAME'],
    **scroll_options,
)

([Record(id=0, payload={'care_home': 'Auden House care home', 'code': 'M34 5PS', 'date_review': '2021-01-08', 'provider_name': 'Auden House Care Limited', 'text': 'My mother-in-law has settled really well at Auden House. All contact with the home although usually not face to face because of the Covid crisis has been extremely positive nothing is too much trouble. We are extremely satisfied.'}, vector=None),
  Record(id=1, payload={'care_home': 'Auden House care home', 'code': 'M34 5PS', 'date_review': '2020-12-15', 'provider_name': 'Auden House Care Limited', 'text': "Auden House is a breath of fresh air. It provides a safe, caring and friendly home for a treasured Aunt and allows her family and friends to rest easy in the knowledge that she is happy and well cared for. Her individual quirks and foibles are accepted gracefully and with good humour and my aunt has flourished since she has been a resident. I couldn't recommend Auden House more highly. The staff are always helpful and cle

In [204]:
# Show the collection's status and configuration as reported by the server.
collection_name = os.environ['QDRANT_COLLECTION_NAME']
client.get_collection(collection_name=collection_name)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=1436, indexed_vectors_count=0, points_count=1436, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})