In [46]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.http.models import Distance, VectorParams, PointStruct 
from qdrant_client.models import Filter
from qdrant_client.http import models
import numpy as np
import pandas as pd
import configparser

In [22]:
# Load project settings from the local config file.
# ConfigParser.read() silently skips a file it cannot open, which would only
# surface later as a confusing KeyError when a section is looked up. Opening
# the file explicitly makes a missing/unreadable config fail right here.
config = configparser.ConfigParser()
with open('../config.cfg') as config_file:
    config.read_file(config_file)

# Sentence-embedding model; `encoder` is what the rest of the notebook uses
# to turn text into vectors.
model_name = "dangvantuan/sentence-camembert-large"
encoder = SentenceTransformer(model_name_or_path=model_name)

No sentence-transformers model found with name C:\Users\gaelp/.cache\torch\sentence_transformers\dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


In [23]:
# Connection settings come from the [QDRANT] section of the config file.
qdrant_settings = config['QDRANT']
QDRANT_HOST = qdrant_settings['host']
# configparser hands every value back as a string, while QdrantClient's
# `port` argument is documented as an integer. Convert it here; a blank
# entry is mapped to None (no explicit port) instead of raising ValueError.
_raw_port = qdrant_settings['port'].strip()
QDRANT_PORT = int(_raw_port) if _raw_port else None
QDRANT_API_KEY = qdrant_settings['qdrant_api_key']

client = QdrantClient(
    url=QDRANT_HOST,
    port=QDRANT_PORT,
    api_key=QDRANT_API_KEY,
)

In [36]:
# Read the processed articles; the first CSV column is used as the index.
articles_csv = '../data/processed/articles.csv'
df = pd.read_csv(articles_csv, index_col=0)

In [37]:
# Move the current index back into a regular column and renumber the rows.
# NOTE: not idempotent - re-running this cell adds another index column.
df = df.reset_index(drop=False)

In [38]:
# Rename the columns positionally; the frame must have exactly five columns
# at this point or the assignment raises.
article_columns = ['newsId', 'author', 'title', 'publishedAt', 'content']
df.columns = article_columns

In [39]:
def generate_item_sentence(item: pd.Series, text_columns=("title",)) -> str:
    """Build the text to embed for a single row.

    Args:
        item: One row of the articles frame.
        text_columns: Names of the columns whose values are concatenated,
            in order. Any iterable of column names is accepted.

    Returns:
        The selected column values joined by a single space. Missing values
        (None / NaN) are skipped, so a row with no usable text yields "".

    Raises:
        KeyError: If a requested column is not present in ``item``.
    """
    parts = []
    for column in text_columns:
        value = item[column]
        # A missing cell would otherwise make str.join raise TypeError.
        if pd.isna(value):
            continue
        # Coerce so non-string cells (e.g. numbers) can be joined too.
        parts.append(str(value))
    return ' '.join(parts)

In [40]:
# Text that represents each article for embedding purposes.
df["sentence"] = df.apply(generate_item_sentence, axis=1)

# Encode every sentence in one batched call instead of one model invocation
# per row; the encoder batches internally, which is substantially faster.
# The result is split back into one vector per row so each cell still holds
# a single array.
sentence_embeddings = encoder.encode(df["sentence"].tolist())
df["sentence_embedding"] = list(sentence_embeddings)

In [None]:
# Create the collection that will store the article vectors. The vector size
# is taken from the encoder so it always matches the embeddings; similarity
# is measured with cosine distance.
embedding_size = encoder.get_sentence_embedding_dimension()
article_vectors_config = VectorParams(size=embedding_size, distance=Distance.COSINE)

client.create_collection(
    collection_name="articles_fr_newsapi",
    vectors_config=article_vectors_config,
)

In [42]:
# Every column except the id and the derived text/embedding goes into the payload.
metadata_columns = df.drop(["newsId", "sentence", "sentence_embedding"], axis=1).columns


def _is_missing(value) -> bool:
    """Return True when ``value`` is a missing marker (None, NaN, NaT, pd.NA)."""
    # Only scalars are tested with pd.isna; on a list/array it would return
    # an array and could not be used as a condition.
    return value is None or (pd.api.types.is_scalar(value) and bool(pd.isna(value)))


def create_vector_point(item: pd.Series) -> PointStruct:
    """Turn one row of the frame into a Qdrant ``PointStruct``.

    Args:
        item: A row holding ``newsId``, ``sentence_embedding`` and the
            metadata columns listed in ``metadata_columns``.

    Returns:
        A point whose id is the row's ``newsId``, whose vector is the row's
        embedding as a plain list, and whose payload contains every metadata
        field that is not missing.
    """
    # Compare on the value itself rather than on its string form, so real
    # strings such as "None" or "nan" are kept while NaT / pd.NA are dropped.
    payload = {
        field: item[field]
        for field in metadata_columns
        if not _is_missing(item[field])
    }
    return PointStruct(
        id=item["newsId"],
        vector=item["sentence_embedding"].tolist(),
        payload=payload,
    )


points = df.apply(create_vector_point, axis=1).tolist()

In [45]:
# Upload the points in batches of at most CHUNK_SIZE so no single request
# carries the whole dataset.
CHUNK_SIZE = 500
n_chunks = int(np.ceil(len(points) / CHUNK_SIZE))

# Plain list slicing keeps the points as a list of PointStruct objects (no
# round-trip through a numpy object array) and simply does nothing when
# `points` is empty, where np.array_split would raise on 0 sections.
for start in range(0, len(points), CHUNK_SIZE):
    client.upsert(
        collection_name="articles_fr_newsapi",
        wait=True,  # block until the batch has been applied
        points=points[start:start + CHUNK_SIZE],
    )

In [47]:
# Embed the free-text query with the same encoder as the articles, then
# convert the array to a plain list for the client call.
query_text = "Hamas"
query_embedding = encoder.encode(query_text)
query_vector = query_embedding.tolist()

In [48]:
# Similarity search over the article collection, unfiltered.
# NOTE(review): "newsId" is requested here but is stored as the point id, not
# as a payload field, so presumably it never appears in the returned payload.
payload_fields = ["newsId", "title", "content"]
client.search(
    collection_name="articles_fr_newsapi",
    query_vector=query_vector,
    query_filter=None,
    with_payload=payload_fields,
)

[ScoredPoint(id=28, version=0, score=0.39190134, payload={'content': "Tsahal va utiliser une nouvelle arme contre le Hamas\xa0: l'eau pompée de la Méditerranée pour inonder le vaste réseau de 500\xa0km de tunnels creusés par Hamas dans la bande de Gaza. Les tests effectués c… [+3614 chars]", 'title': "Gaza\xa0: Israël s'apprête à inonder les tunnels du Hamas"}, vector=None, shard_key=None),
 ScoredPoint(id=67, version=0, score=0.29526114, payload={'content': 'Personne ne discute plus lampleur du bilan humain du carnage en cours à Gaza. Mais personne, parmi les responsables internationaux qui prétendent vouloir apporter des réponses au conflit israélo-pale… [+3387 chars]', 'title': 'A Gaza, la guerre sans fin de Benyamin Nétanyahou'}, vector=None, shard_key=None),
 ScoredPoint(id=17, version=0, score=0.28084984, payload={'content': 'La bande de Gaza na pas connu de répit depuis les années\xa01950. Coincé entre Israël et lEgypte, soumis à une succession de guerres et doccupations, ce mor