In [1]:
!pip install -U chromadb langchain  rank_bm25 sentence_transformers




In [2]:
import uuid

import numpy as np
import pandas as pd
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers import ParentDocumentRetriever
from langchain.schema import Document
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class ArticleSearch:
    """Two-stage search over a CSV of articles.

    Stage 1 ranks every article against the query with TF-IDF cosine
    similarity and keeps the best-scoring ones. Stage 2 splits the kept
    articles into chunks, embeds them into a Chroma vector store and returns
    the text of the chunk most similar to the query.

    The CSV is expected to contain ``Title`` and ``Text`` columns.
    """

    def __init__(self, model_name, query, first_chanker_size=5000, secount_chanker_size=1000,
                 device='cuda', stop_words='english', alfa=0.5, min_score=0.1):
        """Store the search configuration; nothing is loaded or computed here.

        Args:
            model_name: Model identifier passed to ``HuggingFaceBgeEmbeddings``.
            query: The search query string.
            first_chanker_size: Chunk size (characters) of the parent splitter.
            secount_chanker_size: Chunk size (characters) of the child splitter.
            device: Device handed to the embedding model (e.g. ``'cuda'``, ``'cpu'``).
            stop_words: ``stop_words`` argument for ``TfidfVectorizer``.
            alfa: Fraction of the best TF-IDF score an article must reach to be
                kept for the second stage.
            min_score: Minimum best TF-IDF score required to consider the
                search successful at all.
        """
        self.model_name = model_name
        self.query = query
        self.first_chanker_size = first_chanker_size
        self.secount_chanker_size = secount_chanker_size
        self.device = device
        self.stop_words = stop_words
        self.alfa = alfa
        self.min_score = min_score
        self.data = None
        self.bge_embeddings = None
        self.vectorstore = None

    def load_data(self, file_path):
        """Read the articles CSV at ``file_path`` into ``self.data``."""
        self.data = pd.read_csv(file_path)

    def preprocess_data(self):
        """Return one "<Title> <Text>" string per article as a pandas Series.

        Missing titles or texts are treated as empty strings; otherwise the
        concatenation would be NaN, which ``TfidfVectorizer`` cannot process.
        """
        titles = self.data['Title'].fillna('').astype(str)
        texts = self.data['Text'].fillna('').astype(str)
        return titles + ' ' + texts

    def calculate_similarity(self, corpus):
        """Return the TF-IDF cosine similarity of the query to every document.

        The result has one row (the query) and one column per corpus entry.
        """
        vectorizer = TfidfVectorizer(stop_words=self.stop_words)
        tfidf_matrix = vectorizer.fit_transform(corpus)
        query_vector = vectorizer.transform([self.query])
        return cosine_similarity(query_vector, tfidf_matrix)

    def search_best_article(self, similarity_scores):
        """Pick the article indices worth passing to the second stage.

        Args:
            similarity_scores: 2-D array whose first row holds one score per
                article (as returned by ``calculate_similarity``).

        Returns:
            Array of article indices ordered from best to worst score, holding
            every article whose score is at least ``alfa`` times the best
            score (the best article is always included). ``None`` when there
            are no scores or the best score is below ``min_score``.
        """
        row = np.asarray(similarity_scores)[0]
        if row.size == 0:
            print("no article that fits well")
            return None

        # A single argsort yields both the ranking and the ranked scores.
        order = np.argsort(row)[::-1]
        ranked = row[order]
        best = ranked[0]
        if best < self.min_score:
            print("no article that fits well")
            return None

        # Scores are in descending order, so those reaching the cutoff form a
        # prefix; the top article is kept even if alfa > 1.
        cutoff = best * self.alfa
        keep = max(1, int(np.count_nonzero(ranked >= cutoff)))
        return order[:keep]

    def retrieve_documents(self, article_indices):
        """Wrap the ``Text`` of the selected articles in ``Document`` objects.

        Articles whose text is missing are skipped because a ``Document``
        needs string content.
        """
        docs = []
        for i in article_indices:
            text = self.data.iloc[i]["Text"]
            if pd.isna(text):
                continue
            docs.append(Document(page_content=str(text)))
        return docs

    def splitter_maker(self, size):
        """Build a non-overlapping character splitter with chunks of ``size``."""
        return RecursiveCharacterTextSplitter(
            chunk_size=size,
            chunk_overlap=0,
            length_function=len,
        )

    def search_articles(self, file_path):
        """Run the full search and return the best matching chunk of text.

        Args:
            file_path: Path of the articles CSV.

        Returns:
            The content of the chunk most similar to the query, or the string
            ``"No article found"`` when nothing suitable exists.
        """
        self.load_data(file_path)
        corpus = self.preprocess_data()
        if len(corpus) == 0:
            return "No article found"

        similarity_scores = self.calculate_similarity(corpus)
        article_to_search = self.search_best_article(similarity_scores)
        if article_to_search is None:
            return "No article found"

        docs = self.retrieve_documents(article_to_search)
        if not docs:
            return "No article found"

        self.bge_embeddings = HuggingFaceBgeEmbeddings(
            model_name=self.model_name,
            model_kwargs={'device': self.device},
            encode_kwargs={'normalize_embeddings': True}
        )
        parent_splitter = self.splitter_maker(self.first_chanker_size)
        child_splitter = self.splitter_maker(self.secount_chanker_size)
        # Use a fresh collection per search. The default in-memory Chroma
        # client can share state between instances living in one process, so
        # a fixed name would let chunks indexed by an earlier search leak
        # into the results of a later one.
        collection_name = "split_parents_" + uuid.uuid4().hex
        self.vectorstore = Chroma(collection_name=collection_name, embedding_function=self.bge_embeddings)
        store = InMemoryStore()
        big_chunks_retriever = ParentDocumentRetriever(
            vectorstore=self.vectorstore,
            docstore=store,
            child_splitter=child_splitter,
            parent_splitter=parent_splitter,
        )
        big_chunks_retriever.add_documents(docs)
        # Query the vector store directly: this returns the small child chunks.
        sub_docs = self.vectorstore.similarity_search(self.query)
        if not sub_docs:
            return "No article found"
        return sub_docs[0].page_content



In [5]:
# Example usage: search the Medium articles CSV for a single query and print
# the best matching chunk of text.
file_path = "medium.csv"
model_name = "BAAI/bge-small-en-v1.5"
query = "Artificial Intelligence"  # query to search

article_search = ArticleSearch(model_name=model_name, query=query)
article_content = article_search.search_articles(file_path=file_path)
print(article_content)


Artificial intelligence (AI) is a loved and hated concept, an ambiguous suitcase word that is in no way close to being human or not human. A human is more than a series of programmed actions, yet we are also this. Questions are arising of sustainability in regards to artificial intelligence alongside new techniques in machine learning. The climate crisis can be worsened by irresponsible use of technology, particularly when large technology companies self-police or self-evaluate.
