In [8]:
#!pip install -U chromadb langchain  rank_bm25 sentence_transformers


In [37]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import BM25Retriever
import pandas as pd


class DataAnalyzer:
    """Two-stage keyword retriever over a table of articles.

    Pipeline implemented by :meth:`analyze_data`:

    1. Fit a TF-IDF model on ``data["Text"]`` and rank the articles by cosine
       similarity to the query.
    2. Keep every article whose score is at least ``alfa`` times the best score.
    3. Split the kept articles into chunks and, per article, let BM25 pick the
       most relevant chunks (how many is decided by the same ``alfa`` rule,
       applied to the TF-IDF scores of that article's chunks).
    4. Run one more BM25 pass over all surviving chunks and return the top
       ``final_k`` of them.
    """

    def __init__(self, chunk_size=200, stop_words='english', alfa=0.5,
                 min_accuracy=0.1, final_k=1):
        """Configure the retriever.

        Args:
            chunk_size: Maximum chunk length (measured with ``len``) handed to
                the text splitter.
            stop_words: Forwarded to ``TfidfVectorizer(stop_words=...)``.
            alfa: Fraction of the best similarity score that a candidate must
                reach to be kept (see :meth:`determine_wall_index`).
            min_accuracy: Minimum best-article similarity required before any
                answer is produced (see :meth:`check_accuracy_threshold`).
            final_k: Number of chunks returned by the final BM25 pass when
                :meth:`retrieve_final_documents` is not given an explicit ``k``.
        """
        self.vectorizer = TfidfVectorizer(stop_words=stop_words)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=0, length_function=len
        )
        self.alfa = alfa
        self.min_accuracy = min_accuracy
        self.final_k = final_k

    def analyze_data(self, query, data):
        """Return the text chunk(s) of ``data`` that best match ``query``.

        Args:
            query: Search string.
            data: Table exposing a ``"Text"`` column and ``.iloc`` (for example
                a pandas DataFrame).

        Returns:
            The selected chunks concatenated without a separator, or ``None``
            (after printing a message) when no article reaches the minimum
            similarity.
        """
        tfidf_matrix = self.calculate_tfidf_matrix(data)
        query_vector = self.vectorizer.transform([query])
        similarity_scores = cosine_similarity(query_vector, tfidf_matrix)
        sorted_indices, sorted_accuracy = self.sort_similarity_scores(similarity_scores)

        if not self.check_accuracy_threshold(sorted_accuracy):
            print("no article that fits well")
            return None

        wall_index = self.determine_wall_index(sorted_accuracy)
        selected_text = self.select_text(data, sorted_indices, wall_index)
        chunked_data = self.split_text_into_chunks(selected_text)
        to_final_selection = self.analyze_chunks(chunked_data, query)
        final_ans = self.retrieve_final_documents(to_final_selection, query)
        return "".join(final_ans)

    def calculate_tfidf_matrix(self, data):
        """Fit the TF-IDF vectorizer on ``data["Text"]`` and return the matrix."""
        # Missing cells are replaced by empty strings so the vectorizer is only
        # ever handed text documents.
        return self.vectorizer.fit_transform(data["Text"].fillna(""))

    def sort_similarity_scores(self, similarity_scores):
        """Sort the first row of a 2-D score array in descending order.

        Returns:
            ``(sorted_indices, sorted_accuracy)``: the column indices ordered
            from best to worst score, and the scores in that same order.
        """
        row = np.asarray(similarity_scores)[0]
        sorted_indices = np.argsort(row)[::-1]
        # Index with the permutation so indices and scores cannot disagree.
        sorted_accuracy = row[sorted_indices]
        return sorted_indices, sorted_accuracy

    def check_accuracy_threshold(self, sorted_accuracy):
        """Return True when the best score reaches ``self.min_accuracy``.

        An empty score array counts as "no match".
        """
        if len(sorted_accuracy) == 0:
            return False
        return bool(sorted_accuracy[0] >= self.min_accuracy)

    def determine_wall_index(self, sorted_accuracy):
        """Count the leading scores that are at least ``alfa * best score``.

        Args:
            sorted_accuracy: Scores ordered from highest to lowest.

        Returns:
            The number of leading entries to keep: at least 1 for non-empty
            input (the best entry is always kept), 0 for empty input.
        """
        if len(sorted_accuracy) == 0:
            return 0
        wall_index = 1
        wall_accuracy = sorted_accuracy[0] * self.alfa
        for score in sorted_accuracy[1:]:
            if score < wall_accuracy:
                break
            wall_index += 1
        return wall_index

    def select_text(self, data, sorted_indices, wall_index):
        """Return the ``"Text"`` values of the first ``wall_index`` ranked rows."""
        article_to_search = sorted_indices[:wall_index]
        return data["Text"].iloc[article_to_search].tolist()

    def split_text_into_chunks(self, selected_text):
        """Split every text with the configured splitter; one chunk list per text."""
        return [self.text_splitter.split_text(text) for text in selected_text]

    def analyze_chunks(self, chunked_data, query):
        """Pick, per article, the chunks most relevant to ``query``.

        For each list of chunks the TF-IDF similarity to the query decides how
        many chunks to keep (see :meth:`determine_wall_index`); BM25 then
        chooses which ones.

        Args:
            chunked_data: Iterable of chunk lists, one list per article.
            query: Search string.

        Returns:
            A flat list with the selected chunk texts of all articles.
        """
        to_final_selection = []
        # The query vector does not depend on the article, so compute it once.
        query_vector = self.vectorizer.transform([query])
        for chunks in chunked_data:
            if not chunks:
                # Nothing to rank for an article that produced no chunks.
                continue
            chunk_matrix = self.vectorizer.transform(chunks)
            chunk_scores = cosine_similarity(query_vector, chunk_matrix)
            sorted_chunk_scores = np.sort(chunk_scores)[0][::-1]
            chunk_wall_index = self.determine_wall_index(sorted_chunk_scores)
            bm25_retriever = BM25Retriever.from_texts(chunks)
            # Use the wall index computed for THIS article's chunks; the
            # previous code read an undefined name here.
            bm25_retriever.k = chunk_wall_index
            to_final_selection += self.Doctranslate(
                bm25_retriever.get_relevant_documents(query)
            )
        return to_final_selection

    def retrieve_final_documents(self, to_final_selection, query, k=None):
        """Run the final BM25 pass over the candidate chunks.

        Args:
            to_final_selection: Candidate chunk texts.
            query: Search string.
            k: Number of chunks to return; defaults to ``self.final_k``. The
                value is clamped to ``1..len(to_final_selection)``.

        Returns:
            List of the selected chunk texts (empty when there are no
            candidates).
        """
        if not to_final_selection:
            return []
        if k is None:
            k = self.final_k
        bm25_retriever_final = BM25Retriever.from_texts(to_final_selection)
        bm25_retriever_final.k = max(1, min(int(k), len(to_final_selection)))
        return self.Doctranslate(bm25_retriever_final.get_relevant_documents(query))

    def Doctranslate(self, doc):
        """Extract ``page_content`` from every document in ``doc``."""
        return [item.page_content for item in doc]





In [38]:
# Demo run: ask the analyzer for the passage in "/medium.csv" that best
# matches the query. The CSV must provide a "Text" column.
# NOTE(review): "/medium.csv" is an absolute path at the filesystem root —
# confirm it exists in the environment where this notebook is executed.
query = "humans evolution"
analyzer = DataAnalyzer()
data = pd.read_csv("/medium.csv")

# Last expression of the cell: the notebook displays the returned text.
analyzer.analyze_data(query=query, data=data)

'most sophisticated AI’s. Of course, it is a really complicated problem — it took evolution 4 Billion years to create humans and I still manage to look for my keys for ten minutes just to realize they'