In [12]:
%%capture --no-stderr
%pip install -U trafilatura pinecone-client[grpc] langchain_experimental langchain-ai21 langchain-pinecone langchain-nomic langchain_community langchainhub chromadb langchain nomic[local] langchain-text-splitters

In [3]:
import os
from getpass import getpass

# Turn on LangSmith tracing for the LangChain calls made in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Secrets must not be stored in the notebook. Each key is taken from the
# environment when already set (e.g. exported before starting the kernel);
# otherwise it is requested interactively, and the typed value is not echoed
# or saved in the notebook.
# NOTE(review): the keys that were previously hard-coded in this cell have
# been exposed and should be revoked/rotated.
for _secret_name in ("PINECONE_API_KEY", "AI21_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

# Scraping

- https://trafilatura.readthedocs.io/en/latest/
- https://www.diariodiunanalista.it/posts/chatbot-python-langchain-rag/
- https://www.diariodiunanalista.it/posts/come-scraperare-un-blog-e-raccogliere-i-suoi-articoli/

In [4]:
import pandas as pd
from tqdm import tqdm
import time
from trafilatura.sitemaps import sitemap_search
from trafilatura import fetch_url, extract, extract_metadata


def get_urls_from_sitemap(resource_url: str) -> list:
    """Look up the sitemap of a site through Trafilatura.

    Args:
        resource_url: Address handed unchanged to ``sitemap_search``.

    Returns:
        The value returned by ``sitemap_search`` for ``resource_url``.
    """
    return sitemap_search(resource_url)


def create_dataset(list_of_websites: list, delay: float = 0.5) -> pd.DataFrame:
    """Scrape every URL and collect text and metadata in a DataFrame.

    Args:
        list_of_websites: URLs of the pages to download.
        delay: Seconds to wait after each URL, to avoid hammering the sites.

    Returns:
        A DataFrame with the columns ``url``, ``body``, ``title`` and
        ``description`` (always present, even when no page was scraped).
        Duplicate rows and rows without an extracted body are removed.
        A missing title or description is stored as an empty string and
        does not discard the page.
    """
    columns = ["url", "body", "title", "description"]
    rows = []
    for url in tqdm(list_of_websites, desc="Websites"):
        html = fetch_url(url)
        if html is None:
            # Download failed: there is nothing to extract for this URL.
            time.sleep(delay)
            continue

        body = extract(html)

        # Metadata is best-effort: a failure here must not lose the article.
        try:
            metadata = extract_metadata(html)
        except Exception:
            metadata = None
        title = ""
        description = ""
        if metadata is not None:
            title = metadata.title or ""
            description = metadata.description or ""

        rows.append(
            {
                "url": url,
                "body": body,
                "title": title,
                "description": description,
            }
        )
        time.sleep(delay)

    # Passing the columns explicitly keeps the schema stable for empty input,
    # so the CSV written from the result can always be read back.
    df = pd.DataFrame(rows, columns=columns)
    df = df.drop_duplicates()
    # Only the body is mandatory: pages whose text could not be extracted
    # are useless downstream.
    df = df.dropna(subset=["body"])

    return df

In [5]:
# Articles that make up the main dataset.
urls = [
    "https://bmjgroup.com/celebrity-tweets-likely-shaped-us-negative-public-opinion-of-covid-19-pandemic/",
    "https://eu.usatoday.com/story/news/health/2024/07/26/covid-vaccine-us-china-propaganda/74555829007/",
    "https://www.theguardian.com/society/2023/jun/13/quarter-in-uk-believe-covid-was-a-hoax-poll-on-conspiracy-theories-finds",
]

# Scrape the articles and save the result as CSV; later cells re-read this
# file from the same path.
df = create_dataset(urls)
df.to_csv("/kaggle/working/dataset.csv", index=False)

Websites: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]


In [6]:
aspects = ["Health", "Technology", "Society"]
# One list of source URLs per aspect, in the same order as `aspects`.
urls_list = [
    [
        "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/covid-19-vaccines",
        "https://www.who.int/news-room/questions-and-answers/item/vaccines-and-immunization-vaccine-safety",
        "https://www.bbc.com/news/stories-52731624",
        "https://www.bbc.com/news/technology-52903680",
    ],
    [
        "https://www.worldbank.org/en/publication/wdr2022/brief/chapter-1-introduction-the-economic-impacts-of-the-covid-19-crisis",
    ],
    [],
]

# The two lists are paired positionally, so they must have the same length.
assert len(aspects) == len(urls_list), "each aspect needs its own URL list"

# Scrape each aspect's pages and save them to a per-aspect CSV file.
for aspect, aspect_urls in zip(aspects, urls_list):
    df = create_dataset(aspect_urls)
    df.to_csv(f"/kaggle/working/dataset_{aspect.lower()}_kbt.csv", index=False)

Websites: 100%|██████████| 4/4 [00:03<00:00,  1.04it/s]
Websites: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]


# Test splitters

text splitters: https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/
- https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/recursive_text_splitter/

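Il `RecursiveCharacterTextSplitter` suddivide il testo provando in ordine una lista di separatori (paragrafi, righe, spazi): in questo modo evita di spezzare le parole e tiene insieme, per quanto possibile, le porzioni di testo correlate.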

https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/semantic-chunker/

https://python.langchain.com/v0.1/docs/integrations/document_transformers/ai21_semantic_text_splitter/

In [21]:
# Prova recursivecharactertextsplitter
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain_experimental.text_splitter import SemanticChunker
from langchain_ai21 import AI21SemanticTextSplitter
from langchain_ai21 import AI21Embeddings

def _split_dataset(splitter):
    """Load the scraped dataset and split its ``body`` column with ``splitter``."""
    frame = pd.read_csv("/kaggle/working/dataset.csv")
    loader = DataFrameLoader(frame, page_content_column="body")
    return loader.load_and_split(text_splitter=splitter)


# Fixed-size splitting on characters
text_chunks_recursive = _split_dataset(
    RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, length_function=len)
)

# Seems to produce longer pieces
text_chunks_semantic = _split_dataset(
    SemanticChunker(AI21Embeddings(device="cuda"), breakpoint_threshold_type="percentile")
)

# Seems to produce smaller pieces
text_chunks_ai21 = _split_dataset(AI21SemanticTextSplitter(chunk_size=500))

# Number of chunks produced by each strategy, one per line
print(
    len(text_chunks_recursive),
    len(text_chunks_semantic),
    len(text_chunks_ai21),
    sep="\n",
)

9
32


# Creazione vectorstore

https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/

Creazione chunks

In [11]:
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def create_chunks(
    dataset: pd.DataFrame,
    text_splitter=None,
    chunk_size: int = 500,
    chunk_overlap: int = 0,
):
    """Split the dataset's articles into chunks enriched with their metadata.

    Args:
        dataset: DataFrame with the columns ``body``, ``title``,
            ``description`` and ``url``.
        text_splitter: Splitter applied to the ``body`` column. When omitted,
            a ``RecursiveCharacterTextSplitter`` is built from ``chunk_size``
            and ``chunk_overlap``.
        chunk_size: Chunk size for the default splitter; ignored when
            ``text_splitter`` is given.
        chunk_overlap: Overlap between consecutive chunks for the default
            splitter; ignored when ``text_splitter`` is given.

    Returns:
        list: The chunk documents. Each ``page_content`` is rewritten as
        TITLE / DESCRIPTION / BODY / URL, one field per line.
    """
    if text_splitter is None:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
        )

    text_chunks = DataFrameLoader(
        dataset, page_content_column="body"
    ).load_and_split(text_splitter)

    # Embed the metadata in the chunk text itself to make retrieval easier.
    for doc in tqdm(text_chunks):
        meta = doc.metadata
        # Every field goes on its own line. The previous "\D" / "\B" were
        # invalid escapes that put a literal backslash in the text.
        doc.page_content = (
            f"TITLE: {meta['title']}\n"
            f"DESCRIPTION: {meta['description']}\n"
            f"BODY: {doc.page_content}\n"
            f"URL: {meta['url']}"
        )

    return text_chunks

## Vector store

In [None]:
import pandas as pd
from langchain_pinecone import PineconeVectorStore
from langchain_ai21 import AI21Embeddings

index_name = "vectorstore"

# The splitter has to be passed by keyword: a positional argument placed
# after `dataset=...` is a SyntaxError.
# `SemanticChunker` is imported in the splitter-test cell above.
chunks = create_chunks(
    dataset=pd.read_csv("/kaggle/working/dataset.csv"),
    text_splitter=SemanticChunker(
        AI21Embeddings(device="cuda"), breakpoint_threshold_type="percentile"
    ),
)

# Embed the chunks and add them to the Pinecone index
vectorstore = PineconeVectorStore.from_documents(
    documents=chunks,
    #embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local", device="cuda"),
    embedding=AI21Embeddings(),
    index_name=index_name,
)

## KBT

In [14]:
import pandas as pd
from langchain_pinecone import PineconeVectorStore
from langchain_ai21 import AI21Embeddings

aspects = ["Health", "Technology", "Society"]

# Build one Pinecone index per aspect from the matching scraped dataset.
for aspect in aspects:
    index_name = f"{aspect.lower()}-kbt"
    dataset_path = f"/kaggle/working/dataset_{aspect.lower()}_kbt.csv"

    # An aspect without scraped pages leaves an empty CSV behind: skip it
    # instead of failing or uploading nothing.
    try:
        aspect_dataset = pd.read_csv(dataset_path)
    except pd.errors.EmptyDataError:
        aspect_dataset = pd.DataFrame()
    if aspect_dataset.empty:
        print(f"No documents for aspect '{aspect}', skipping index '{index_name}'")
        continue

    # The chunking parameters are carried by the splitter object, which is
    # the argument that create_chunks expects.
    chunks = create_chunks(
        dataset=aspect_dataset,
        text_splitter=RecursiveCharacterTextSplitter(
            chunk_size=100, chunk_overlap=0, length_function=len
        ),
    )

    vectorstore_KBT = PineconeVectorStore.from_documents(
        documents=chunks,
        #embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local", device="cuda"),
        embedding=AI21Embeddings(),
        index_name=index_name,
    )

100%|██████████| 30/30 [00:00<00:00, 159479.24it/s]
100%|██████████| 6/6 [00:00<00:00, 41187.93it/s]


# Verifica

In [None]:
import random

from pinecone.grpc import PineconeGRPC as Pinecone

# Initialise the Pinecone client with the API key taken from the environment
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "prova"

# Handle to the index to inspect
index = pc.Index(index_name)

# A query needs a vector of exactly the index's dimension. Rather than pasting
# a long literal that only fits one index size, build a reproducible probe
# vector (values in [0, 1)) from the dimension declared by the index itself.
dimension = pc.describe_index(index_name).dimension
rng = random.Random(0)
probe_vector = [rng.random() for _ in range(dimension)]

# top_k=10: at most the 10 vectors closest to the probe are returned
response = index.query(
    top_k=10,
    namespace="prova",
    include_values=True,
    vector=probe_vector,
)
print(response)
items = response['matches']

# Print every retrieved element
print("Elementi presenti nell'indice:")
for item in items:
    print(f"ID: {item['id']}, Vettore: {item['values']}")