# News Article Summarization. 
This notebook prepares news articles to be inserted into the ChromaDB vector database. First, it creates a summary of the news article, then it identifies important Named Entities such as the names of politicians, locations, and relevant dates. Having a news summary allows us to quickly perform small-to-big retrieval; finding the full article from it's brief overview. This method helps us evaluate the Language Learning Model (LLM) more effectively. Additionally, the identified entities will be used as metadata and embedded with the news articles, aiding in fine-tuning and evaluating the LLM.

For more insight into this approach, check out this YouTube video by Jerry Liu, Founder of LlmamaIndex: https://youtu.be/TRjq7t2Ms5I.

## Config & Install Libraries
Check if Huggingface transformers and required libraries are installed

In [1]:
!pip install -q transformers sentencepiece sentence-transformers datasets spacy chromadb

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m285.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## News Summary Pipeline

In [3]:
import os
import json
from util import utils
from pymongo import MongoClient
from dotenv import load_dotenv, find_dotenv

In [4]:
load_dotenv(find_dotenv())

True

### Parameters

In [5]:
collection_name = 'raw-news'
batch_date = {'$gte': '2024-06-04', '$lte': '2024-06-05'}

In [6]:
MONGO_CONN_STRING = 'mongodb+srv://admin:oEQT00ln2FyAW8i6@us-election-gpt.thkjuq3.mongodb.net/?retryWrites=true&w=majority&appName=us-election-gpt'  # os.getenv("MONGO_CONNECTION_STRING")

In [7]:
mongo_client = MongoClient(MONGO_CONN_STRING)
db = mongo_client.get_database(os.getenv("DB_NAME"))

### Prepare Dataset

In [8]:
news_articles = json.loads(json.dumps(list(db.get_collection(collection_name).find({'created_at': batch_date})), cls=utils.CustomMongoDecoder))

In [9]:
for article in news_articles:
    article['processed_content'] = ''.join(art.strip() for art in article['raw_content'])
    article['processed_content'] = article['processed_content'].replace('\xa0', ' ')

In [10]:
def split_text(text, max_length=1000, separator='\n\n'):
    chunks = []
    current_chunk = []
    for line in text.split('.'):
        if len('.'.join(current_chunk)) + len(line) < max_length:
            current_chunk.append(line)
        else:
            chunks.append('.'.join(current_chunk))
            current_chunk = [line]
    if current_chunk:
        chunks.append('.'.join(current_chunk))
    return separator.join(chunks)

In [11]:
split_text(article['processed_content'])

'This material may not be published, broadcast, rewritten,\n      or redistributed. ©2024 FOX News Network, LLC. All rights reserved.\n      Quotes displayed in real-time or delayed by at least 15 minutes. Market data provided by. Powered and implemented by.. Mutual Fund and ETF data provided by.Sen. Tom Cotton, R-Ark., joins ‘Special Report’ to discuss the Iranian president’s death, the Biden administration\'s official condolences and the arrest warrants out for Israeli Prime Minister Netanyahu and Hamas leaders.Republican lawmakersand Rep. Mike Lawler are pressing the Biden administration to censure Iran at the next International Atomic Energy Agency (IAEA) meeting due to its heightened nuclear activities and guarantee that steps are taken to thwart Iran\'s acquisition of nuclear weapons.On Monday afternoon, the pair will introduce a resolution that would also "refer the issue to the U.N\n\n Security Council, and reaffirm that all measures will be taken to prevent the regime in Iran 

## Load Summarization Model

In [12]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load fine-tuned BART model for summarization
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


def summarize_article(text: str):
    chunks = split_text(text, max_length=1000)
    summaries = []
    for chunk in chunks:
        inputs = tokenizer([chunk], return_tensors="pt", truncation=True, max_length=1024)
        summary_ids = model.generate(inputs.input_ids, num_beams=4, length_penalty=2.0, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    return ' '.join(summaries)

In [None]:
sample_summary = summarize_article(article['processed_content'])

In [None]:
sample_summary

### Named Entity Recognition

In [None]:
REQUIRED_FIELDS = ['PERSON', 'GPE', 'NORP', 'EVENT', 'ORG']

In [None]:
import spacy
from collections import defaultdict

def perform_ner(text: str):
    
    # Load the English language model
    nlp = spacy.load("en_core_web_sm")
    
    # Process the text with spaCy
    doc = nlp(text)
    
    # Extract named entities
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    return entities

In [None]:
# postprocess the named entities to select the required entity tags
def postprocess_entities(entities):
    processed_entities = defaultdict(set)
    
    for entity, label in entities:
        if label in REQUIRED_FIELDS:
            processed_entities[label].add(entity)
    processed_entities = {key: list(value) for key, value in processed_entities.items()}
    return processed_entities

## Perform Summarization and NER on News Articles

In [None]:
from bson import ObjectId

In [None]:
for news in news_articles:
    summary = summarize_article(news['processed_content'])
    entities = postprocess_entities(perform_ner(news['processed_content']))

    news['news_summary'] = summary
    news['entities'] = entities

    # filter criteria
    filter_criteria = {'_id': ObjectId(news['_id'])}
    
    # Define the update operation
    update_data = {
        '$set': {
            'processed_content': news['processed_content'],
            'news_summary': summary,
            'entities': entities
        }
    }
    
    # Update the Mongo document
    result = db.get_collection('raw-news').update_one(filter_criteria, update_data)

## Save content and metadata on Chromadb

In [None]:
import chromadb
from chromadb.utils import embedding_functions

In [None]:
chroma_client = chromadb.HttpClient()

In [None]:
try:
    collection = chroma_client.get_collection('us-election-gpt')
except:
    collection = chroma_client.create_collection('us-election-gpt')

In [None]:
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

In [None]:
for news in news_articles:
    collection.add(
        documents=[news['processed_content']],
        embeddings=[sentence_transformer_ef(news['processed_content'])[0]],
        metadatas=[{'entities': json.dumps(news['entities']), 
                    'summary': news['news_summary'], 
                    'source': news['source'],
                    'publication_date': news['publication_date']
                   }],
        ids=[str(news['_id'])]
    )

### Test Chromadb Querying

In [None]:
TEST_QUERY = """
    What's the latest in Texas?
"""

In [None]:
query_entities = postprocess_entities(perform_ner(TEST_QUERY))
query_embeddings = sentence_transformer_ef(TEST_QUERY)

In [None]:
query_entities

In [None]:
collection.query(query_embeddings=query_embeddings, n_results=1)

In [None]:
len(collection.query(query_embeddings=query_embeddings, n_results=1)['ids'])

## LangChain

In [None]:
!pip install langchain langchain-chroma

In [None]:
import chromadb
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

In [None]:
chroma_client = chromadb.HttpClient()

In [None]:
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="us-election-gpt",
    embedding_function=embedding_function,
)

In [None]:
print("There are", langchain_chroma._collection.count(), "items in the collection")

In [None]:
query = "How do you think michigan will vote this coming election"
docs = langchain_chroma.similarity_search(query)
print(docs[0].page_content)