In [None]:
from src.rag.components.generator import LLamaCppGeneratorComponent

In [None]:
# Model identifier and base prompt passed to the generator component.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
prompt = "You are a french news reporter"

In [None]:
# Generator client bound to the API on the loopback address, port 8001.
llama_cpp_generator = LLamaCppGeneratorComponent(
    prompt=prompt,
    model_name=model_name,
    api_url="http://127.0.0.1:8001",
)

In [None]:
# Call the component's `_ping_api` helper before generating anything.
# NOTE(review): presumably a reachability check against `api_url` — confirm in the component.
# The leading underscore marks it as private by convention; prefer a public wrapper if one exists.
llama_cpp_generator._ping_api()

In [None]:
# Prompt template used for every cluster. `{{content}}` is the placeholder that
# matches the `content` key supplied through `template_values` when the generator runs.
summarization_prompt = """
Give a summary in French of the following documents:
{{content}}

Describe it in the style of a French newspaper reporter.

Do not summarize each document separately; summarize the content of all the documents together.

The summary must be in French, not in English.
"""

### Download the File

In [None]:
from src.shared.cloud_storage import BackBlazeCloudStorage

In [None]:
# Storage client constructed with environment="prod".
# NOTE(review): presumably this targets production data — confirm that is intended for a notebook.
cloud_storage = BackBlazeCloudStorage(environment="prod")

In [None]:
from datetime import datetime

In [None]:
# Current local date formatted as YYYY-MM-DD.
today = f"{datetime.now():%Y-%m-%d}"

In [None]:
# Show the computed date string as the cell output.
today

In [None]:
# Bucket and object name of the clusters CSV; the object name embeds the date string.
bucket_name = "congonews-clusters"
file_name = "news-clusters-{}.csv".format(today)

In [None]:
import pandas as pd

In [None]:
from pathlib import Path
from tempfile import NamedTemporaryFile, TemporaryDirectory

In [None]:
# Download the clusters CSV and load it into a DataFrame.
today_news_file = cloud_storage.download_by_name(
    bucket_name=bucket_name, file_name=file_name)
# Save into a throwaway directory instead of an open NamedTemporaryFile:
# reopening a still-open named temporary file by name is not portable
# (it fails on Windows). The directory is removed when the block exits.
with TemporaryDirectory() as temp_dir:
    local_csv_path = Path(temp_dir) / "news-clusters.csv"
    today_news_file.save_to(str(local_csv_path))
    # The first CSV column is used as the DataFrame index.
    today_news_df = pd.read_csv(local_csv_path, index_col=0)

In [None]:
def select_top_clusters(news_df: pd.DataFrame, min_documents: int = 2) -> pd.DataFrame:
    """Keep only the rows whose cluster contains at least ``min_documents`` documents.

    Args:
        news_df: Frame with a ``labels`` column holding each row's cluster label.
        min_documents: Smallest cluster size to keep. The default of 2 drops
            clusters that contain a single document.

    Returns:
        The rows of ``news_df`` that belong to sufficiently large clusters, in
        their original order. Rows whose label is missing are dropped, because
        ``value_counts`` does not count NaN.
    """
    cluster_sizes = news_df["labels"].value_counts()
    kept_labels = cluster_sizes[cluster_sizes >= min_documents].index
    return news_df.loc[news_df["labels"].isin(kept_labels)]

In [None]:
# Keep only clusters with more than one document.
# Note: this rebinds `today_news_df`, so the unfiltered frame is no longer reachable by name.
today_news_df = select_top_clusters(today_news_df)

In [None]:
# Order the rows by cluster label so each cluster's rows sit together.
today_news_df = today_news_df.sort_values("labels")

### Prompt

In [None]:
# Build one summary per cluster: join the cluster's documents into a single text
# and run the generator once on it.
summaries = []
# The group key is not used; naming it `_cluster_label` avoids rebinding the
# builtin `id` in the kernel namespace, where it would persist across cells.
for _cluster_label, news_group in today_news_df.groupby("labels"):
    # Join the cluster's `content` values, separated by newlines.
    news = news_group["content"].str.cat(sep="\n")
    summary = llama_cpp_generator.run(
        template_values={"content": news}, prompt_template=summarization_prompt)
    summaries.append(summary)
    print(summary)
    print("---" * 10)

In [None]:
# Number of distinct cluster labels in the frame.
today_news_df.labels.nunique()

In [None]:
# Display the frame as the cell output for inspection.
today_news_df