In [1]:
import sys
from pathlib import Path

# Put the parent of the current working directory on sys.path (once) so that
# packages living next to the notebook folder can be imported.
parent_dir = str(Path().resolve().parent)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Data Processing

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.config import random_seed, PREPROCESSED_BLOG_DATASET_PATH

# Import data
blogs = pd.read_csv(PREPROCESSED_BLOG_DATASET_PATH)

# Split dataset into validation and test set.
# The feature frame and BOTH targets go through a single train_test_split call,
# so every returned piece shares the same row permutation. Splitting twice only
# stays aligned when both calls receive the same integer seed, and the first
# call's Xval/Xtest were discarded anyway.
Xval, Xtest, yval_score, ytest_score, yval_level, ytest_level = train_test_split(
    blogs.drop(columns=['engagement_level', 'normalized_engagement_score']),
    blogs["normalized_engagement_score"],   # regression target
    blogs["engagement_level"],              # classification target
    test_size=0.4, random_state=random_seed)

print(f"Size of validation set, X: {Xval.shape}, y: {yval_score.shape}")
print(f"Size of test set, X: {Xtest.shape}, y: {ytest_score.shape}")

Size of validation set, X: (30, 10), y: (30,)
Size of test set, X: (20, 10), y: (20,)


In [3]:
# Keep only the rows whose engagement_level is one of the three listed labels;
# .copy() detaches the subset so columns can be added to it later.
selected_levels = ["Good", "Very Good", "Excellent"]
has_selected_level = blogs["engagement_level"].isin(selected_levels)
valid_blogs = blogs.loc[has_selected_level].copy()
valid_blogs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 0 to 47
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           28 non-null     int64  
 1   title_blog                   28 non-null     object 
 2   url_blog                     28 non-null     object 
 3   author_blog                  28 non-null     object 
 4   author_followers             28 non-null     int64  
 5   claps                        28 non-null     int64  
 6   comments                     28 non-null     int64  
 7   title_paper                  28 non-null     object 
 8   url_paper                    28 non-null     object 
 9   engagement_score             28 non-null     float64
 10  normalized_engagement_score  28 non-null     float64
 11  engagement_level             28 non-null     object 
dtypes: float64(2), int64(4), object(6)
memory usage: 2.8+ KB


In [4]:
# Explicit names instead of a star import: these are the two helpers from
# src.text_extraction that the notebook calls.
from src.text_extraction import extract_blog_text, extract_paper_text

# Run extract_paper_text once per paper URL and keep the result next to the blog row.
valid_blogs["full_paper"] = valid_blogs["url_paper"].apply(extract_paper_text)

# Vector storage

In [5]:
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

MODEL_NAME = 'all-MiniLM-L6-v2'
VECTOR_STORE_PATH = "../data/vector_store"

# Sentence encoder used to embed every paper here and, later, every query.
model = SentenceTransformer(MODEL_NAME)

# One (embedding, metadata) pair per selected blog. The metadata carries the
# paper text plus the blog fields needed to fetch the matching blog later.
elements = []
for text, blog_url, author, claps, comments in zip(valid_blogs["full_paper"],
                                                   valid_blogs["url_blog"],
                                                   valid_blogs["author_blog"],
                                                   valid_blogs["claps"],
                                                   valid_blogs["comments"]):
    embedding = model.encode(text, clean_up_tokenization_spaces=True)
    metadata = {
        "full_text": text,
        "blog_url": blog_url,
        "author": author,
        "claps": claps,
        "comments": comments
    }
    elements.append((embedding, metadata))

# LangChain wrapper around the same model name; the store keeps it as its
# embedding function and it is needed again when the index is reloaded.
embedding_model = HuggingFaceEmbeddings(model_name=MODEL_NAME)

# Build the index from the vectors computed in the loop. FAISS.from_texts would
# embed every paper a second time and ignore those vectors; reusing them also
# means indexed papers and queries are encoded by the exact same call.
vector_store = FAISS.from_embeddings(
    text_embeddings=[(paper_meta["full_text"], paper_vector.tolist())
                     for paper_vector, paper_meta in elements],
    embedding=embedding_model,
    metadatas=[paper_meta for _, paper_meta in elements]
)

vector_store.save_local(VECTOR_STORE_PATH)

In [6]:
# Reload the index from VECTOR_STORE_PATH.
# NOTE(review): allow_dangerous_deserialization=True opts into unpickling the
# stored files; only point this at a directory this project wrote itself.
vector_store = FAISS.load_local(
    VECTOR_STORE_PATH,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

def find_most_similar_article(query_text):
    """Return the metadata of the second-ranked indexed paper for ``query_text``.

    The query is embedded with the module-level ``model`` and the two nearest
    entries are requested from the module-level ``vector_store``. The hit at
    rank 0 is skipped and the hit at rank 1 is returned.
    NOTE(review): rank 0 is presumably skipped because the query paper is
    itself in the index and matches itself first; confirm this is also wanted
    for papers that are not indexed.

    Args:
        query_text: Full text of the paper to look up.

    Returns:
        The ``metadata`` of the rank-1 hit, or ``None`` when the search
        returns fewer than two hits.
    """
    query_embedding = model.encode(query_text, clean_up_tokenization_spaces=True)
    results = vector_store.similarity_search_by_vector(query_embedding, k=2)

    # A bare truthiness check lets a single-hit list through to results[1],
    # which raises IndexError; require both hits before indexing.
    if len(results) < 2:
        return None
    return results[1].metadata

# Generator test

In [7]:
import os
import google.generativeai as genai
from dotenv import load_dotenv

# Load variables from a .env file (if any), then pass the GOOGLE_API_KEY
# environment variable to the Gemini SDK. The value is None when it is unset.
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

In [8]:
from src.prompts import prompt_rag
from src.models_setup import gemini_2_flash
from src.output_formats import BlogGeneration

# Retrieval: take the paper of the row labelled 0, look up its neighbour in the
# vector store, and extract the blog that belongs to that neighbour.
paper_text = extract_paper_text(blogs.loc[0, "url_paper"])
most_similar_article = find_most_similar_article(paper_text)
example_blog = extract_blog_text(
    url_blog=most_similar_article["blog_url"],
    author_blog=most_similar_article["author"],
    claps=most_similar_article["claps"],
    comments=most_similar_article["comments"],
)

# RAG approach
# include_raw=True makes the response a dict; the structured result is read
# from its "parsed" entry further down.
llm_generator = gemini_2_flash.with_structured_output(BlogGeneration, include_raw=True)
generation_chain = prompt_rag | llm_generator
rag_inputs = {
    "paper_text": paper_text,
    "example_paper": most_similar_article["full_text"],
    "example_blog": example_blog,
}
generator_response = generation_chain.invoke(rag_inputs)

In [9]:
# Show the text of the blog produced by the generation chain.
first_draft = generator_response["parsed"]
print(first_draft.text)

# The Landscape of Emerging AI Agent Architectures for Reasoning, Planning, and Tool Calling: A Survey
## Preface
Large language models have revolutionized the field of artificial intelligence, offering a universal model capable of handling diverse problems through large-scale language modeling tasks. This blog post outlines the basic concepts and techniques related to these models, focusing on foundational aspects rather than cutting-edge methods.

## Key Concepts
- **Pre-training:** The foundation of large language models, involving common pre-training methods and model architectures.
- **Generative Models:** The large language models we commonly use today, exploring their construction, scaling, and handling of long texts.
- **Prompting Methods:** Strategies for effective prompting, including chain-of-thought reasoning and automatic prompt design.
- **Alignment Methods:** Techniques for instruction fine-tuning and alignment based on human feedback.

## Pre-training
Pre-training invol

In [10]:
from src.helpers import get_examples
from src.output_formats import BlogClassification
from src.prompts import prompt_five_shots

# Inputs for the evaluator: the example set from get_examples() plus the blog
# text that was just generated.
examples = get_examples()
blog_text = generator_response["parsed"].text

# Structured evaluator; the parsed BlogClassification sits under "parsed".
llm_evaluator = gemini_2_flash.with_structured_output(BlogClassification, include_raw=True)
evaluation_chain = prompt_five_shots | llm_evaluator

evaluation_inputs = {**examples}
evaluation_inputs["blog_text"] = blog_text
evaluator_response = evaluation_chain.invoke(evaluation_inputs)

In [11]:
# The subscript uses single quotes: reusing the outer double quote inside an
# f-string only parses on Python 3.12+ and is a SyntaxError on older versions.
print(f"Overall assessment: {evaluator_response['parsed'].overall_assessment}")

Overall assessment: Average


In [12]:
# Print the evaluator's suggestions as a 1-based numbered list.
improvements = evaluator_response["parsed"].improvements
print("Possible improvements:")
for position, improvement in enumerate(improvements, start=1):
    print(f"{position}. {improvement}")

Possible improvements:
1. Add specific examples and case studies to illustrate the concepts discussed.
2. Provide more in-depth explanations of the techniques mentioned.
3. Incorporate visuals or diagrams to enhance understanding.
4. Make the title more engaging and specific to attract a wider audience.
5. Include a call to action to encourage discussion and feedback from readers.
6. Add a section on the limitations and challenges of current AI agent architectures.
7. Elaborate on the ethical considerations associated with AI agent development and deployment.
8. Include a section discussing future trends and potential research directions in the field.
9. Consider tailoring the content to a more specific audience (e.g., researchers, practitioners, or general enthusiasts).


In [13]:
from src.prompts import prompt_retry

# Flatten the suggestions into one newline-separated, 1-based numbered string.
numbered_suggestions = (
    f"{position}. {suggestion}"
    for position, suggestion in enumerate(improvements, start=1)
)
possible_improvements = "\n".join(numbered_suggestions)

# Reflexion
# Same structured generator as before, now driven by the retry prompt with the
# previous draft and the suggestions as inputs.
generation_chain = prompt_retry | llm_generator
retry_inputs = {
    "generated_blog": blog_text,
    "possible_improvements": possible_improvements,
}
generator_response = generation_chain.invoke(retry_inputs)

In [14]:
# Show the text of the blog returned by the retry chain.
revised_draft = generator_response["parsed"]
print(revised_draft.text)

## Navigating the AI Agent Landscape: Reasoning, Planning, and Tool Calling

### Preface
Large language models (LLMs) have revolutionized artificial intelligence, offering a versatile approach to diverse problems through language modeling. This blog post explores the core concepts and techniques behind these models, focusing on fundamental aspects relevant to researchers, practitioners, and enthusiasts alike.

### Key Concepts
- **Pre-training:** The bedrock of LLMs, covering common methods and architectures.
- **Generative Models:** Exploring the construction, scaling, and long-text handling capabilities of modern LLMs.
- **Prompting Methods:** Strategies for effective prompting, including chain-of-thought reasoning and automated prompt design.
- **Alignment Methods:** Techniques for instruction fine-tuning and human feedback alignment.

### Pre-training
Pre-training optimizes a neural network before task-specific applications, enabling broad generalization. Key aspects include:

- **

In [15]:
# Score the latest generated blog with the same evaluation chain and examples.
blog_text = generator_response["parsed"].text
evaluation_inputs = {**examples}
evaluation_inputs["blog_text"] = blog_text
evaluator_response = evaluation_chain.invoke(evaluation_inputs)

In [16]:
# The subscript uses single quotes: reusing the outer double quote inside an
# f-string only parses on Python 3.12+ and is a SyntaxError on older versions.
print(f"Overall assessment: {evaluator_response['parsed'].overall_assessment}")

Overall assessment: Good
