In [216]:
from dotenv import load_dotenv
import os
from datetime import datetime

load_dotenv()  # Copy the keys from the local .env file into os.environ

# load_dotenv() has already placed every key from .env into os.environ, so
# re-assigning os.environ[name] = os.getenv(name) adds nothing -- and it raises
# "TypeError: str expected, not NoneType" as soon as one variable is absent,
# which hides the real problem. Report what is unset instead of crashing.
EXPECTED_ENV_VARS = (
    "LANGCHAIN_TRACING_V2",
    "LANGCHAIN_ENDPOINT",
    "LANGCHAIN_API_KEY",
    "LANGCHAIN_PROJECT",
    "ANTHROPIC_API_KEY",
    # "OPENAI_API_KEY",  # OpenAI API key (currently not used)
    "NOTION_API_KEY",
    "LIBRARY_DATABASE_ID",
    "NEON_DATABASE_URL",
)

unset_env_vars = [name for name in EXPECTED_ENV_VARS if os.getenv(name) is None]
if unset_env_vars:
    print(f"Warning: environment variables not set: {', '.join(unset_env_vars)}")

In [217]:
from sqlalchemy import create_engine, text
import pandas as pd

# Shared SQLAlchemy engine, built from the NEON_DATABASE_URL environment variable.
# os.environ[...] raises KeyError here if that variable is not set.
engine = create_engine(os.environ["NEON_DATABASE_URL"])

def run_neon_query(query):
    """Run a SQL statement on the module-level ``engine`` and return the rows.

    Args:
        query: SQL text to execute; it is wrapped in ``sqlalchemy.text``.

    Returns:
        pandas.DataFrame with one column per result key and one row per
        fetched record.
    """
    with engine.connect() as conn:
        result = conn.execute(text(query))
        # Drain the cursor while the connection is still checked out: once the
        # ``with`` block exits the connection is released and the result is no
        # longer guaranteed to be readable.
        rows = result.fetchall()
        columns = list(result.keys())
    return pd.DataFrame(rows, columns=columns)

In [218]:
# Settings that must be present for the notebook to run.
required_vars = [
    'LANGCHAIN_API_KEY',
    'ANTHROPIC_API_KEY',
    'LIBRARY_DATABASE_ID',
    'NEON_DATABASE_URL',
]

# A variable counts as missing when it is unset or set to an empty string.
missing_vars = [name for name in required_vars if os.getenv(name) in (None, "")]
if len(missing_vars) > 0:
    raise EnvironmentError(f"Missing required environment variables: {', '.join(missing_vars)}")

In [None]:
def get_current_date():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    return datetime.now().strftime('%Y-%m-%d')

# Show today's date as formatted by get_current_date().
print(get_current_date())

In [None]:
# Importing the LLM providers
from langchain_openai import ChatOpenAI # OpenAI
from langchain_google_genai import ChatGoogleGenerativeAI # Google
from langchain_anthropic import ChatAnthropic # Anthropic

# Importing the prompt template and chains
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Importing gradio for the UI
import gradio as gr

# Importing LangSmith for tracing
from langsmith import Client
from langchain.callbacks.tracers import LangChainTracer

# LangSmith API client, created without explicit arguments
langsmith_client = Client()

# Tracer that records runs under the project named by LANGCHAIN_PROJECT
tracer = LangChainTracer(project_name=os.environ.get('LANGCHAIN_PROJECT'))

In [205]:
import json
from pathlib import Path

def load_model_costs(config_path="../config/model_costs.json"):
    """Load the model cost table from a JSON file.

    Args:
        config_path: Path to the JSON file. A relative path is resolved
            against the current working directory.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If no file exists at ``config_path``.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding.
        with open(Path(config_path), encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError as err:
        # Chain explicitly so the original OS-level error stays attached.
        raise FileNotFoundError(f"Model costs file not found at {config_path}") from err

# Load the cost table once, from the default config path, into a module-level constant.
MODEL_COSTS = load_model_costs()


In [220]:
# Model selection
model_name = "claude-3-5-sonnet-20241022"
# model_name = "gpt-4o-mini"
streaming = True  # Streaming: the LLM returns a token at a time instead of the entire response at once

# Run metadata attached to the model, including the prices looked up in MODEL_COSTS
llm_metadata = {
    "ls_provider": "anthropic",
    "ls_model_name": model_name,
    "model_name": model_name,
    "model_cost_per_1k_input_tokens": MODEL_COSTS[model_name]["input"],    # price per 1K input tokens
    "model_cost_per_1k_output_tokens": MODEL_COSTS[model_name]["output"],  # price per 1K output tokens
}

# Build the Anthropic chat model
llm = ChatAnthropic(
    model=model_name,
    max_tokens=4096,
    temperature=0.3,
    streaming=streaming,
    tags=["newsletter"],
    metadata=llm_metadata,
)

# llm.invoke("Hello, world!").content



In [214]:
from pathlib import Path

def load_sql_query(filename):
    """Return the text of ``filename`` read from the ``../queries`` directory."""
    return (Path("../queries") / filename).read_text()

In [None]:
# Read the stored SQL file and run it against the database
query = load_sql_query("web_pages.sql")
df = run_neon_query(query)

row_count = df.shape[0]
print("Number of rows:", row_count)
df.head()

In [155]:
def create_content_from_df(df):
    """Convert dataframe rows into a formatted context string.

    Each row becomes one article block, numbered 1..N in row order.

    Args:
        df: DataFrame with the columns ``title``, ``url``, ``summary``,
            ``description``, ``created_at`` and ``media_type``.
            ``created_at`` values must offer ``strftime`` unless they are null.

    Returns:
        A ``(all_content, all_content_list)`` tuple: the article blocks joined
        and wrapped in START/END CONTEXT markers, and the list of the
        individual article blocks.
    """
    all_content_list = []

    # Number by position, not by index label: after filtering or sorting the
    # labels are no longer 0..N-1 (and need not even be integers).
    for number, (_, row) in enumerate(df.iterrows(), start=1):
        created = row['created_at']
        # A null timestamp (None/NaT) has no usable strftime; show a placeholder.
        created_text = 'N/A' if pd.isna(created) else created.strftime('%Y-%m-%d')

        # Same layout for every article: leading blank line, START marker,
        # one "Field: value" line per column, END marker.
        all_content_list.append(
            f"\n<START Article Number: {number}>\n"
            f"Title: {row['title']}\n"
            f"URL: {row['url']}\n"
            f"Summary: {row['summary']}\n"
            f"Description: {row['description']}\n"
            f"Created: {created_text}\n"
            f"Type: {row['media_type']}\n"
            f"<END Article Number: {number}>\n"
        )

    # Join once instead of growing the string inside the loop.
    all_content = (
        '<START CONTEXT>\n'
        + ''.join(all_content_list)
        + '\n<END CONTEXT>\n--------------------\n'
    )

    return all_content, all_content_list

In [None]:
# Build the prompt context and preview what was produced
all_content, all_content_list = create_content_from_df(df)

print(len(all_content_list))
# Guard the preview: with zero rows, indexing [0] would raise IndexError.
if all_content_list:
    print(all_content_list[0])


In [None]:
# One-shot example of the newsletter layout.
# The {date} placeholder in the subject line is filled with today's date
# right here, via .format(), when this statement runs.
NEWSLETTER_EXAMPLE = """
Subject: AI & Tech Weekly Summary {date}

Welcome to this week's AI & Tech digest! Here's what's making waves:

Featured Story #1: The Evolution of Large Language Models
Last week's breakthrough in parameter-efficient training has opened new possibilities for smaller companies.
Key highlights:
• 40% reduction in training costs
• Improved performance on specialized tasks
• New benchmarks for model efficiency

Featured Story #2: The Evolution of Large Language Models
Last week's breakthrough in parameter-efficient training has opened new possibilities for smaller companies.
Key highlights:
• 40% reduction in training costs
• Improved performance on specialized tasks
• New benchmarks for model efficiency

Featured Story #3: The Evolution of Large Language Models
Last week's breakthrough in parameter-efficient training has opened new possibilities for smaller companies.
Key highlights:
• 40% reduction in training costs
• Improved performance on specialized tasks
• New benchmarks for model efficiency

Industry Updates:
• Google announced their latest quantum computing milestone
• OpenAI released updates to their fine-tuning API
• Meta's PyTorch 2.0 shows promising performance gains

Key Takeaways:
• The future of AI is in smaller, more efficient models
• Quantum computing is making significant strides
• Fine-tuning APIs are becoming more powerful

Must-Read Resources:
• New paper on efficient training methods [link]
• Updated documentation for PyTorch 2.0 [link]
• Comprehensive guide to quantum computing basics [link]

Join us next week for more updates!
-------------------
""".format(date=get_current_date())

# Wrap the example in OUTPUT EXAMPLE tags so it is clearly delimited inside the prompt
newsletter_example_formatted = (
    "<OUTPUT EXAMPLE>\n"
    f"{NEWSLETTER_EXAMPLE}\n"
    "</OUTPUT EXAMPLE>\n"
)

# Prompt = delimited example + article context + instructions. {context} and
# {today_date} are the declared input variables.
# NOTE(review): the example text is spliced into the template string itself, so
# any literal braces in it would presumably be read as template variables --
# confirm against the PromptTemplate format in use.
newsletter_prompt = PromptTemplate(
    input_variables=["context", "today_date"],
    template= newsletter_example_formatted + """{context}

Generate today's newsletter that follows the output example format while incorporating the key points from the provided context. Make sure to have at least three bullet points in each section. Add relevant sections as needed, but maintain the professional and engaging tone.
Make sure to use today's date, {today_date}, in the subject line.
"""
)

# Pipe the prompt into the model; attach the tracer callback and a run tag
chain_config = {
    "callbacks": [tracer],
    "tags": ["newsletter_generation"],
}
chain = (newsletter_prompt | llm).with_config(chain_config)

# Try the chain on the current article context
chain_inputs = {"context": all_content, "today_date": get_current_date()}
newsletter = chain.invoke(chain_inputs)
print(newsletter.content)

In [None]:
# Instruction text describing the newsletter writer's role, tone and section structure.
# NOTE(review): the "[target audience]" placeholder below is left in the text
# as-is; nothing visible here substitutes it.
system_message = """
<SYSTEM MESSAGE>
You are an expert newsletter creator. Your task is to generate a well-organized, engaging, and informative newsletter based on the articles and structure provided. The newsletter should follow the example format and maintain a consistent tone suitable for a [target audience] (e.g., tech enthusiasts, data scientists, etc.).

- Keep the language professional and insightful.
- Summarize articles clearly, highlighting key takeaways.
- Include engaging headings and subheadings.
- Ensure the content flows logically and is easy to read.

Here's the structure to follow for each newsletter:

1. **Introduction**: A brief overview of the newsletter's theme or main focus for the week. Provide 3 bullet points with key takeaways.
2. **Main Section 1**: Headline for the first major topic, followed by a summary and analysis. Provide 3 bullet points with key takeaways.
3. **Main Section 2**: Headline for the second major topic, followed by a summary and analysis. Provide 3 bullet points with key takeaways.
4. **Additional Highlights**: Brief summaries of other important articles. Provide 3 bullet points with key takeaways.
5. **Closing**: A call-to-action, final thought, or reminder to stay tuned for more content. Provide 3 bullet points with key takeaways.

The articles and data you need for this week's edition are provided in the user prompt in the context.
</SYSTEM MESSAGE>
"""
print(system_message)

In [None]:
# Prompt variant that uses system_message as its prefix. This rebinds
# newsletter_prompt, which an earlier cell also assigns.
# NOTE(review): system_message is concatenated into the single template string
# here rather than passed as a separate system-role message -- confirm that is
# intended.
newsletter_prompt = PromptTemplate(
    input_variables=["context", "today_date"],
    template= system_message + """{context}

Please generate the newsletter using the structure and style described in the system message. Ensure the language is engaging, and provide a concise summary of each article.
Make sure to use today's date, {today_date}.
"""
)

# Pipe the prompt into the model; attach the tracer callback and a run tag
chain_config = {
    "callbacks": [tracer],
    "tags": ["newsletter_generation"],
}
chain = (newsletter_prompt | llm).with_config(chain_config)

# Try the chain on the current article context
chain_inputs = {"context": all_content, "today_date": get_current_date()}
newsletter = chain.invoke(chain_inputs)
print(newsletter.content)