In [14]:
import requests
import json
from langchain_openai import ChatOpenAI
import os
from pydantic import BaseModel
from typing import List, Dict, Any
from langchain_core.prompts import ChatPromptTemplate
import os
import pathlib
from bs4 import BeautifulSoup
import re
from bs4 import BeautifulSoup
import re
from langchain_openai import ChatOpenAI
from pydantic import Field
from typing import Optional
from typing import Annotated
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain.schema import HumanMessage, AIMessage
from __future__ import print_function
import sib_api_v3_sdk
from sib_api_v3_sdk.rest import ApiException
from IPython.display import Markdown, display

In [3]:
with open(".env", "r") as f:
  for line in f:
    key, value = line.strip().split("=")
    os.environ[key] = value


In [4]:
search_terms = ["Software Engineer New Grad", "Entry level Software Engineer", "University Graduate Software Engineer", "Junior Software Developer", "AI automation Engineer"]

In [5]:
class ResultRelevance(BaseModel):
  explanation: str
  id: str

class RelevanceCheckOutput(BaseModel):
  relevant_results: List[ResultRelevance]

def search_serper(search_query):
  url = "https://google.serper.dev/search"
    
  payload = json.dumps({
    "q": search_query,
    "gl": "us", 
    "num": 30,
    "tbs": "qdr:d"
  })

  headers = {
    'X-API-KEY': '0130e7da8283ec8a8d9721c6c38a97d557867923',
    'Content-Type': 'application/json'
  }

  response = requests.request("POST", url, headers=headers, data=payload)
  results = json.loads(response.text)
  results_list = results['organic']

  all_results = []
  for id, result in enumerate(results_list, 1):
    result_dict = {
      'title': result['title'],
      'link': result['link'],
      'snippet': result['snippet'],
      'search_term': search_query,
      'id': id
    }
    all_results.append(result_dict)
  return all_results

def load_prompt(prompt_name):
  with open(f"prompts/{prompt_name}.md", "r") as file:
    return file.read()
  
def check_search_relevance(search_results: Dict[str, Any]) -> RelevanceCheckOutput:
  prompt = load_prompt("relevance_check")
  prompt_template = ChatPromptTemplate.from_messages(
    ("system_prompt", prompt)
  )

  llm = ChatOpenAI(model="gpt-4o").with_structured_output(RelevanceCheckOutput)
  llm_chain = prompt_template | llm
  return llm_chain.invoke({'input_search_results': search_results})

In [6]:
relevant_results = []
for search_term in search_terms:
  python_results = search_serper(search_term)
  results = check_search_relevance(python_results)
  
  # Get the relevant result IDs from the LLM output
  relevant_ids = [r.id for r in results.relevant_results]
  
  # Filter original results to only include those with matching IDs
  filtered_results = [r for r in python_results if str(r['id']) in relevant_ids]
  
  relevant_results.extend(filtered_results)
relevant_results

[{'title': 'Software Engineer jobs at Y Combinator startups',
  'link': 'https://www.ycombinator.com/jobs/role',
  'snippet': 'Many YC startups are seeing breakout growth, and are actively hiring for software engineers. Find some of the top YC companies at Y Combinator.',
  'search_term': 'Software Engineer New Grad',
  'id': 5},
 {'title': 'Search Jobs - Software and Services: Machine Learning and AI',
  'link': 'https://jobs.apple.com/en-us/search?team=machine-learning-and-ai-SFTWR-MCHLN',
  'snippet': 'Explore all Software and Services: Machine Learning and AI jobs at Apple. Create a profile and apply today.',
  'search_term': 'Software Engineer New Grad',
  'id': 6},
 {'title': 'Technology jobs at DISNEY',
  'link': 'https://jobs.disneycareers.com/category/technology-jobs/17189/26715/1',
  'snippet': 'Jobs JOBS FOUND · Lead Software Engineer - Frontend · Lead Software Engineer - Secret Management · Lead Software Engineer - Ad Platforms · Product Manager.',
  'search_term': 'Softwar

In [7]:
def convert_html_to_markdown(html_content):
    # Create BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Convert common HTML elements to markdown
    
    # Headers
    for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        level = int(h.name[1])
        h.replace_with('#' * level + ' ' + h.get_text() + '\n\n')
    
    # Links
    for a in soup.find_all('a'):
        href = a.get('href', '')
        text = a.get_text()
        if href and text:
            a.replace_with(f'[{text}]({href})')
    
    # Bold
    for b in soup.find_all(['b', 'strong']):
        b.replace_with(f'**{b.get_text()}**')
    
    # Italic
    for i in soup.find_all(['i', 'em']):
        i.replace_with(f'*{i.get_text()}*')
    
    # Lists
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.replace_with(f'- {li.get_text()}\n')
    
    for ol in soup.find_all('ol'):
        for i, li in enumerate(ol.find_all('li'), 1):
            li.replace_with(f'{i}. {li.get_text()}\n')
    
    # Get text and clean up
    text = soup.get_text()
    
    # Remove excess whitespace/newlines
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = text.strip()
    
    return text

def scrape_and_save_markdown(relevant_results):
    """
    Scrapes HTML content from URLs in relevant_results and saves as markdown files.
    
    Args:
        relevant_results: List of dictionaries containing search results with URLs
        
    Returns:
        List of dictionaries containing markdown content and metadata
    """
    # Create scraped_html directory if it doesn't exist
    pathlib.Path("scraped_markdown").mkdir(exist_ok=True)

    markdown_contents = []
    for result in relevant_results:
        if 'link' in result:
            payload = {
                "api_key": "zZwEqATYReQC4ogVNYnaLUYdm47ITPMNiFbRtvw1o7IpQbTubwZUvnkCycrWx2wCWt9wuA4aUIizsSvtQH", 
                "url": result['link'],
                "render_js": "true"
            }

            response = requests.get("https://scraping.narf.ai/api/v1/", params=payload)
            if response.status_code == 200:
                # Create filename from ID or URL if ID not available
                filename = f"{result.get('id', hash(result['link']))}.md"
                filepath = os.path.join("scraped_markdown", filename)
                
                # Convert HTML to markdown
                markdown_content = convert_html_to_markdown(response.content.decode())
                
                # Save markdown content to file
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)
                
                markdown_contents.append({
                    'url': result['link'],
                    'filepath': filepath,
                    'markdown': markdown_content,
                    'title': result.get('title', ''),
                    'id': result.get('id', '')
                })
            else:
                print(f"Failed to fetch {result['link']}: Status code {response.status_code}")

    print(f"Successfully downloaded and saved {len(markdown_contents)} pages as markdown to scraped_markdown/")
    return markdown_contents

markdown_contents = scrape_and_save_markdown(relevant_results)

Failed to fetch https://www.linkedin.com/jobs/view/forward-deployed-engineer-entry-level-at-jobright-ai-4267676167: Status code 500
Failed to fetch https://www.linkedin.com/jobs/view/software-engineer-new-grad-at-jobright-ai-4268113930: Status code 500
Failed to fetch https://www.linkedin.com/jobs/view/entry-level-software-engineer-data-at-jobright-ai-4267699722: Status code 500
Successfully downloaded and saved 40 pages as markdown to scraped_markdown/


In [8]:
markdown_contents

[{'url': 'https://www.ycombinator.com/jobs/role',
  'filepath': 'scraped_markdown/5.md',
  'markdown': "Software Engineer jobs at Y Combinator startups | Y Combinator\n\n[About](/about)[What Happens at YC?](/about)[Apply](/apply)[YC Interview Guide](/interviews)[FAQ](/faq)[People](/people)[YC Blog](/blog)[Companies](/companies)[Startup Directory](/companies)[Founder Directory](/companies/founders)[Launch YC](/launches)[Startup Jobs](/jobs)[All Jobs](/jobs)[◦ Engineering](/jobs/role/software-engineer)[◦ Operations](/jobs/role/operations)[◦ Marketing](/jobs/role/marketing)[◦ Sales](/jobs/role/sales)[Startup Job Guide](/startup-job-guide)[YC Startup Jobs Blog](/blog/jobs)[Find a Co-Founder](/cofounder-matching)[Library](/library)[SAFE](/documents)[Resources](/library)[Startup School](https://startupschool.org?utm_source=yc&utm_campaign=ycdc_header)[Newsletter](/subscribe)[Requests for Startups](/rfs)[For Investors](/investors)[Hacker News](https://news.ycombinator.com/)[Bookface](https://

In [None]:
class Details(BaseModel):
  company_name: str = Field(description="This indicates the name of the company in which the role is in.", examples = ["Meta", "Amazon"])
  role_name: str = Field(description="This indicates the name of the role in the company.", examples = ["Software Developer", "AI Engineer"])
  description: str = Field(description="A short description on what the work will involve.")
  location: Optional[str] = Field(description="This is the location or locations in which the job is available.")
  skills: Optional[str] = Field(description="Note down the skills the role is demanding or is mentioned as minimum requirements.", examples = ["Node, React", "LangChain, LangGraph"])
  experience: Optional[str] = Field(description="How many years of experience is the job looking for")
  sponsors: Optional[bool] = Field(description = "Whether it sponsors or not. True if it does False if it doesn't.")
  


def generate_summaries(markdown_contents):
  pathlib.Path("markdown_summaries").mkdir(exist_ok=True)

  summary_prompt = load_prompt("summaries_markdown_page")

  summary_template = ChatPromptTemplate.from_messages([
    ("system", summary_prompt)
  ])

  llm = ChatOpenAI(model="gpt-4o").with_structured_output(Details)
  summary_chain = summary_template | llm

  summaries = []
  for content in markdown_contents:
    try:
      summary = summary_chain.invoke({
        'markdown_input': ' '.join(content['markdown'].split()[:2000])
      })

      # Create filename for summary
      summary_filename = f"summary_{content['id']}.md"
      summary_filepath = os.path.join("markdown_summaries", summary_filename)
      
      # Save summary to file
      with open(summary_filepath, 'w', encoding='utf-8') as f:
        f.write(summary)
      
      # Add to summaries list
      summaries.append({
        'markdown_summary': summary,
        'url': content['url']
      })

    except Exception as e:
      print(f"Failed to summarize {content['filepath']}:{str(e)}")
  
  print(f"Successfully generated summaries for {len(markdown_contents)} pages in markdown_summaries/")
  return summaries


summaries = generate_summaries(markdown_contents)


Failed to summarize scraped_markdown/5.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/6.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/9.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/13.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/14.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/16.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/20.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/21.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/24.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/26.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/2.md:'Details' object has no attribute 'content'
Failed to summarize scraped_markdown/4.md:'Deta

In [10]:
summaries

[{'markdown_summary': '# Job Summary: Software Engineer at Y Combinator Startups\n\n**Location:** Multiple locations including San Francisco, New York, Los Angeles, Seattle, Boston, Austin, India, and Remote\n\n**Roles:**\n1. **Software Engineer Roles:**\n   - Backend Engineer\n   - Full Stack Engineer\n   - Frontend Engineer\n   - Data Engineer\n   - Embedded Software Engineer\n   - AI/ML Engineer\n\n2. **Experience Levels:**\n   - Senior Software Engineer\n   - Founding Engineer\n   - Product Engineer\n   - Recent Grad Software Engineer\n   - New Grad Software Engineer\n\n**Job Description Overview:**\n\nY Combinator startups are experiencing significant growth and are seeking software engineers for various roles. These positions cover multiple engineering specializations and levels, from recent graduates to senior positions. The opportunities are spread across a wide range of locations, with options for remote work as well.\n\n**Key Requirements:**\n- Position types vary from full-t

In [12]:
class State(TypedDict):
  messages: Annotated[list, add_messages]
  summaries: List[dict]
  approved: bool
  created_summaries: Annotated[List[dict], Field(description="The summaries that have been created by the summariser")]

graph_builder = StateGraph(State)

In [13]:
llm = ChatOpenAI(model="gpt-4o")

In [None]:
with open("email_template.md", "r") as f:
  email_template = f.read()

class SummariserOutput(BaseModel):
  email_summary: str = Field(description="The summary email of the content")
  message: str = Field(description="A message to the reviewer, asking for feedback on the summary")

summariser_prompt = ChatPromptTemplate.from_messages([
  ("system", load_prompt("summariser")),
])

