<a href="https://colab.research.google.com/github/mshumer/ai-journalist/blob/main/Claude_Journalist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install newspaper3k
!pip install -qU google-generativeai



In [2]:
import requests
from bs4 import BeautifulSoup
import newspaper
from newspaper import Article
import ast
import google.generativeai as genai
from google.generativeai import GenerationConfig

# Model name and credentials used by every helper below. Both keys are blank
# placeholders and must be filled in before running the notebook.
GEMINI_MODEL = "gemini-1.5-pro-latest"
GEMINI_API_KEY = ""  # Replace with your Gemini API key
SERP_API_KEY = ""  # Replace with your SERP API key

genai.configure(api_key=GEMINI_API_KEY, transport='rest')

# Every listed harm category is set to BLOCK_NONE.
# NOTE(review): both HARM_CATEGORY_DANGEROUS and HARM_CATEGORY_DANGEROUS_CONTENT
# are listed -- confirm the installed google-generativeai version accepts both
# for this model and that the overlap is intentional.
safety_settings = [
    {
        "category": "HARM_CATEGORY_DANGEROUS",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_NONE",
    },
]

# Sampling settings shared by every generate_content() call in this notebook.
generation_config = GenerationConfig(candidate_count=1, stop_sequences=None, max_output_tokens=None, temperature=0.5, top_p=0.9, top_k=40)

# Shared HTTP session with up to 3 retries per request.
# Assigning requests.adapters.DEFAULT_RETRIES after import does not give the
# session retries (HTTPAdapter fixed its max_retries default when it was
# defined), so the retry count is set on an explicitly mounted adapter.
session = requests.session()
_retry_adapter = requests.adapters.HTTPAdapter(max_retries=3)
session.mount("http://", _retry_adapter)
session.mount("https://", _retry_adapter)

def get_search_terms(topic, term_count=3):
    """Ask the Gemini model for search terms to research ``topic``.

    Args:
        topic: Subject of the article, interpolated into the prompt.
        term_count: Number of search terms to request (defaults to 3, the
            value that was previously hard-coded).

    Returns:
        The Python literal parsed from the model's reply; the prompt asks
        for a list of search-term strings.

    Raises:
        SyntaxError, ValueError: If no Python literal can be parsed from the
            model's reply.
    """
    system_prompt = f"You are a world-class journalist. Generate a list of {term_count} search terms to search for to research and write an article about the topic."
    prompt = f"Please provide a list of {term_count} search terms related to '{topic}' for researching and writing an article. Respond with the search terms in a Python-parseable list, separated by commas."

    model = genai.GenerativeModel(model_name=GEMINI_MODEL, safety_settings=safety_settings, system_instruction=system_prompt)
    response = model.generate_content(prompt, generation_config=generation_config)
    response_text = response.candidates[0].content.parts[0].text
    response_data = response_text.strip().removeprefix("```python").removesuffix("```")

    try:
        search_terms = ast.literal_eval(response_data)
    except (SyntaxError, ValueError):
        # The reply may use a different fence (```json, bare ```) or wrap the
        # list in prose; fall back to the outermost [...] span if there is one.
        start = response_data.find("[")
        end = response_data.rfind("]")
        if not 0 <= start < end:
            raise
        search_terms = ast.literal_eval(response_data[start:end + 1])
    return search_terms

def get_search_results(search_term):
    """Query SerpAPI for ``search_term`` and return its organic results.

    Args:
        search_term: Raw query string. It is passed through ``params`` so
            that spaces, ``&``, ``#`` and non-ASCII characters are
            URL-encoded instead of being spliced into the URL.

    Returns:
        The ``organic_results`` value of the JSON response, or an empty list
        when the response has no such key.

    Raises:
        requests.RequestException: If the request fails twice in a row.
    """
    url = "https://serpapi.com/search.json"
    params = {"q": search_term, "api_key": SERP_API_KEY}
    headers = {'Connection': 'close'}
    # TLS certificate verification is left at the requests default (enabled):
    # this request carries the API key, so it must not go over an unverified
    # connection. A timeout keeps a stalled connection from hanging the run.
    request_kwargs = {"params": params, "headers": headers, "timeout": 30}
    try:
        response = session.get(url, **request_kwargs)
    except requests.RequestException:
        # Best-effort: retry once on a transport-level failure.
        response = session.get(url, **request_kwargs)
    data = response.json()
    return data.get('organic_results', [])

def select_relevant_urls(search_results):
    """Let the Gemini model pick the most relevant links from search results.

    Args:
        search_results: Sequence of dicts, each with a ``'link'`` entry.

    Returns:
        A list of the ``'link'`` values whose 1-based numbers the model
        returned. Numbers outside ``1..len(search_results)`` are ignored.
        An empty ``search_results`` returns ``[]`` without calling the model.

    Raises:
        SyntaxError, ValueError: If no Python literal can be parsed from the
            model's reply, or an entry cannot be converted to ``int``.
    """
    if not search_results:
        return []

    system_prompt = "You are a journalist assistant. From the given search results, select the URLs that seem most relevant and informative for writing an article on the topic."
    search_results_text = "\n".join([f"{i+1}. {result['link']}" for i, result in enumerate(search_results)])
    prompt = f"Search Results:\n{search_results_text}\n\nPlease select the numbers of the URLs that seem most relevant and informative for writing an article on the topic. Respond with the numbers in a Python-parseable list, separated by commas."

    model = genai.GenerativeModel(model_name=GEMINI_MODEL, safety_settings=safety_settings, system_instruction=system_prompt)
    response = model.generate_content(prompt, generation_config=generation_config)
    response_text = response.candidates[0].content.parts[0].text
    response_data = response_text.strip().removeprefix("```python").removesuffix("```")

    try:
        numbers = ast.literal_eval(response_data)
    except (SyntaxError, ValueError):
        # The reply may use a different fence (```json, bare ```) or wrap the
        # list in prose; fall back to the outermost [...] span if there is one.
        start = response_data.find("[")
        end = response_data.rfind("]")
        if not 0 <= start < end:
            raise
        numbers = ast.literal_eval(response_data[start:end + 1])

    # A lone number is a valid literal too; treat it as a one-element list.
    if isinstance(numbers, int):
        numbers = [numbers]

    relevant_urls = []
    for num in numbers:
        index = int(num) - 1
        # Skip out-of-range picks: 0 would otherwise become index -1 and
        # silently select the last result, and a too-large number would
        # raise IndexError.
        if 0 <= index < len(search_results):
            relevant_urls.append(search_results[index]['link'])

    return relevant_urls

def get_article_text(url):
    """Download the page at ``url`` with newspaper and return its parsed text."""
    page = Article(url)
    # parse() works on the downloaded page, so download() must run first.
    page.download()
    page.parse()
    body_text = page.text
    return body_text

def write_article(topic, article_texts):
    """Have the Gemini model draft an article on ``topic``.

    Args:
        topic: Subject of the article, interpolated into the prompt.
        article_texts: Iterable of source texts; they are joined with blank
            lines and embedded in the prompt as reference material.

    Returns:
        The text of the first part of the model's first candidate.
    """
    source_material = "\n\n".join(article_texts)
    system_prompt = "You are a journalist. Write a high-quality, NYT-worthy article on the given topic based on the provided article texts. The article should be well-structured, informative, and engaging."
    prompt = f"Topic: {topic}\n\nArticle Texts:\n{source_material}\n\nPlease write a high-quality, NYT-worthy article on the topic based on the provided article texts. The article should be well-structured, informative, and engaging. Ensure the length is at least as long as a NYT cover story -- at a minimum, 15 paragraphs."

    writer = genai.GenerativeModel(
        model_name=GEMINI_MODEL,
        safety_settings=safety_settings,
        system_instruction=system_prompt,
    )
    draft = writer.generate_content(prompt, generation_config=generation_config)
    return draft.candidates[0].content.parts[0].text

def edit_article(article):
    """Run a two-pass edit: collect suggestions, then rewrite with them.

    Args:
        article: Draft article text to improve.

    Returns:
        The rewritten article text from the second model call.
    """

    def _ask_editor(instructions, request):
        # One round trip to the model: build it with the given system
        # instruction, send the request, and strip a surrounding code fence
        # from the reply.
        editor = genai.GenerativeModel(
            model_name=GEMINI_MODEL,
            safety_settings=safety_settings,
            system_instruction=instructions,
        )
        reply = editor.generate_content(request, generation_config=generation_config)
        raw_text = reply.candidates[0].content.parts[0].text
        return raw_text.strip().removeprefix("```python").removesuffix("```")

    # Pass 1: ask for improvement suggestions.
    suggestions = _ask_editor(
        "You are an editor. Review the given article and provide suggestions for improvement. Focus on clarity, coherence, and overall quality.",
        f"Article:\n{article}\n\nPlease review the article and provide suggestions for improvement. Focus on clarity, coherence, and overall quality.",
    )

    # Pass 2: rewrite the original article using those suggestions.
    return _ask_editor(
        "You are an editor. Rewrite the given article based on the provided suggestions for improvement.",
        f"Original Article:\n{article}\n\nSuggestions for Improvement:\n{suggestions}\n\nPlease rewrite the article based on the provided suggestions for improvement.",
    )

# User input
topic = input("Enter a topic to write about: ")
do_edit = input("After the initial draft, do you want an automatic edit? This may improve performance, but is slightly unreliable. Answer 'yes' or 'no'.")

# Generate search terms
search_terms = get_search_terms(topic)
print(f"\nSearch Terms for '{topic}':")
print(", ".join(search_terms))

# Perform searches and select relevant URLs
relevant_urls = []
for term in search_terms:
    search_results = get_search_results(term)
    urls = select_relevant_urls(search_results)
    relevant_urls.extend(urls)

# The same URL can be selected for several search terms; keep the first
# occurrence only so each page is downloaded and fed to the model once.
relevant_urls = list(dict.fromkeys(relevant_urls))

print('Relevant URLs to read:', relevant_urls)


# Get article text from relevant URLs
article_texts = []
for url in relevant_urls:
    try:
        text = get_article_text(url)
    except Exception as exc:
        # Best-effort: one unreachable or unparsable page must not abort the
        # run, but report it instead of hiding the failure.
        print(f"Skipping {url}: {exc}")
        continue
    # Texts of 75 characters or fewer are not used as reference material.
    if len(text) > 75:
        article_texts.append(text)

print('Articles to reference:', article_texts)

print('\n\nWriting article...')
# Write the article
article = write_article(topic, article_texts)
print("\nGenerated Article:")
print(article)

# Accept "y", "yes", "Yes", "YES", ...; a substring test for 'y' would reject
# "Yes" and accept any answer that merely contains a "y".
if do_edit.strip().lower().startswith('y'):
    # Edit the article
    edited_article = edit_article(article)
    print("\nEdited Article:")
    print(edited_article)

Enter a topic to write about:  中国大学生
After the initial draft, do you want an automatic edit? This may improve performance, but is slightly unreliable. Answer 'yes' or 'no'. no



Search Terms for '中国大学生':
中国大学生就业形势, 中国大学生心理健康, 中国大学生创新创业




Relevant URLs to read: ['https://www.voachinese.com/a/china-s-11-6m-graduates-face-a-jobs-market-with-no-jobs-20230601/7118431.html', 'https://www.bbc.com/zhongwen/simp/world-65995537', 'https://cn.nytimes.com/china/20230809/china-youth-unemployment/', 'https://www.ncss.cn/ncss/jydt/jy/202304/20230413/2275751544.html', 'https://m.jiemian.com/article/8611743.html', 'https://dxs.moe.gov.cn/zx/a/xl_xlyr_xlyral/230418/1834625.shtml', 'https://journal.psych.ac.cn/xlkxjz/CN/10.3724/SP.J.1042.2022.00991', 'http://edu.people.com.cn/n1/2023/0327/c1006-32651704.html', 'http://www.xinhuanet.com/health/20230525/89573799dace412280322dc8a83a4faf/c.html', 'https://m.thepaper.cn/newsDetail_forward_22596360?commTag=true', 'https://www.chinanews.com/sh/2023/03-27/9979195.shtml', 'https://baike.baidu.com/item/%E5%A4%A7%E5%AD%A6%E7%94%9F%E5%BF%83%E7%90%86%E5%81%A5%E5%BA%B7/7332039', 'http://www.moe.gov.cn/srcsite/A08/s5672/202305/t20230530_1061991.html', 'https://baike.baidu.com/item/%E4%B8%AD%E5%9B%BD%E5

Building prefix dict from /opt/homebrew/anaconda3/lib/python3.11/site-packages/jieba/dict.txt ...
Dumping model to file cache /var/folders/v7/qqxlbg1n5912djxtm8vks61w0000gn/T/jieba.cache
Loading model cost 0.7083470821380615 seconds.
Prefix dict has been built succesfully.


Articles to reference: ['随着中国16至24岁青年人的失业率今年4月达到创纪录的20.4%，今年即将毕业的1158万名大学毕业生将面临一个越来越严峻的就业市场，或“一个没有职位的就业市场”。\n\n\n\n英国卫报周三报道说，接受过高等教育的失业青年的问题变得如此严峻，许多人开始将自己自讽为孔乙己，中国最伟大的作家之一的鲁迅笔下的一个小说角色。孔乙己曾是“读书人”，后来成为乞丐，经常受到当地人的嘲笑。\n\n\n\n在中国经济复苏乏力之际，高等院校的毕业生面临求职难的困境，许多人借孔乙己自嘲寒窗苦读多年，却找不到一个合适的工作，感叹“学历是下不来的高台，更是孔乙己脱不掉的长衫”。\n\n\n\n报道说，随后，中国官媒批评这种“孔乙己现象”，称孔乙己之所以陷入生活困境，不是因为读过书，而是放不下读书人的架子，不愿意靠劳动改变自己的处境，当代的青年不应该被困在“长衫”里。一个评论还批评许多青年人不愿意做低于他们预期的工作。\n\n路透社表示，所有人都在这个仍然是世界上增长最快的主要经济体之一的国家竞争就业机会，但中国以制造业为主的结构，越来越不符合年轻一代的期望。而最受中国毕业生欢迎的行业，如科技、教育、房地产和金融，近年来都面临监管部门的打击。\n\n\n\n经济学家预估，大学毕业生过剩和劳动力老龄化导致工厂劳动力短缺，加剧了中国就业市场的失衡，未来几年这类例子将变得越来越普遍。\n\n\n\n另外，就业机会与谋职者专业的不匹配也是造成青年人失业率高企的因素。据美国投行高盛数据，2018到2021年，体育和教育方面的毕业生人数增长了20%多，而制造业专业则增长很小。\n\n\n\n尽管一些监管措施已被取消，但商业信心恢复缓慢。今年1至4月民间固定资产投资仅同比增长0.4%，国有控股投资则增长9.4%。\n\n\n\n由于国内和国际需求减弱，中国今年5月份的工业活动收缩得比预期要快，为政策制定者支撑不平衡的经济复苏施加了更多的压力，更令今年毕业生的就业前景堪忧。\n\n中国国家统计局周三发布的数据显示，官方的制造业采购经理指数（PMI）5月份跌至五个月来的新低，为48.8，比4月份的49.2下降0.4%，低于50的扩张和收缩临界点，也低于49.4的微增预期，表明制造业景气水平小幅回落。\n\n\n\n服务行业的活动5月份以四个月来最慢的速度扩张，

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))