In [1]:
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents import create_csv_agent
import streamlit as st

In [61]:
import os
from dotenv import load_dotenv
# Load settings from the local 'constants.py' file into the environment.
# NOTE(review): the file is named like a Python module but is handed to
# load_dotenv, so it presumably contains KEY=value lines -- confirm.
load_dotenv('constants.py')

# os.getenv returns None when APIKEY is absent, and assigning None into
# os.environ raises an opaque "str expected, not NoneType" TypeError.
# Fail early with an actionable message instead.
if not os.getenv("APIKEY"):
    raise RuntimeError(
        "APIKEY is not set. Define APIKEY=<your OpenAI key> in 'constants.py' "
        "or export it in the environment before running this notebook."
    )

# Expose the key under the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = os.environ["APIKEY"]

In [60]:
# Read the CNBC news dataset into a DataFrame and preview its first rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer="cnbc_news_datase.csv")

# Five rows is the default preview size; spelled out here for clarity.
df.head(n=5)

Unnamed: 0,title,url,published_at,author,publisher,short_description,keywords,header_image,raw_description,description,scraped_at
0,Santoli’s Wednesday market notes: Could Septem...,https://www.cnbc.com/2021/09/29/santolis-wedne...,2021-09-29T17:09:39+0000,Michael Santoli,CNBC,"This is the daily notebook of Mike Santoli, CN...","cnbc, Premium, Articles, Investment strategy, ...",https://image.cnbcfm.com/api/v1/image/10694960...,"<div class=""group""><p><em>This is the daily no...","This is the daily notebook of Mike Santoli, CN...",2021-10-30 14:11:23.709372
1,My take on the early Brexit winners and losers,https://www.cnbc.com/2016/06/24/ian-bremmers-t...,2016-06-24T13:50:48-0400,,CNBC,This commentary originally ran on Facebook. Bo...,"Articles, Politics, Europe News, European Cent...",https://fm.cnbc.com/applications/cnbc.com/reso...,,,2021-10-30 14:11:23.820139
2,Europe&#039;s recovery depends on Renzi&#039;s...,https://www.cnbc.com/2014/03/25/europes-recove...,2014-03-25T13:29:45-0400,,CNBC,"In spring, ambitious reforms began in Italy. U...","Articles, Business News, Economy, Europe Econo...",https://fm.cnbc.com/applications/cnbc.com/reso...,,,2021-10-30 14:11:23.854710
3,US Moves Closer to Becoming A Major Shareholde...,https://www.cnbc.com/2009/04/22/us-moves-close...,2009-04-22T19:49:03+0000,Michelle Caruso-Cabrera,CNBC,The US government is increasingly likely to co...,"cnbc, Articles, General Motors Co, Business Ne...",https://image.cnbcfm.com/api/v1/image/24947979...,"<div class=""group""><p>The US government is inc...",The US government is increasingly likely to co...,2021-10-30 14:11:24.261143
4,Trump: 'Mission accomplished' on 'perfectly ex...,https://www.cnbc.com/2018/04/14/trump-mission-...,2018-04-14T14:59:04+0000,Javier E. David,CNBC,,"cnbc, Articles, George W. Bush, Vladimir Putin...",https://image.cnbcfm.com/api/v1/image/10513177...,"<div class=""group""></div>,<div class=""group""><...",President Donald Trump hailed the U.S.-led int...,2021-10-30 14:11:24.489490


# Simple text

### TextLoader

In [None]:
# Load the CSV file through TextLoader and build a vector-store index from it
# with VectorstoreIndexCreator's default settings.
# NOTE(review): TextLoader presumably treats the CSV as unstructured text, so
# column semantics (e.g. published_at) are not available to queries -- confirm.
loader = TextLoader("cnbc_news_datase.csv")
# `index` is queried by the cells that follow.
index = VectorstoreIndexCreator().from_loaders([loader])

In [None]:
# Prompt asking for a summary plus metadata of one specific article.
# Split across two literals for readability; the resulting string is unchanged.
query = (
    "Give a summary of the article 'My take on the early Brexit winners and losers'. "
    "include also additional information like publication data, publisher, author, etc."
)

In [None]:
# Answer the current `query` against the vector index using gpt-3.5-turbo and
# print the returned text. Requires `index` and `query` from the cells above.
print(index.query(query, llm= ChatOpenAI(model = "gpt-3.5-turbo")))

The article titled "My take on the early Brexit winners and losers" was written by Ian Bremmer and originally appeared on Facebook. It was later published on CNBC on June 24, 2016. Ian Bremmer is the president and founder of Eurasia Group, a professor at New York University, and the author of "Superpower: Three Choices for America's Role in the World."

In the article, Bremmer discusses the winners and losers of the early stages of the Brexit campaign. He highlights Boris Johnson, the former London mayor and "Leave" supporter, who has now become a front-runner to lead the Tory party. Bremmer also mentions Theresa May, the UK Home Secretary, as a potential successor to outgoing Prime Minister David Cameron, given her political heavyweight status and respect among the Tories.

Nigel Farage, the leader of the UK Independence Party, is also mentioned as a winner, as he gets to keep having a political career and quoted "Independence Day" in his victory speech. Bremmer suggests that the succ

In [43]:
# Rebinds `query` (previously the Brexit-summary prompt) to a question about
# the newest article in the dataset.
query = "Find the newest article"

In [44]:
# Same retrieval call as before, now with the "newest article" query.
# The recorded output shows the model replying that the provided context does
# not include the date of the newest article.
print(index.query(query, llm= ChatOpenAI(model = "gpt-3.5-turbo")))

I'm sorry, but I don't have access to real-time information. The context provided does not include the date of the newest article.


# Agents
For reference, see the LangChain agent toolkit documentation: https://python.langchain.com/docs/modules/agents/toolkits/python

### CSV agent

In [62]:
# Create a CSV agent over the CNBC dataset, driven by GPT-4 at temperature 0.
# NOTE: the recorded traces show this agent running model-generated pandas code
# through its `python_repl_ast` tool, so only point it at trusted data/prompts.
agent = create_csv_agent(
    ChatOpenAI(model="gpt-4", temperature=0),
    "cnbc_news_datase.csv",
    verbose=True,  # print the Thought/Action/Observation trace
)

In [63]:
# Ask the CSV agent for the newest article together with basic metadata.
# NOTE(review): in the recorded trace the agent sorts `published_at` as raw
# strings; the dataset preview shows mixed UTC offsets (+0000 and -0400), so
# string order may not match true chronological order -- verify the result.
agent.run("What is the newest article? Give a short summary and provide some basic information about it like the date and author.")



[1m> Entering new  chain...[0m
[32;1m[1;3mThought: To find the newest article, I need to sort the dataframe by the 'published_at' column in descending order and then select the first row. The basic information about the article such as the date, author, and short summary can be found in the 'published_at', 'author', and 'short_description' columns respectively.
Action: python_repl_ast
Action Input: df.sort_values('published_at', ascending=False).iloc[0][0m
Observation: [36;1m[1;3mtitle                Gas crisis, labor shortages and supply chain c...
url                  https://www.cnbc.com/2021/10/18/gas-crisis-lab...
published_at                                  2021-10-18T08:00:25+0000
author                                                    Chloe Taylor
publisher                                                         CNBC
short_description    The U.K. has emerged from the Covid-19 pandemi...
keywords             cnbc, Articles, Border security, Customs, Nort...
header_im

'The newest article is titled "Gas crisis, labor shortages and supply chain chaos: The UK\'s path to recovery is fraught with challenges". It was published on 2021-10-18 by Chloe Taylor. The short description of the article is "The U.K. has emerged from the Covid-19 pandemic with a series of economic challenges that are causing a headache for the government and businesses alike."'

In [59]:
# Ask the CSV agent to summarise the articles published in March 2019.
# Requires `agent` to be the CSV agent; a later cell in this notebook rebinds
# `agent` to the document-comparison agent, so run this before that cell.
# NOTE(review): the recorded trace shows the generated code reassigning
# df['published_at'] in place; whether that touches the notebook-level `df`
# depends on how create_csv_agent loads the file -- confirm.
agent.run("What happened in March 2019? Give a short summary about the articles in this month.")



[1m> Entering new  chain...[0m
[32;1m[1;3mThought: To answer this question, I need to filter the dataframe to only include articles published in March 2019. Then, I can summarize the articles based on their descriptions.
Action: python_repl_ast
Action Input: df['published_at'] = pd.to_datetime(df['published_at'])
df_2019 = df[df['published_at'].dt.year == 2019]
df_march = df_2019[df_2019['published_at'].dt.month == 3]
df_march[['title', 'description']][0m
Observation: [36;1m[1;3m                                                 title  \
33   Bryce Harper's $330 million Phillies contract ...   
134  Investors on pace to plow a record amount of m...   
181  The 15 best medical schools in the US, accordi...   
478  Everybody's missing how Apple's 'subtle' appro...   
521  Billionaire Warren Buffett: 'This $100 college...   

                                           description  
33   Major League Baseball spring training is under...  
134  Investors are barreling toward the bond

"In March 2019, five articles were published. The topics covered include Bryce Harper's $330 million contract with the Phillies, investors moving towards the bond market, and Apple's event that failed to impress investors. Additionally, there was an article about the annual ranking of the best medical schools by U.S. News & World Report and an article about Warren Buffett sharing his wisdom and experiences."

In [58]:
# Ask the CSV agent to compare 2020 article counts for two politicians.
# NOTE(review): the recorded trace counts rows whose title or description
# contains the substring 'Trump' / 'Sanders', so the numbers are a surname
# match rather than a topic classification -- treat them as approximate.
agent.run("Are there more articles in 2020 that are about Donald Trump or Bernie Sanders?")



[1m> Entering new  chain...[0m
[32;1m[1;3mThought: To answer this question, I need to filter the dataframe to only include articles from 2020, then count how many of these articles mention Donald Trump and how many mention Bernie Sanders. I can do this by checking if the 'title' or 'description' columns contain these names.
Action: python_repl_ast
Action Input: df['published_at'] = pd.to_datetime(df['published_at'])
df_2020 = df[df['published_at'].dt.year == 2020]
trump_articles = df_2020[df_2020['title'].str.contains('Trump') | df_2020['description'].str.contains('Trump')]
sanders_articles = df_2020[df_2020['title'].str.contains('Sanders') | df_2020['description'].str.contains('Sanders')]
len(trump_articles), len(sanders_articles)[0m
Observation: [36;1m[1;3m(17, 2)[0m
Thought:[32;1m[1;3mThere are 17 articles about Donald Trump and 2 articles about Bernie Sanders in 2020.
Final Answer: There are more articles about Donald Trump than Bernie Sanders in 2020.[0m

[1m> Finish

'There are more articles about Donald Trump than Bernie Sanders in 2020.'

### JSON Agent

In [64]:
from langchain.agents import create_json_agent, AgentExecutor
from langchain.agents.agent_toolkits import JsonToolkit
from langchain.chains import LLMChain
from langchain.llms.openai import OpenAI
from langchain.requests import TextRequestsWrapper
from langchain.tools.json.tool import JsonSpec

In [65]:
# Load the news feed JSON file and wrap it for the JSON agent toolkit.
import json

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode or reject the non-ASCII characters
# present in the feed text (e.g. typographic apostrophes).
with open('news_feed.json', encoding='utf-8') as f:
    data = json.load(f)

# max_value_length=4000 is passed through to JsonSpec unchanged.
json_spec = JsonSpec(dict_=data, max_value_length=4000)
json_toolkit = JsonToolkit(spec=json_spec)


In [66]:
# Create the JSON agent: GPT-4 at temperature 0, exploring the loaded feed
# through the toolkit built from `json_spec`.
json_agent_executor = create_json_agent(
    ChatOpenAI(model="gpt-4", temperature=0),
    toolkit=json_toolkit,
    verbose=True,  # print the Action/Observation trace
)

In [67]:
# Ask the JSON agent for the newest article together with basic metadata.
# NOTE(review): the recorded (truncated) trace only lists the top-level keys
# and fetches data["items"]; no explicit date comparison is visible, so
# "newest" presumably relies on the feed's item order -- confirm.
json_agent_executor.run(
    "Find the newest article. Give a short summary and provide some basic information about it like the date and author."
)



[1m> Entering new  chain...[0m
[32;1m[1;3mAction: json_spec_list_keys
Action Input: data[0m
Observation: [36;1m[1;3m['version', 'title', 'home_page_url', 'description', 'author', 'items'][0m
Thought:[32;1m[1;3mThe 'items' key might contain the articles. I should explore this key next.
Action: json_spec_list_keys
Action Input: data["items"][0m
Observation: [36;1m[1;3mValueError('Value at path `data["items"]` is not a dict, get the value directly.')[0m
Thought:[32;1m[1;3mThe 'items' key is not a dictionary, so I should use the json_spec_get_value tool to see what it contains.
Action: json_spec_get_value
Action Input: data["items"][0m
Observation: [33;1m[1;3m[{'guid': '61ba43e4-a6eb-427e-b9ef-27ffcdf3150a', 'url': 'https://www.nytimes.com/the-daily', 'title': 'A 36-Hour Rebellion in Russia', 'content_html': '<p>An armed rebellion in Russia over the weekend stunned the world and amounted to the single biggest challenge to President Vladimir V. Putin’s rule since he cam

'The newest article is titled "A 36-Hour Rebellion in Russia". It was published on 2023-06-26 by The New York Times. The summary of the article is: "An armed rebellion in Russia over the weekend stunned the world and amounted to the single biggest challenge to President Vladimir V. Putin’s rule since he came to power 23 years ago. Anton Troianovski, the Moscow bureau chief for The New York Times, talks about the man who led the revolt, Yevgeny V. Prigozhin, and about what might happen next."'

### Document comparison

In [35]:
from pydantic import BaseModel, Field

from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA

In [40]:
# Argument schema passed as `args_schema` to each per-document Tool below:
# a single string field named `question`.
class DocumentInput(BaseModel):
    # Declared with a bare Field(): no default value or description is supplied.
    question: str = Field()


# Chat model shared by every per-document RetrievalQA chain built below.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

# Reset on every run so re-executing this cell does not append duplicate tools.
tools = []

# PDFs to index. `name` becomes the tool name the agent invokes; `path` is the
# local copy of the document whose source URL is given in the comment above it.
files = [
    # https://abc.xyz/investor/static/pdf/2023Q1_alphabet_earnings_release.pdf
    {
        "name": "alphabet-earnings",
        "path": "files/2023Q1_alphabet_earnings_release.pdf",
    },
    # https://digitalassets.tesla.com/tesla-contents/image/upload/IR/TSLA-Q1-2023-Update
    {
        "name": "tesla-earnings",
        "path": "files/TSLA-Q1-2023-Update.pdf",
    },
]

# Neither the splitter nor the embedding client depends on the file being
# processed, so build each once instead of once per loop iteration.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
embeddings = OpenAIEmbeddings()

for file in files:
    # Load the PDF, split it into chunks, and index the chunks in a FAISS store
    # dedicated to this document.
    loader = PyPDFLoader(file["path"])
    pages = loader.load_and_split()
    docs = text_splitter.split_documents(pages)
    retriever = FAISS.from_documents(docs, embeddings).as_retriever()

    # Wrap the retriever in a Tool so the agent can address each document by
    # name; the RetrievalQA chain object itself is passed as the callable.
    tools.append(
        Tool(
            args_schema=DocumentInput,
            name=file["name"],
            description=f"useful when you want to answer questions about {file['name']}",
            func=RetrievalQA.from_chain_type(llm=llm, retriever=retriever),
        )
    )

In [41]:
from langchain.agents import initialize_agent
from langchain.agents import AgentType


In [42]:
# Chat model that drives the function-calling agent (same settings as the
# model used inside the per-document QA tools).
llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)

# OpenAI-functions agent that can call one retrieval tool per document.
# Note that this rebinds `agent`, which earlier cells use for the CSV agent.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)

# Cross-document question: the recorded trace shows the agent invoking both
# the alphabet-earnings and tesla-earnings tools before answering.
agent({"input": "did alphabet or tesla have more revenue?"})



[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `alphabet-earnings` with `{'question': 'revenue'}`


[0m[36;1m[1;3m{'query': 'revenue', 'result': 'The revenue for Alphabet Inc. for the quarter ended March 31, 2023, was $69,787 million.'}[0m[32;1m[1;3m
Invoking: `tesla-earnings` with `{'question': 'revenue'}`


[0m[33;1m[1;3m{'query': 'revenue', 'result': 'Total revenue for Q1-2023 was $23.3 billion.'}[0m[32;1m[1;3mAlphabet Inc. had more revenue than Tesla. Alphabet's revenue for the quarter ended March 31, 2023, was $69,787 million, while Tesla's total revenue for Q1-2023 was $23.3 billion.[0m

[1m> Finished chain.[0m


{'input': 'did alphabet or tesla have more revenue?',
 'output': "Alphabet Inc. had more revenue than Tesla. Alphabet's revenue for the quarter ended March 31, 2023, was $69,787 million, while Tesla's total revenue for Q1-2023 was $23.3 billion."}