# Intro 

<img src="./slides/Slide1.png" alt="Image description" width="1000">

# Background

<img src="./slides/Slide11.png" alt="Image description" width="1000">

<img src="./slides/Slide12.png" alt="Image description" width="1000">

# Demo - Classical NLP Tasks

## Masked word prediction with BERT

In [1]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load the pretrained BERT tokenizer together with its masked-LM head.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Encode the sentence and look up the column index of the mask token.
sentence = "The Toronto Blue Jays are the [MASK] team in baseball."
input_tensor = tokenizer.encode(sentence, return_tensors="pt")
mask_positions = torch.where(input_tensor == tokenizer.mask_token_id)
mask_token_index = mask_positions[1]

# Score every vocabulary entry, then keep only the row(s) at the mask position.
token_logits = model(input_tensor).logits
mask_token_logits = token_logits[0, mask_token_index, :]

# The five highest-scoring vocabulary ids for the first mask position.
top_5_tokens = mask_token_logits.topk(5, dim=1).indices[0].tolist()

# Print the candidates on one line, separated by spaces.
for token in top_5_tokens:
    print(tokenizer.decode([token]), end=" ")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


oldest canadian professional national dominant 

## Masked word prediction with RoBERTa

In [2]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch

# Load the pretrained RoBERTa tokenizer together with its masked-LM head.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForMaskedLM.from_pretrained("roberta-base")

# Encode the sentence and look up the column index of the mask token.
# Note the mask placeholder here is "<mask>", not "[MASK]" as in the BERT cell.
sentence = "The Toronto Blue Jays are the <mask> team in baseball."
input_tensor = tokenizer.encode(sentence, return_tensors="pt")
mask_positions = torch.where(input_tensor == tokenizer.mask_token_id)
mask_token_index = mask_positions[1]

# Score every vocabulary entry, then keep only the row(s) at the mask position.
token_logits = model(input_tensor).logits
mask_token_logits = token_logits[0, mask_token_index, :]

# The five highest-scoring vocabulary ids for the first mask position.
top_5_tokens = mask_token_logits.topk(5, dim=1).indices[0].tolist()

# Print the candidates on one line, separated by spaces.
for token in top_5_tokens:
    print(tokenizer.decode([token]), end=" ")

 best  hottest  worst  greatest  top 

# Inflection Point - ChatGPT

<img src="./slides/Slide20.png" alt="Image description" width="1000">

<img src="./slides/Slide21.png" alt="Image description" width="1000">

# Competitive Landscape for LLMs

<img src="./slides/Slide27.png" alt="Image description" width="1000">

<img src="./slides/Slide28.png" alt="Image description" width="1000">

# LLM Concepts and Abstractions

<img src="./slides/Slide30.png" alt="Image description" width="1000">

## Zero Shot Sentiment Prediction with T5

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# loaded once rather than once per tweet.
_T5_CACHE = {}


def _load_t5(checkpoint="t5-base"):
    """Return the (tokenizer, model) pair for `checkpoint`, loading it on first use."""
    if checkpoint not in _T5_CACHE:
        _T5_CACHE[checkpoint] = (
            T5Tokenizer.from_pretrained(checkpoint),
            T5ForConditionalGeneration.from_pretrained(checkpoint),
        )
    return _T5_CACHE[checkpoint]


def analyze_tweet_sentiment(tweet):
    """Ask the `t5-base` checkpoint to complete a "Tweet: ... Sentiment: " prompt.

    Args:
        tweet: The tweet text to embed in the prompt.

    Returns:
        The decoded generation with special tokens stripped. Generation is
        capped at `max_length=3`, so the returned text is very short.
    """
    # Reuse the already-loaded tokenizer and model instead of reloading per call
    tokenizer, model = _load_t5()

    # Task prefix
    prompt = f"Tweet: {tweet} Sentiment: "

    # Encode the prompt and convert to Tensor
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Generate sentiment
    sentiment_ids = model.generate(input_ids, max_length=3, num_return_sequences=1)
    sentiment = tokenizer.decode(sentiment_ids[0], skip_special_tokens=True)

    return sentiment


# Sample tweets used as input for the demo
tweets = [
    "Just had the best day ever with my friends!",
    "I'm so disappointed with the service at the restaurant.",
    "Looks like it's going to rain all week. Oh well, more time for coding!",
    "Can't believe I got the job! Dreams do come true!",
    "Not feeling well today, think I caught a cold.",
]

# Classify the tweets one at a time (map is lazy, so each result is printed
# before the next tweet is analyzed).
for tweet, sentiment in zip(tweets, map(analyze_tweet_sentiment, tweets)):
    print(f"Tweet: {tweet}\nSentiment: {sentiment}\n")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tweet: Just had the best day ever with my friends!
Sentiment: Just



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tweet: I'm so disappointed with the service at the restaurant.
Sentiment: Tweet

Tweet: Looks like it's going to rain all week. Oh well, more time for coding!
Sentiment: Fals



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tweet: Can't believe I got the job! Dreams do come true!
Sentiment: True

Tweet: Not feeling well today, think I caught a cold.
Sentiment: Fals



## Sentiment Prediction w/ HuggingFace Pipelines

In [4]:
from transformers import pipeline

# Load the sentiment analysis pipeline with an explicitly pinned checkpoint.
# Building it without a model makes the library pick a default and warn that
# doing so is not recommended; the name and revision below are the ones that
# default resolved to, so results stay the same but are now reproducible.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

tweets = [
    "Just had the best day ever with my friends!",
    "I'm so disappointed with the service at the restaurant.",
    "Looks like it's going to rain all week. Oh well, more time for coding!",
    "Can't believe I got the job! Dreams do come true!",
    "Not feeling well today, think I caught a cold.",
]

# Analyze sentiment: the pipeline returns a list of results per input, and the
# first entry carries the predicted label and its score.
for text in tweets:
    result = sentiment_pipeline(text)
    print(
        f"Text: {text}\nSentiment: {result[0]['label']}, Confidence: {result[0]['score']}\n"
    )

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Text: Just had the best day ever with my friends!
Sentiment: POSITIVE, Confidence: 0.9998741149902344

Text: I'm so disappointed with the service at the restaurant.
Sentiment: NEGATIVE, Confidence: 0.999789297580719

Text: Looks like it's going to rain all week. Oh well, more time for coding!
Sentiment: NEGATIVE, Confidence: 0.9966244697570801

Text: Can't believe I got the job! Dreams do come true!
Sentiment: POSITIVE, Confidence: 0.9995515942573547

Text: Not feeling well today, think I caught a cold.
Sentiment: NEGATIVE, Confidence: 0.999713122844696



## Zero Shot Sentiment Analysis w/ Llama3

In [5]:
from langchain_community.llms import Ollama

# Handle to the "llama3" model served through Ollama.
llm = Ollama(model="llama3")

tweets = [
    "Just had the best day ever with my friends!",
    "I'm so disappointed with the service at the restaurant.",
    "Looks like it's going to rain all week. Oh well, more time for coding!",
    "Can't believe I got the job! Dreams do come true!",
    "Not feeling well today, think I caught a cold.",
]

# Zero-shot: the prompt carries only the instruction and the tweet, no examples.
for tweet in tweets:
    print(f"Tweet: {tweet}")
    zero_shot_prompt = f"Analyze the sentiment of the tweet: {tweet}\nRespond precisely with one of ['positive', 'negative', 'neutral']"
    response = llm.invoke(zero_shot_prompt)
    print(f"Sentiment: {response}\n")
    print("\n***\n")

Tweet: Just had the best day ever with my friends!
Sentiment: Positive


***

Tweet: I'm so disappointed with the service at the restaurant.
Sentiment: The sentiment of this tweet is: **Negative**


***

Tweet: Looks like it's going to rain all week. Oh well, more time for coding!
Sentiment: The sentiment of this tweet is: **positive**.

Although the tweet mentions that it might rain all week, which could be a negative topic, the overall tone is optimistic and focused on the positive aspect (having more time for coding). The phrase "Oh well" is also a lighthearted way to accept the situation, rather than complaining about it.


***

Tweet: Can't believe I got the job! Dreams do come true!
Sentiment: The sentiment of this tweet is: **Positive**


***

Tweet: Not feeling well today, think I caught a cold.
Sentiment: The sentiment of this tweet is: **Negative**


***



## Few Shot Sentiment Analysis w/ Llama3

In [6]:
# Numbered few-shot examples (tweet plus its sentiment label).
# NOTE(review): `examples` is not referenced by `prompt_template` or by the loop
# in this cell; the template carries its own un-numbered copy of the same five
# examples. Confirm whether this variable is still needed.
examples = """1. Tweet: "Wow, just watched an incredible sunset from my balcony!"
   Sentiment: positive

2. Tweet: "Frustrated with how slow my internet has been today."
   Sentiment: negative

3. Tweet: "Nothing special happening today, just a typical Monday."
   Sentiment: neutral

4. Tweet: "Just got back from an amazing vacation in Hawaii!"
   Sentiment: positive

5. Tweet: "Feeling under the weather after yesterday's marathon."
   Sentiment: negative
"""

# Few-shot prompt: a role description, five labelled example tweets, the
# instruction, and a "{tweet}" placeholder that is filled in with str.format().
# The examples end each block with a "Sentiment: <label>" line, which is the
# response format the model is expected to imitate.
prompt_template = """
# Role
You are an expert sentiment analysis agent trained to analyze tweets. 
Your task is to determine whether the sentiment expressed in a tweet is positive, negative, or neutral.

# Examples
The following examples illustrate the expected response format:
Tweet: "Wow, just watched an incredible sunset from my balcony!"
Sentiment: positive

Tweet: "Frustrated with how slow my internet has been today."
Sentiment: negative

Tweet: "Nothing special happening today, just a typical Monday."
Sentiment: neutral

Tweet: "Just got back from an amazing vacation in Hawaii!"
Sentiment: positive

Tweet: "Feeling under the weather after yesterday's marathon."
Sentiment: negative

# Instructions
Analyze the sentiment of the following tweet and respond with exactly one of the options: 'positive', 'negative', 'neutral'. 

# Your Task
Tweet: "{tweet}"
"""

tweets = [
    "Just had the best day ever with my friends!",
    "I'm so disappointed with the service at the restaurant.",
    "Looks like it's going to rain all week. Oh well, more time for coding!",
    "Can't believe I got the job! Dreams do come true!",
    "Not feeling well today, think I caught a cold.",
]

# Few-shot: each tweet is substituted into the template's {tweet} placeholder
# and the filled-in prompt is sent to the model.
for tweet in tweets:
    print(f"Tweet: {tweet}")
    few_shot_prompt = prompt_template.format(tweet=tweet)
    response = llm.invoke(few_shot_prompt)
    print(f"{response}\n")
    print("\n***\n")

Tweet: Just had the best day ever with my friends!
Sentiment: positive


***

Tweet: I'm so disappointed with the service at the restaurant.
Sentiment: negative


***

Tweet: Looks like it's going to rain all week. Oh well, more time for coding!
Sentiment: positive


***

Tweet: Can't believe I got the job! Dreams do come true!
Sentiment: positive


***

Tweet: Not feeling well today, think I caught a cold.
Sentiment: negative


***



<img src="./slides/Slide31.png" alt="Image description" width="1000">

<img src="./slides/Slide32.png" alt="Image description" width="1000">

<img src="./slides/Slide33.png" alt="Image description" width="1000">

## Demo: Chain of Thought Prompting

In [7]:
# Set up Council to help with OpenAI LLM calls

from council.contexts import AgentContext, Budget
from council.llm import OpenAILLM, LLMMessage

import dotenv

# Load settings from a .env file, asking dotenv to override values that are
# already set in the environment.
dotenv.load_dotenv(override=True)

# Build the OpenAI-backed LLM client from environment configuration; it is the
# default `llm` for invoke_GPT below.
llm_gpt = OpenAILLM.from_env()

def invoke_GPT(prompt, llm=llm_gpt, system_prompt=None, context=None):
    """Send a single user prompt to the LLM and return its first choice.

    Args:
        prompt: Text sent as the user message.
        llm: Client used for the request; defaults to `llm_gpt`.
        system_prompt: Optional text sent as a system message ahead of the
            user message. Skipped when falsy.
        context: Agent context for the call. When None, an empty context
            with `Budget(200)` is created for this request.

    Returns:
        `first_choice` of the chat response.
    """
    if context is None:
        context = AgentContext.empty(budget=Budget(200))

    # The system message, when given, goes first; the user message always follows.
    messages = [LLMMessage.system_message(system_prompt)] if system_prompt else []
    messages.append(LLMMessage.user_message(prompt))

    return llm.post_chat_request(context=context, messages=messages).first_choice


gpt-4-turbo-preview may change over time. Returning num tokens assuming gpt-4-0125-preview.


In [8]:
# Two prompts that differ only by the trailing "Let's think step by step."
# cue, so the two responses can be compared side by side.
prompt1 = "How can I perform sentiment analysis on a large database of customer reviews?"
prompt2 = "How can I perform sentiment analysis on a large database of customer reviews? Let's think step by step."
# Each response is printed before the next request is sent.
response1 = invoke_GPT(prompt1)
print(f"Prompt: {prompt1}\nResponse: {response1}\n\n*********\n\n")
response2 = invoke_GPT(prompt2)
print(f"Prompt: {prompt2}\nResponse: {response2}\n")

Prompt: How can I perform sentiment analysis on a large database of customer reviews?
Response: Performing sentiment analysis on a large database of customer reviews involves several steps, from data collection and preprocessing to choosing the right analysis model and interpreting the results. Here's a structured approach to tackle this task:

### 1. Data Collection and Preparation

- **Gather Data**: Ensure you have access to the customer reviews. This might involve scraping websites, accessing APIs, or exporting data from databases.
- **Data Cleaning**: Clean the data to remove irrelevant information, such as HTML tags, special characters, and extra spaces. This step is crucial for improving the accuracy of your analysis.
- **Normalization**: Convert the text to a uniform case (usually lowercase) to ensure consistency.
- **Tokenization**: Break down the text into individual words or tokens. This step is essential for further processing.
- **Stop Words Removal**: Remove common words 

### ReAct with GPT-4

In [9]:
# System prompt for the ReAct demo: the model is told to interleave Thought /
# Action / Observation steps, to pick each Action from the listed function
# signatures, and to wait for the Observation after each Action. The functions
# are only described to the model here; nothing in this cell implements them.
react_system_prompt = """Solve a problem with interleaving steps:
1. Thought
2. Action
3. Observation

where each Action must be a use of one of the following functions: 
[
    ask_question(question:str) -> str,  # Send a question to the user and receive a response
    web_search(search_query:str) -> str,  # Perform a web search and return summaries of the top 5 results
    calculator(op:ArithOperation, a:float, b:float) -> float,  # Perform a simple arithmetic operation
    query_database(query:str) -> pd.DataFrame,  # Query a database and return the results as a DataFrame
    analyze_data(data:pd.DataFrame) -> str  # Analyze a DataFrame and return a summary
]

Let's tackle the problem one step at a time.
Whenever you invoke an Action, please wait for me to provide the next Observation before you proceed.
"""

# The problem statement sent as the first message of the conversation.
task = """I need help creating a model that will use customer sentiment data to predict future sales."""

message = task
print(f"Sending message to GPT:\n{message}\n")

# Running transcript: the task first, then alternating model replies and the
# text typed by the user.
messages = [message]
while message != "stop":
    # The whole transcript is joined into one user prompt for every request.
    llm_response = invoke_GPT("\n\n".join(messages), system_prompt=react_system_prompt)
    messages.append(llm_response)
    print(f"Received response from GPT:\n{llm_response}\n")

    # The user stands in for the tools: type the Observation for the Action
    # the model requested, or "stop" to end the session.
    message = input("Enter your response:")
    messages.append(message)

Sending message to GPT:
I need help creating a model that will use customer sentiment data to predict future sales.

Received response from GPT:
### Thought
To create a model that uses customer sentiment data to predict future sales, we need to understand the type of data available, the scope of the sentiment analysis (e.g., positive, negative, neutral), and the historical sales data to correlate with the sentiment. We also need to consider the tools and methods for building and training the model.

### Action
First, we should gather information on the typical approaches to building models that correlate customer sentiment with sales. This will help us understand the common practices, tools, and data requirements.

**Action:** web_search("how to use customer sentiment data to predict sales")

### Observation
Please wait for the Observation to proceed.



Enter your response: To use customer sentiment data to predict sales effectively, you can follow these key steps:      Collect and Process Sentiment Data: Gather customer feedback from social media, reviews, and surveys. Use sentiment analysis tools to convert this textual data into numerical sentiment scores.      Combine with Sales Data: Integrate the sentiment scores with historical sales data, ensuring alignment in terms of time periods and product or service categories.      Model Development: Use statistical or machine learning models to analyze the relationship between sentiment scores and sales figures. Common models include regression analysis, time series forecasting, and machine learning algorithms like random forests or neural networks.      Validation and Refinement: Validate the model with a separate set of data to check its accuracy and refine it based on the results.      Deployment and Monitoring: Implement the model to predict future sales based on new sentiment data,

Received response from GPT:
### Thought
The observation provides a clear roadmap for using customer sentiment data to predict sales, which involves collecting sentiment data, combining it with sales data, developing a model, validating this model, and then deploying and monitoring it. Before proceeding with these steps, it's crucial to understand the specific data we have access to or need to collect. This includes the sources of customer sentiment data (e.g., social media, reviews, surveys) and the historical sales data we can use for model training.

### Action
To better understand the data requirements and preparation process for this project, I will ask a question to clarify the availability and nature of the data we have or need.

**Action:** ask_question("Do we already have access to customer sentiment data and historical sales data? If so, what is the format and scope of this data?")

### Observation
Please wait for the Observation to proceed.



Enter your response: Yes, we have access to customer sentiment data collected from online reviews and social media mentions, formatted as JSON files. Our historical sales data is stored in Excel spreadsheets and covers the past three years, detailing monthly sales figures per product category.


Received response from GPT:
### Thought
With the availability of customer sentiment data in JSON format and historical sales data in Excel spreadsheets, the next steps involve processing these datasets to a usable format for analysis and model building. The sentiment data will likely require cleaning and sentiment analysis to convert textual data into numerical scores. The sales data will need to be aggregated or transformed to match the sentiment data's granularity, ensuring that the analysis correlates sentiment to sales effectively. Given the formats, we'll need to convert the JSON and Excel files into a format that can be easily manipulated and analyzed, such as a pandas DataFrame in Python.

### Action
Before proceeding with data processing, it's essential to convert the JSON sentiment data and Excel sales data into pandas DataFrames. This will facilitate easier manipulation and analysis in the subsequent steps. Since I can't directly convert files here, I'll simulate querying a d

Enter your response: stop


## Tree of Thoughts Prompting

In [10]:
# System prompt for the Tree-of-Thoughts demo: the model role-plays three
# experts who each contribute one reasoning step per round, with the user
# giving feedback between rounds.
tot_system_prompt = """Imagine three different experts are answering this question.
Each expert has a specific area of expertise, which should be declared when they are speaking.
All experts will write down one step of their thinking, then share it with the group.
Then all experts will go on to the next step, etc.
If any expert realises they're wrong or failing to contribute at any point then they leave.
I can also instruct you to remove an expert from the conversation.
Let's do this exercise one step at a time. I will provide feedback between steps.
"""

# The problem statement sent as the first message of the conversation.
task = "I need help designing a model that will use customer sentiment data to predict future sales."

message = task
print(f"Sending message to GPT:\n{message}\n")

# Running transcript: the task first, then alternating model replies and the
# text typed by the user.
messages = [message]
while message != "stop":
    # The whole transcript is joined into one user prompt for every request.
    llm_response = invoke_GPT("\n\n".join(messages), system_prompt=tot_system_prompt)
    messages.append(llm_response)
    print(f"Received response from GPT:\n{llm_response}\n")

    # The user's feedback on this round of expert steps, or "stop" to end the session.
    message = input("Enter your response:")
    messages.append(message)

Sending message to GPT:
I need help designing a model that will use customer sentiment data to predict future sales.

Received response from GPT:
To address your request, let's introduce three experts in the fields relevant to your project:

1. **Data Scientist**: Specializes in machine learning, statistical analysis, and predictive modeling.
2. **Business Analyst**: Focuses on market trends, business strategies, and the financial implications of data insights.
3. **NLP (Natural Language Processing) Expert**: Concentrates on analyzing, understanding, and generating human languages through machine learning and AI techniques.

**Step 1: Initial Thoughts**

**Data Scientist**: The first step in designing a model to use customer sentiment data to predict future sales is to gather and preprocess the data. This involves collecting customer reviews, ratings, and any other relevant sentiment data, then cleaning and structuring it for analysis. We'll need to decide on the type of model that mig

Enter your response: The initial plan sounds comprehensive and well-structured. I would suggest incorporating a feedback loop into the model to continuously refine our predictions based on new data and sales outcomes, ensuring our model adapts over time to changes in consumer behavior and market conditions.


Received response from GPT:
**Step 2: Data Collection and Preparation**

**Data Scientist**: Before we proceed with any modeling, we need to ensure we have a robust dataset. This involves not only collecting customer sentiment data but also historical sales data, as we'll need to correlate sentiment with actual sales outcomes. It's important to ensure the data spans a sufficient time frame to capture seasonal variations and any market trends. Data quality is key, so we'll need to identify and handle missing values, outliers, and any inconsistencies in the data.

**Business Analyst**: In parallel with the data scientist's efforts, I'll focus on gathering contextual information that could affect the interpretation of our data. This includes industry trends, competitor performance, and any external factors such as economic indicators that might influence sales. This contextual data can help us understand anomalies in the data and may also be used as additional features in our predictive m

Enter your response: Okay great! Can each of you please tell me in 1 or 2 sentences what your action items are?


Received response from GPT:
**Data Scientist**: My action items involve collecting a robust dataset that includes both customer sentiment and historical sales data, ensuring it's of high quality by handling missing values and outliers, and preparing it for analysis.

**Business Analyst**: I will gather contextual and industry-specific data that could impact sales and sentiment analysis, including economic indicators, market trends, and competitor performance, to provide a comprehensive background for our predictive model.

**NLP Expert**: I'm tasked with compiling a diverse and voluminous dataset of customer feedback from various sources and preparing it for analysis through cleaning, standardizing, and initial preprocessing steps like tokenization.



Enter your response: stop


# LLM Agents

<img src="./slides/Slide34.png" alt="Image description" width="1000">

<img src="./slides/Slide35.png" alt="Image description" width="1000">

## Demo: DSPy: "Programming—not prompting—Foundation Models"

<img src="./slides/Slide37.png" alt="Image description" width="1000">

In [11]:
import html
import time
from typing import List, Optional

import dspy
import wget
from dspy import Signature, InputField, OutputField
from dspy.functional import TypedPredictor
from IPython.display import display, HTML
from openai import OpenAI
from pydantic import BaseModel, Field

# Make gpt-4o (max_tokens=2048) the language model dspy uses.
dspy.settings.configure(lm=dspy.OpenAI(model="gpt-4o", max_tokens=2048))
# Separate OpenAI client, used later in this notebook for image generation.
# NOTE(review): presumably reads its API key from the environment — confirm.
client = OpenAI()

In [16]:
class AIImage(BaseModel):
    """A single generated image."""

    # pydantic's keyword for field help text is `description`. `desc` is the
    # dspy InputField/OutputField spelling and is only an unrecognised extra
    # keyword on a pydantic Field.
    prompt: str = Field(description="The prompt used to generate the image.")
    # Defaults to a local placeholder path until a real image has been saved.
    url: str = Field(
        description="The URL of the generated image.", default="./img/placeholder.webp"
    )


class Slide(BaseModel):
    """A single slide in a lecture."""

    # Field help text uses pydantic's `description` keyword (not dspy's `desc`).
    title: str = Field(description="The slide's title.")
    bullets: List[str] = Field(
        description="Up to 5 bullet points of concise, relevant content."
    )
    image: AIImage = Field(description="A nice AI generated image to accompany the slide.")
    # Optional[...] so that an explicit null for an omitted example validates,
    # matching the None default.
    python_code_example: Optional[str] = Field(
        description="An optional Python code example to include in the slide.", default=None
    )

    def to_html(self):
        """Render the slide as an HTML fragment.

        The fragment is a heading followed by a one-row table: the image in
        the left cell, the bullet list (and the code example, when present)
        in the right cell, ending with a horizontal rule.

        Returns:
            The HTML fragment as a string.
        """
        # The text comes from a language model, so escape it before embedding:
        # a stray "<", "&" or quote (common in code examples) would otherwise
        # corrupt the markup or inject tags.
        esc = html.escape
        parts = [
            f"<h2>{esc(self.title)}</h2><table><tr><td>",
            f'<img src="{esc(self.image.url)}" width="400" alt="{esc(self.image.prompt)}"></td><td>',
            "<ul>",  # <li> elements need an enclosing list to be valid HTML
        ]
        parts.extend(f"<li>{esc(bullet)}</li>" for bullet in self.bullets)
        parts.append("</ul>")
        if self.python_code_example:
            parts.append(f"<pre><code>{esc(self.python_code_example)}</code></pre>")
        parts.append("</td></tr></table><hr>")
        return "".join(parts)


class Lecture(BaseModel):
    """A complete lecture with a title, description, and content."""

    # Field help text uses pydantic's `description` keyword (not dspy's `desc`).
    title: str = Field(description="The lecture's title.")
    description: str = Field(description="A brief description of the lecture.")
    slides: List[Slide] = Field(description="The slides that make up the lecture.")

    def to_html(self):
        """Render the whole lecture as an HTML fragment.

        Emits the title and description, a horizontal rule, and then each
        slide's own `to_html()` output in order.

        Returns:
            The HTML fragment as a string.
        """
        # Title and description are model-generated text, so escape them
        # before embedding them in markup.
        header = (
            f"<h1>{html.escape(self.title)}</h1>"
            f"<p>{html.escape(self.description)}</p><hr>"
        )
        return header + "".join(slide.to_html() for slide in self.slides)


# dspy signature with one input (the lecture subject) and one structured
# output (a Lecture).
# NOTE(review): dspy presumably uses the docstring and the `desc` strings below
# as prompt text — confirm before rewording them, since that would change what
# the model is asked.
class LectureCreator(Signature):
    """Create content for a great lecture."""

    lecture_subject: str = InputField(desc="The subject of the lecture.")
    lecture_content: Lecture = OutputField(desc="The complete lecture content.")

### Bringing it all together

In [17]:
def create_my_lecture(subject: str):
    """Generate a lecture on `subject`, illustrate each slide, and save it as HTML.

    The lecture is produced by a `TypedPredictor` over `LectureCreator`. For
    every slide an image is requested from OpenAI using the slide's image
    prompt, downloaded into `./img`, and the slide's image URL is pointed at
    the downloaded file. The rendered lecture is then written to
    `"{subject}_lecture.html"` in the current directory.

    Args:
        subject: The lecture subject; also used as the output file name prefix.

    Returns:
        The prediction object, whose `lecture_content` holds the lecture.
    """
    max_attempts = 3
    retry_delay_seconds = 5

    lecture_creator = TypedPredictor(LectureCreator)
    lecture = lecture_creator(lecture_subject=subject)
    for slide in lecture.lecture_content.slides:
        prompt = slide.image.prompt
        print(f"Calling OpenAI to generate an image for the prompt: {prompt}")
        for attempt in range(1, max_attempts + 1):
            try:
                # Call OpenAI to generate the image
                dalle_response = client.images.generate(
                    model="dall-e-3",
                    prompt=prompt,
                    size="1024x1024",
                    quality="standard",
                    n=1,
                )
                # Download and save it
                image_url = dalle_response.data[0].url
                image_filename = wget.download(image_url, out="./img")
                slide.image.url = image_filename
                break
            except Exception as e:
                # Deliberately broad: a failed image must not abort the lecture.
                if attempt == max_attempts:
                    # No retry follows the last attempt, so neither claim one nor sleep.
                    print(
                        f"Error calling OpenAI: {e}, giving up after {max_attempts} attempts; "
                        "keeping the slide's current image URL."
                    )
                else:
                    print(f"Error calling OpenAI: {e}, retrying after 5 seconds...")
                    time.sleep(retry_delay_seconds)

    # Save the rendered HTML to a file. The encoding is explicit because the
    # generated text may contain characters the platform default cannot encode.
    with open(f"{subject}_lecture.html", "w", encoding="utf-8") as file:
        file.write(lecture.lecture_content.to_html())

    return lecture

In [18]:
# Generate the lecture (this also writes the HTML file), then render the same
# HTML inline in the notebook.
lecture = create_my_lecture("Sentiment Analysis for Sales Prediction")
lecture_html = lecture.lecture_content.to_html()
display(HTML(lecture_html))

Calling OpenAI to generate an image for the prompt: A diverse group of people analyzing social media posts on a large screen
100% [..........................................................................] 3162696 / 3162696Calling OpenAI to generate an image for the prompt: A graph showing sales trends influenced by social media sentiment
100% [..........................................................................] 3162696 / 3162696Calling OpenAI to generate an image for the prompt: A computer screen displaying code and sentiment analysis results
100% [..........................................................................] 3162696 / 3162696Calling OpenAI to generate an image for the prompt: A Python code editor with sentiment analysis code
100% [..........................................................................] 3162696 / 3162696Calling OpenAI to generate an image for the prompt: A team of data scientists working on sales forecasting using sentiment analysis
100% [....

0,1
,Definition of Sentiment AnalysisImportance in various industriesBasic techniques and tools

0,1
,How sentiment affects consumer behaviorCase studies of sentiment analysis in salesBenefits of using sentiment analysis for sales forecasting

0,1
,Overview of popular sentiment analysis toolsIntroduction to Natural Language Processing (NLP)Machine learning models for sentiment analysis

0,1
,Setting up the environmentUsing libraries like NLTK and TextBlobBuilding a simple sentiment analysis modelimport nltk from textblob import TextBlob # Sample text text = 'I love this product! It has changed my life.' # Create a TextBlob object blob = TextBlob(text) # Get the sentiment sentiment = blob.sentiment print(sentiment)

0,1
,Overview of the case studyData collection and preprocessingModel training and evaluation

0,1
,Common challenges in sentiment analysisEthical considerationsFuture trends and advancements


## Data Analytics Agent

In [19]:
# Text description of the AirBNB dataset: one table (AB_NYC_2019) with a
# description and a name for each column. It is handed to the language model as
# plain text, so it is kept as a string rather than parsed.
airbnb_schema = """kind: DatasetMetadata
version: 0.1
metadata:
  name: AirBNB
spec:
  desc: "New York City Airbnb Open Data"
  tables:
    - name: AB_NYC_2019
      desc: 'Since 2008, guests and hosts have used Airbnb to expand on traveling possibilities
    and present more unique, personalized way of experiencing the world. This dataset
    describes the listing activity and metrics in NYC, NY for 2019.

    Content

    This data file includes all needed information to find out more about hosts, geographical
    availability, necessary metrics to make predictions and draw conclusions.'
      columns:
      - desc: listing ID
        name: id
      - desc: name of the listing
        name: name
      - desc: host ID
        name: host_id
      - desc: name of the host
        name: host_name
      - desc: location
        name: neighbourhood_group
      - desc: area
        name: neighbourhood
      - desc: latitude coordinates
        name: latitude
      - desc: longitude coordinates
        name: longitude
      - desc: listing space type
        name: room_type
      - desc: price in dollars
        name: price
      - desc: amount of nights minimum
        name: minimum_nights
      - desc: number of reviews
        name: number_of_reviews
      - desc: latest review
        name: last_review
      - desc: number of review per month
        name: reviews_per_month
      - desc: amount of listing per host
        name: calculated_host_listings_count
      - desc: number of days when listing is available for booking
        name: availability_365"""

In [20]:
import re

import pandas as pd
from pydantic import BaseModel, Field
from sqlalchemy import create_engine
from typing import Any, Dict

# Assuming dspy is properly imported and configured
import dspy
from dspy.functional import TypedPredictor
from dspy import Signature, InputField, OutputField

# Connection URI for the PostgreSQL database queried below (psycopg2 driver,
# database "nyc_airbnb" on localhost:5432).
# NOTE(review): the URI embeds a username and password in plain text, and it is
# also passed to the language model as an input field — confirm that is
# acceptable outside a local demo.
database_URI = "postgresql+psycopg2://postgres:postgres@localhost:5432/nyc_airbnb"

def parse_code_block(code_block, kind):
    """Extract the contents of the first fenced code block tagged `kind`.

    Args:
        code_block: Text that may contain a fenced block such as
            "```sql ... ```". May be None or empty.
        kind: The fence tag to look for (for example "sql"). It is matched
            literally, so tags containing regex metacharacters such as
            "c++" work.

    Returns:
        The text between the opening fence and the next closing fence, with
        surrounding whitespace stripped, or None when the input is empty or
        contains no such block.
    """
    if not code_block:
        return None
    # Escape the tag so it is matched as literal text, not as a regex fragment.
    pattern = rf"```{re.escape(kind)}(.*?)```"
    # DOTALL lets the non-greedy group span multiple lines.
    match = re.search(pattern, code_block, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None

# dspy signature with three inputs (task, schema, database URI) and two
# outputs (the model's plan and a SQL code block).
# NOTE(review): dspy presumably uses the docstring and the `desc` strings below
# as prompt text — confirm before rewording them.
# NOTE(review): the caller in this file passes a URI containing credentials as
# `database_URI`; confirm the model actually needs it.
class SQLCodeGenerator(Signature):
    """Generate PostgreSQL code to access data from the database."""
    task: str = InputField(desc="The Database Specialist's task in natural language.")
    database_schema: str = InputField(desc="The database's schema.")
    database_URI: str = InputField(desc="The database's URI.")
    thoughts: str = OutputField(desc="The Database Specialist's high-level plan for writing a great query.")
    sql_code_block: str = OutputField(desc="The SQL code block to access the data.")

class DatabaseSpecialist(BaseModel):
    """Write and execute a PostgreSQL query to access data from the database."""
    # Field help text uses pydantic's `description` keyword (not dspy's `desc`).
    database_task: str = Field(description="A natural-language task for the DatabaseSpecialist.")
    database_schema: str = Field(description="The database schema.")

    def execute_query(self, query) -> pd.DataFrame:
        """Run `query` against the database at `database_URI`.

        Args:
            query: The SQL text to execute.

        Returns:
            The query result as a DataFrame, or an empty DataFrame when
            connecting or executing fails (the error is printed).
        """
        # NOTE(review): the query is model-generated and executed verbatim;
        # consider a read-only database role outside a demo setting.
        print(f"Executing query:\n\n{query}\n")
        try:
            engine = create_engine(database_URI)
            with engine.connect() as connection:
                df = pd.read_sql_query(query, connection)
            return df
        except Exception as e:
            # Deliberately broad: a failed query is reported, not raised.
            print(f"Query execution failed: {e}")
            return pd.DataFrame()  # Return an empty DataFrame on failure

    def execute(self) -> pd.DataFrame:
        """Generate SQL for `database_task` with the model, then execute it.

        Returns:
            The query result as a DataFrame, or an empty DataFrame when the
            model's answer contains no fenced sql block or the query fails.
        """
        print("Generating query...\n")
        code_generator = TypedPredictor(SQLCodeGenerator)
        response = code_generator(task=self.database_task, database_schema=self.database_schema, database_URI=database_URI)
        print("Done!\n")
        sql_query = parse_code_block(response.sql_code_block, kind="sql")
        if sql_query is None:
            # Without this guard None would be sent to the database and fail
            # there with a less helpful error.
            print("No fenced sql code block found in the model response; nothing to execute.\n")
            return pd.DataFrame()
        return self.execute_query(sql_query)

# Usage example
# Alternative task to try instead:
#   "Please get me the average price and standard deviation by borough."
specialist = DatabaseSpecialist(
    database_task="Which 5 neighbourhoods have the listings with the most reviews?",
    database_schema=airbnb_schema,
)
# Generate the SQL with the model, run it, and show the resulting DataFrame.
result = specialist.execute()

print(result)

Generating query...

Done!

Executing query:

SELECT 
    neighbourhood, 
    SUM(number_of_reviews) AS total_reviews
FROM 
    AB_NYC_2019
GROUP BY 
    neighbourhood
ORDER BY 
    total_reviews DESC
LIMIT 5;

        neighbourhood  total_reviews
0  Bedford-Stuyvesant       110352.0
1        Williamsburg        85427.0
2              Harlem        75962.0
3            Bushwick        52514.0
4      Hell's Kitchen        50227.0
