# Tagging

In [1]:
import os

from pydantic import BaseModel
from llama_index.program import OpenAIPydanticProgram, LLMTextCompletionProgram
from llama_index.llms import OpenAI, MistralAI
from llama_index.prompts import PromptTemplate
from llama_index.output_parsers import PydanticOutputParser

import pandas as pd
from dotenv import dotenv_values

Set the `OPENAI_API_KEY` and `MISTRAL_API_KEY` environment variables, or load them from a `.env` file.

In [2]:
# Resolve the API keys from a local .env file, falling back to the process
# environment when the .env file is absent or does not define (or leaves
# empty) a key. A key missing from both places raises KeyError.
config = dotenv_values()
OPENAI_API_KEY = config.get("OPENAI_API_KEY") or os.environ["OPENAI_API_KEY"]
MISTRAL_API_KEY = config.get("MISTRAL_API_KEY") or os.environ["MISTRAL_API_KEY"]

## Load Datasets

Load Kaggle's Stock-Market Sentiment Dataset

In [3]:
# Create the local data/ directory (-p: no error if it already exists).
!mkdir -p data
# Download the tweets CSV from Google Drive and save it as data/tweets.csv.
!wget "https://drive.google.com/uc?export=download&id=1fprVejtXNlOrO0Kv2Izp2MJWNGUSbi11" -O data/tweets.csv

--2023-12-20 15:20:39--  https://drive.google.com/uc?export=download&id=1fprVejtXNlOrO0Kv2Izp2MJWNGUSbi11
Resolving drive.google.com (drive.google.com)... 173.194.73.138, 173.194.73.100, 173.194.73.102, ...
Connecting to drive.google.com (drive.google.com)|173.194.73.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0g-30-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/9cgsqin934pt90b6062ueg3msqfojtr6/1703064000000/04295854648265356048/*/1fprVejtXNlOrO0Kv2Izp2MJWNGUSbi11?e=download&uuid=2ef91373-20c3-4000-8afe-511ad818d80f [following]
--2023-12-20 15:20:40--  https://doc-0g-30-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/9cgsqin934pt90b6062ueg3msqfojtr6/1703064000000/04295854648265356048/*/1fprVejtXNlOrO0Kv2Izp2MJWNGUSbi11?e=download&uuid=2ef91373-20c3-4000-8afe-511ad818d80f
Resolving doc-0g-30-docs.googleusercontent.com (doc-0g-30-docs.googleusercontent.com)... 216.58.212.193
Connec

In [4]:
# Load the downloaded tweets (comma-separated, pandas' default delimiter)
# and preview the first rows.
df = pd.read_csv("data/tweets.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [5]:
# Randomly select 10 samples and collect their text into a list of str.
# The sample is seeded so that re-running the notebook tags the same tweets.
df_elements = df.sample(n=10, random_state=42)
tweets_list = df_elements["Text"].to_list()
tweets_list

["JPM bounced off it's fib support level today too.",
 'user: Nice B/O for BMY today. Not a lot of volume though. First pullback to confirm.  ',
 'There are arguments for American banks both to suspend and to keep their dividend payouts. But the degree of creditâ€¦ https://t.co/a30vacnv70',
 'ssys if fundamentals matter... headed under 40. income is 10% sales. 40mil net inc. puts this at PE of 60 with stock at 60. doubt it',
 'BWD Wants to fill the gap which will be complete at 83.10. Higher highs and MACD and SI showing bullish signals.',
 'Heard on the Street: Investors are betting on IACâ€™s remodel, but just how quickly can the whole thing be fixed? https://t.co/COJsGVy9pH',
 'biggest mistake i made all day was not sticking to my gut on reversal in #SOAS #2 not sticking to my watchlist DK STP ANA KOG',
 'U.S. corporate bonds suffer negative ratings moves, while analysts say more may be coming https://t.co/w35t7jJkwY',
 "Time to all in short AX AMZN NTAP IBM. #cloud-is-dead, iPads n

## Define output schema

We use a Pydantic schema to specify the required properties and types.

In [6]:
class Tag(BaseModel):
    """Data model for tagging."""

    # NOTE(review): this class is passed to both tagging programs as the
    # output schema; the docstring and field names are presumably part of what
    # the model sees, so confirm before rewording or renaming anything here.
    # None of the value constraints stated in the prompts are enforced by this
    # model: fields are plain `str` / `float`.

    # The comment being tagged (the prompts ask for the comment itself).
    text: str
    # Full name of the language the comment is written in, per the prompts.
    language: str
    # Sentiment label; the prompts ask for 'positive', 'negative' or 'neutral'.
    sentiment: str
    # Scores below: the prompts ask for floats in the range (0.1 - 1).
    toxicity: float
    hate: float
    hate_threatening: float
    violence: float

## Tagging with OpenAI

Define the OpenAI Pydantic program.

In [7]:
# Instruction prompt for the OpenAI tagger; `{user_comment}` is filled in on
# every call. The backslashes strip the leading and trailing newline.
openai_prompt_template = """\
I request you to divide the user comment below into aspects and perform sentiment analysis and give the result in sentiment analysis json format. Just return result in json format. Don't add any other comments.

Note: The toxicity, hate, hate_threatening and violence scores should be in the range (0.1 - 1). The scores should be a float.
The sentiment label should be either ['positive', 'negative', 'neutral'].
The text label should be the comment itself.
The language label should be a full name of the language in which the comment is written.

comment: {user_comment}\
"""

openai_model_name = "gpt-3.5-turbo-1106"

# Build the LLM client on its own line, then wire it into the program that
# produces `Tag` objects from the prompt above.
openai_llm = OpenAI(model=openai_model_name, api_key=OPENAI_API_KEY)
openai_program = OpenAIPydanticProgram.from_defaults(
    llm=openai_llm,
    output_cls=Tag,
    prompt_template_str=openai_prompt_template,
    verbose=True,
)

Run program to get tagging output.

In [8]:
# Try the program on a single hand-written comment first.

openai_output = openai_program(user_comment="I love this world!")

Function call: Tag with args: {"text":"I love this world!","language":"English","sentiment":"positive","toxicity":0.1,"hate":0.1,"hate_threatening":0.1,"violence":0.1}


In [9]:
# Display the tagged result as a plain dict of field name -> value.
openai_output.model_dump()

{'text': 'I love this world!',
 'language': 'English',
 'sentiment': 'positive',
 'toxicity': 0.1,
 'hate': 0.1,
 'hate_threatening': 0.1,
 'violence': 0.1}

Run the tagging program on the dataset.

In [10]:
# Tag every sampled tweet with the OpenAI program. Tagging is best-effort:
# when a call raises, the tweet is recorded in `errors`, the reason is
# printed, and the loop continues with the next tweet.
openai_tagged_list = []
errors = []
for tweet in tweets_list:
    try:
        output = openai_program(user_comment=tweet)
    except Exception as exc:
        # Keep going, but report why this tweet failed instead of silently
        # discarding the exception.
        errors.append(tweet)
        print(f"Tagging failed for {tweet!r}: {type(exc).__name__}: {exc}")
    else:
        # Only reached when the call succeeded, so the try body stays minimal.
        openai_tagged_list.append(output)

Function call: Tag with args: {"text":"JPM bounced off it's fib support level today too.","language":"English","sentiment":"positive","toxicity":0.2,"hate":0.1,"hate_threatening":0.1,"violence":0.1}
Function call: Tag with args: {"text":"Nice B/O for BMY today. Not a lot of volume though. First pullback to confirm.","language":"English"}
Function call: Tag with args: {"text":"There are arguments for American banks both to suspend and to keep their dividend payouts. But the degree of creditâ€¦ https://t.co/a30vacnv70","language":"English","sentiment":"positive","toxicity":0.2,"hate":0.1,"hate_threatening":0.1,"violence":0.1}
Function call: Tag with args: {"text":"ssys if fundamentals matter... headed under 40. income is 10% sales. 40mil net inc. puts this at PE of 60 with stock at 60. doubt it","language":"English"}
Function call: Tag with args: {"text":"BWD Wants to fill the gap which will be complete at 83.10. Higher highs and MACD and SI showing bullish signals.","language":"English"

In [11]:
# Pretty-print each successfully tagged tweet as 2-space-indented JSON.
for tagged_tweet in openai_tagged_list:
    tagged_json = tagged_tweet.model_dump_json(indent=2)
    print(tagged_json)

{
  "text": "JPM bounced off it's fib support level today too.",
  "language": "English",
  "sentiment": "positive",
  "toxicity": 0.2,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "There are arguments for American banks both to suspend and to keep their dividend payouts. But the degree of creditâ€¦ https://t.co/a30vacnv70",
  "language": "English",
  "sentiment": "positive",
  "toxicity": 0.2,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "BWD Wants to fill the gap which will be complete at 83.10. Higher highs and MACD and SI showing bullish signals.",
  "language": "English",
  "sentiment": "positive",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "biggest mistake i made all day was not sticking to my gut on reversal in #SOAS #2 not sticking to my watchlist DK STP ANA KOG",
  "language": "English",
  "sentiment": "negative",
  "toxicity": 0.3,
  "hate": 0.1,
  "hate_threatening": 0

## Tagging with Mixtral 8x7B

In [12]:
# Instruction prompt for the Mixtral tagger, wrapped in [INST] markers;
# `{user_comment}` is filled in on every call.
mixtral_prompt_template = """
<s> [INST] I request you to divide the user comment below into aspects and perform sentiment analysis and give the result in sentiment analysis json format. Just return result in json format. Don't add any other comments.

Note: The toxicity, hate, hate_threatening and violence scores should be in the range (0.1 - 1). The scores should be a float.
The sentiment label should be either ['positive', 'negative', 'neutral'].
The text label should be the comment itself. 
The language label should be a full name of the language in which the comment is written. [/INST] </s> 
[INST] comment: {user_comment} 
Answer: [/INST]
"""

mixtral_model_name = "mistral-small"

# Build each collaborator on its own line, then assemble the text-completion
# program whose raw output is parsed into `Tag`.
mixtral_prompt_tmpl = PromptTemplate(mixtral_prompt_template)
mixtral_output_parser = PydanticOutputParser(Tag)
mixtral_llm = MistralAI(model=mixtral_model_name, api_key=MISTRAL_API_KEY)

mixtral_program = LLMTextCompletionProgram(
    output_parser=mixtral_output_parser,
    prompt=mixtral_prompt_tmpl,
    llm=mixtral_llm,
    verbose=True,
)

Run program to get tagging output.

In [13]:
# Try the Mixtral program on a single hand-written comment first.
mixtral_output = mixtral_program(user_comment="I love this world!")

In [14]:
# Display the tagged result as a plain dict of field name -> value.
mixtral_output.model_dump()

{'text': 'I love this world!',
 'language': 'English',
 'sentiment': 'positive',
 'toxicity': 0.1,
 'hate': 0.1,
 'hate_threatening': 0.1,
 'violence': 0.1}

Run the tagging program on the dataset.

In [15]:
# Tag every sampled tweet with the Mixtral program. Tagging is best-effort:
# when a call raises, the tweet is recorded in `errors`, the reason is
# printed, and the loop continues with the next tweet.
mixtral_tagged_list = []
errors = []
for tweet in tweets_list:
    try:
        output = mixtral_program(user_comment=tweet)
    except Exception as exc:
        # Keep going, but report why this tweet failed instead of silently
        # discarding the exception.
        errors.append(tweet)
        print(f"Tagging failed for {tweet!r}: {type(exc).__name__}: {exc}")
    else:
        # Only reached when the call succeeded, so the try body stays minimal.
        mixtral_tagged_list.append(output)

In [16]:
# Pretty-print each successfully tagged tweet as 2-space-indented JSON.
for tagged_tweet in mixtral_tagged_list:
    tagged_json = tagged_tweet.model_dump_json(indent=2)
    print(tagged_json)

{
  "text": "JPM bounced off it's fib support level today too.",
  "language": "English",
  "sentiment": "neutral",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "Nice B/O for BMY today. Not a lot of volume though. First pullback to confirm.",
  "language": "English",
  "sentiment": "positive",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "There are arguments for American banks both to suspend and to keep their dividend payouts. But the degree of creditâ€¦ https://t.co/a30vacnv70",
  "language": "English",
  "sentiment": "neutral",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "BWD Wants to fill the gap which will be complete at 83.10. Higher highs and MACD and SI showing bullish signals.",
  "language": "English",
  "sentiment": "positive",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "Heard on the