# Import

In [1]:
import pandas as pd
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm
from langchain.docstore.document import Document
import ollama

## Define max context size to pass to the model at once
Different models can handle different context lengths. When an article is longer than the model's context size, we have to split it and pass it to the model in smaller parts.

In [2]:
# context_size = 1500

# Load data

In [2]:
# Maps a ticker symbol to the descriptive stock name that is inserted into the prompts.
ticker_to_stock = dict(
    GE='General Electric (GE)',
)

In [26]:
# Read the GE news articles; the first CSV column becomes the DataFrame index.
ext_df = pd.read_csv(
    '../dat_ge.csv',
    index_col=0,
)
# Preview the first rows.
ext_df.head()

Unnamed: 0_level_0,Article_title,Stock_symbol,Url,Publisher,Author,Article,Lsa_summary,Luhn_summary,Textrank_summary,Lexrank_summary
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-12-16 21:00:00+00:00,Rolls-Royce can grow market share and improve ...,GE,https://www.nasdaq.com/articles/rolls-royce-ca...,,,"By Paul Sandle, Sarah Young and Tim Hepher\nLO...",But his challenge is to convince airlines afte...,Long-standing customer Thai Airways is finalis...,Based on underlying aircraft deliveries and ex...,Our engine will be very competitive to GE's en...
2023-12-16 00:00:00+00:00,General Electric's (GE) Unit Wins Wind Turbine...,GE,https://www.nasdaq.com/articles/general-electr...,,,"General Electric Company’s GE unit, GE Vernova...","Per the deal, GE will supply approximately 693...",Image Source: Zacks Investment Research Zacks ...,"General Electric Company’s GE unit, GE Vernova...","General Electric Company’s GE unit, GE Vernova..."
2023-12-16 00:00:00+00:00,FOCUS-North American aviation companies get la...,GE,https://www.nasdaq.com/articles/focus-north-am...,,,"By Allison Lampert\nMONTREAL, Dec 19 (Reuters)...","Business jet maker Bombardier BBDb.TO, which h...","Business jet maker Bombardier BBDb.TO, which h...",As clients like engine maker General Electric ...,As clients like engine maker General Electric ...
2023-12-16 00:00:00+00:00,Airbus on course for record jetliner orders in...,GE,https://www.nasdaq.com/articles/airbus-on-cour...,,,"By Tim Hepher\nPARIS, Dec 19 (Reuters) - Airbu...",Airlines are scrambling to order new planes to...,"Despite the positive end-year note, Airbus is ...","Despite the positive end-year note, Airbus is ...","With just over 10 days to go, the total has re..."
2023-12-15 00:00:00+00:00,Validea Detailed Fundamental Analysis - GE,GE,https://www.nasdaq.com/articles/validea-detail...,,,Below is Validea's guru fundamental report for...,Below is Validea's guru fundamental report for...,Detailed Analysis of GENERAL ELECTRIC CO GE Gu...,"Of the 22 guru strategies we follow, GE rates ...","Of the 22 guru strategies we follow, GE rates ..."


# Check article example

## Define quasi token calculation function

In [4]:
# Translation table turning each of the characters . ! ? : - into a space.
_PUNCT_TO_SPACE = str.maketrans({ch: ' ' for ch in '.!?:-'})


def get_token(x):
    """Return a rough ("quasi") token count for the text ``x``.

    The text is split into words and the word count is divided by 0.75.
    The result is a float and only an estimate, not the output of a real
    tokenizer.

    Args:
        x: Text to measure (str).

    Returns:
        Estimated number of tokens as a float; ``0.0`` for empty or
        whitespace-only text.
    """
    # The previous ``x.replace('.!?:-', ' ')`` only matched that literal
    # 5-character sequence, so punctuation was never actually replaced.
    # ``translate`` replaces each listed character individually.
    words = x.translate(_PUNCT_TO_SPACE).split()
    # ``split()`` without arguments ignores runs of whitespace (including
    # newlines), so empty strings are not counted as words.
    return len(words) / 0.75

## Show example article

In [5]:
# Position of the article to inspect.
i = 1

# Fetch the row once and read both fields from it.
example_row = ext_df.iloc[i]
test_ticker = example_row.Stock_symbol
test_article = example_row.Article

# Show position, ticker, full article text and its estimated token count.
print(
    i, test_ticker,
    '\n\n', test_article,
    '\n\n', get_token(test_article),
)

1 GE 

 General Electric Company’s GE unit, GE Vernova (combined operations of GE Digital, Renewable Energy and GE Power), has recently inked a framework deal with Forestalia for the installation of onshore wind turbines across several upcoming project sites in Spain.

Per the deal, GE will supply approximately 693 megawatts (MW) of wind turbines in 16 different project sites across the country’s Aragon region. The company will install a total of 110 units of 6.1-158 wind turbines in the Zaragoza region in six phases. The first phase, which is already in progress, will see 33 units of wind turbines being constructed at five wind farms.

Out of the 110 GE turbines, 33 turbines will have a hub height of 101 meters and the rest will have a hub height of 120.9 meters. The delivery of the turbine units will be completed by 2024 end.

This deal supports Spain’s effort to shift toward renewable sources of energy and will attract similar projects in 2024.

The collaboration between General Ele

# Define sentiment prediction functions

## Stuff
We use *stuff* when the whole article plus the prompt instructions fits into the model's context size.

With *stuff*, we pass the whole prompt (instructions and article) to the model and predict the sentiment in a single stage.

### Define prompt template
It contains the instructions; the article and the stock name are inserted for each article.

In [6]:
# Single-stage ("stuff") sentiment prompt.
# Filled with str.format: {article} is the text to score, {stock} is the stock name
# (it appears twice). The prompt ends mid-sentence so the model's first generated
# token is expected to be the 1-5 score.
# Fixes relative to the earlier version of this prompt:
#   - the XOM example was missing its closing backtick,
#   - the Microsoft ticker was misspelled "MSTF",
#   - the User section used "##" while every other section uses "###".
llama3_prompt_template = """### System: 
Forget all your previous instructions. You are a financial expert with stock market analysis experience. A news article will be passed and you will give only a single score between 1-5 as an answer. Based on a specific stock, score what sentiment the article implies on a certain stock: range from 1 to 5, where 1 is negative, 2 is somewhat negative, 3 is neutral, 4 is somewhat positive, 5 is positive.
Examples:  
  Stock of interest -- Apple: `AAPL: Apple (AAPL) announced iPhone 15`: Answer: `4`
  Stock of interest -- XOM: `Exxon (XOM) price decreased more than 3%`: Answer: `1`
  Stock of interest -- MSFT: `Microsoft (MSFT) price has no change`: Answer: `3`
  Stock of interest -- Visa: `Visa got better earnings than expected`: Answer: `5` 
  Stock of interest -- BRK: `BRK got slightly worse earnings than expected`: Answer: `2` 

### Article:
```
{article}
```
### User:
What is the implied sentiment of the article to {stock} stock? Answer only a single number from 1 to 5.

### Assistant:
The implied sentiment for {stock} is """

### Define functions for stuff sentiment
- `clean_output` strips every extra symbol from the prediction so that only the sentiment number is returned
- `stuff_sentiment` makes the sentiment prediction

In [10]:
def ollama_api(prompt, model='llama3_8b-Q5_km', temperature=0, max_new_tokens=1):
    """Send a single user prompt to a model through ``ollama.chat``.

    Args:
        prompt: Full prompt text, sent as one ``user`` message.
        model: Name of the Ollama model to query.
        temperature: Sampling temperature passed to the model.
        max_new_tokens: Maximum number of tokens to generate
            (passed as Ollama's ``num_predict`` option).

    Returns:
        The text content of the model's reply message.
    """
    # Generation options must be passed via the ``options`` argument of
    # ``ollama.chat``. They were previously nested inside the message dict,
    # which is not where the chat API reads generation options from.
    reply = ollama.chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        options={
            "seed": 123,  # fixed seed for repeatable generations
            "temperature": temperature,
            "num_predict": max_new_tokens,
        },
    )
    return reply['message']['content']

In [18]:
def clean_output(output):
    """Reduce a raw model reply to (at most) its first two meaningful characters.

    Surrounding whitespace and the symbols ``: ` . ! ? \\ # "`` are removed,
    the text is cut to two characters, and the same symbols are stripped again
    so that a reply such as ``'4: positive'`` becomes ``'4'``.

    Args:
        output: Raw text returned by the model.

    Returns:
        The cleaned string (possibly empty). It is not validated to be a
        number between 1 and 5.
    """
    # Whitespace is included so that replies like ' 4' or '\n\n4' are handled;
    # previously the slice was taken first and spaces were never stripped,
    # which returned ' 4' and '' for those inputs.
    junk = ':`.!?\n\\#" \t\r'
    return output.strip(junk)[:2].strip(junk)

In [19]:
def stuff_sentiment(article, stock):
    """Predict the sentiment of a whole article in a single model call.

    The article and stock name are inserted into ``llama3_prompt_template``,
    the prompt is sent through ``ollama_api`` with its default settings, and
    the reply is passed through ``clean_output``.

    Args:
        article: Full article text.
        stock: Stock name inserted into the prompt.

    Returns:
        The cleaned model reply as a string.
    """
    # The unused ``headers`` dict (a leftover from an HTTP-based client) was removed.
    prompt = llama3_prompt_template.format(article=article, stock=stock)
    return clean_output(ollama_api(prompt))

### Test

In [20]:
# Smoke test: score the article at position 1 in a single model call.
i = 1
test_row = ext_df.iloc[i]
stuff_sentiment(
    test_row.Article,
    ticker_to_stock[test_row.Stock_symbol],
)

'4'

In [23]:
# Draw two random articles. ``random_state`` is fixed so that Restart & Run All
# selects the same rows every time (123 matches the seed used for the model calls).
sample = ext_df.sample(n=2, random_state=123)
sample

Unnamed: 0_level_0,Date,Article_title,Stock_symbol,Url,Publisher,Author,Article,Lsa_summary,Luhn_summary,Textrank_summary,Lexrank_summary
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1102041.0,2022-09-20 00:00:00 UTC,General Electric (GE) Sees Supply Chain Impact...,GE,https://www.nasdaq.com/articles/general-electr...,,,General Electric GE continues to grapple with ...,General Electric GE continues to grapple with ...,General Electric Company Price General Electri...,General Electric Company Price General Electri...,Special Report: The Top 5 IPOs for Your Portfo...
1101335.0,2023-09-21 00:00:00 UTC,Zacks Industry Outlook Highlights General Elec...,GE,https://www.nasdaq.com/articles/zacks-industry...,,,"For Immediate Release\nChicago, IL – September...","GE, sporting a Zacks Rank #1 (Strong Buy), is ...","Over the past five years, the industry has tra...",Zacks Industry Rank Suggests Bright Prospects ...,Zacks Investment Research does not engage in i...


In [24]:
# Score every sampled article with the single-stage ("stuff") approach.
results = []
for row_id in sample.index:
    # Look the row up once in the full frame, then read both fields from it.
    row = ext_df.loc[row_id]
    results.append(stuff_sentiment(row.Article, ticker_to_stock[row.Stock_symbol]))

results


['2', '4']

In [25]:
# Put the predictions into a frame that shares the sampled articles' index.
result_df = pd.DataFrame(
    data=results,
    index=sample.index,
)
result_df

Unnamed: 0_level_0,0
Unnamed: 0,Unnamed: 1_level_1
1102041.0,2
1101335.0,4


## Map Reduce
If the article exceeds this context size, then use map-reduce:
- split the article into smaller parts
- make shorter summaries with sentiment in focus
- concatenate the summaries and make the final sentiment prediction

### Define map prompt template
This prompt is used to condense each article part into a shorter version, with a focus on extracting sentiment information. These summaries are then used for the final sentiment prediction.

In [23]:
# Map-step prompt: asks the model for a short, sentiment-focused summary of one article part.
# Filled with str.format: {article} is the article part, {stock} is the stock name.
# NOTE(review): the few-shot example below contains the typo "expeted" and the redundant
# phrase "by 0.1 billion dollars than expected". The text is left untouched here because it
# is sent to the model verbatim; consider correcting it in a separate, deliberate prompt change.
llama3_map_template = """### System: 
Forget all your previous instructions. You are a financial expert with stock market analysis experience. A news article part will be passed and you will give a summary of it emphasizing the sentiment of the text to the stock.
Examples:
 `Apple got earnings of 1.5 billion $ exceeding expectations by 0.1 billion dollars than expected`: Answer: `Positive sentiment as Apple earnings were better than expeted.` 

### Article part:
```
{article}
```

### User:
Summarize the article of {stock} stock. Answer very shortly. Do not repeat the instructions.

### Assistant:
The summary of the article in a few words is:

"""

### Define functions for map-reduce sentiment
- `clean_mapped_article` removes any prompt instructions that the model echoes back in its summary
- `map_reduce_sentiment` makes the sentiment prediction

In [24]:
def clean_mapped_article(mapped_article):
    """Drop everything from the first ``'###'`` marker onwards.

    The prompts use ``###`` section headers, so text after that marker in a
    model reply is treated as echoed prompt material and removed.

    Args:
        mapped_article: Summary text returned by the model.

    Returns:
        The text before the first ``'###'``; the whole text unchanged when
        the marker does not occur.
    """
    # ``str.find`` returns -1 when the marker is missing, so the previous
    # ``text[:text.find('###')]`` silently dropped the last character of every
    # clean summary. ``partition`` returns the full string in that case.
    summary, _marker, _echoed = mapped_article.partition('###')
    return summary

In [116]:
# Splitter that cuts long articles into overlapping parts for the map step.
# Sizes are measured with get_token (the quasi token estimate), not in characters.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=get_token,   # unit used for chunk_size / chunk_overlap
    chunk_size=1500,             # upper bound per article part
    chunk_overlap=100,           # amount shared between neighbouring parts
    is_separator_regex=False,    # separators are treated as plain strings
)

def _llama3_chat(prompt, num_predict):
    """Send one user prompt to the ``llama3`` model and return the reply text."""
    # Generation options go in the ``options`` argument of ``ollama.chat``;
    # they were previously nested inside the message dict, which is not where
    # the chat API reads generation options from.
    reply = ollama.chat(
        model='llama3',
        messages=[{'role': 'user', 'content': prompt}],
        options={"seed": 123, "temperature": 0, "num_predict": num_predict},
    )
    return reply['message']['content']


def map_reduce_sentiment(article, stock):
    """Predict the sentiment of a long article with a map-reduce approach.

    Map: the article is split with the module-level ``text_splitter`` and each
    part is summarised by the model using ``llama3_map_template`` (up to 100
    generated tokens per part).
    Reduce: the summaries are joined with newlines, inserted into
    ``llama3_prompt_template`` and scored by the model (1 generated token).

    Each summary and the final cleaned response are printed as progress output.

    Args:
        article: Full article text.
        stock: Stock name inserted into both prompts.

    Returns:
        The model's final reply after ``clean_output``, as a string.
    """
    # Split the article into parts.
    doc = [Document(page_content=article, metadata={"source": "local"})]
    sources = text_splitter.split_documents(doc)

    # Map step: summarise every part with the sentiment in focus.
    mapped_article_parts_list = []
    for article_part_doc in sources:
        prompt = llama3_map_template.format(article=article_part_doc.page_content, stock=stock)
        mapped_article = _llama3_chat(prompt, num_predict=100)
        # Remove any echoed prompt sections and flatten the summary to one line.
        mapped_article = clean_mapped_article(mapped_article).replace('\n', '')
        print('Summarized by llama3: ', mapped_article)
        mapped_article_parts_list.append(mapped_article)

    # Reduce step: score the concatenated summaries with the single-stage prompt.
    joined_article_parts = '\n'.join(mapped_article_parts_list)
    prompt = llama3_prompt_template.format(article=joined_article_parts, stock=stock)
    response = clean_output(_llama3_chat(prompt, num_predict=1))
    print('response after clean_output: ', response)
    return response

In [1]:
# Run map-reduce on one specific article, selected by its index label.
long_article_row = ext_df.loc[1104431]
result_value = map_reduce_sentiment(
    long_article_row.Article,
    ticker_to_stock[long_article_row.Stock_symbol],
)

NameError: name 'map_reduce_sentiment' is not defined

# Predict sentiment of articles



### Test on list of articles

In [118]:
# Draw five random articles. ``random_state`` is fixed so that a fresh
# Restart & Run All evaluates the same rows.
sample = ext_df.sample(n=5, random_state=123)

for i in sample.index:
    print('\n ID: ', i)
    print('Lsa_summary: ', sample.loc[i, 'Lsa_summary'])
    symbol = sample.loc[i, 'Stock_symbol']
    # Pass the descriptive stock name, as every other call in this notebook does
    # (the raw ticker was passed here before). Fall back to the ticker itself
    # when no name is registered for it.
    stock_name = ticker_to_stock.get(symbol, symbol)
    print(map_reduce_sentiment(sample.loc[i, 'Article'], stock_name))


 ID:  1107890.0
Lsa_summary:  Quarterly results excluded charges pertaining to closure of the GE Healthcare acquisition, planned site shutdowns and revisions to restructuring charges recorded in the second quarter reflecting updated information. A rise in interchange fees and other revenues were, however, partially offset by lower service charges and other customer-related fees as well as net other-than-temporary impairment recognized in earnings. Net charge-off rate rose 16 bps year over year to 1.96%.
Summarized by llama3:  No summary of GE stock was provided in the article. The article only discussed Capital One Financial Corporation (COF)
response after clean_output:  3
3

 ID:  1101955.0
Lsa_summary:  Xu Yanjun, the first Chinese spy extradited to the United States for trial, was convicted in Nov. 2021 by a federal jury on counts of conspiring and attempting to commit economic espionage and trade secret theft. "Today's sentence demonstrates the seriousness of those crimes and the