# Testing final model

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
tokenizer = AutoTokenizer.from_pretrained("fuchenru/Trading-Hero-LLM")
model = AutoModelForSequenceClassification.from_pretrained("fuchenru/Trading-Hero-LLM")
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Preprocess the input text
def preprocess(text, tokenizer, max_length=128):
    inputs = tokenizer(text, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
    return inputs

# Function to perform prediction
def predict_sentiment(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted label
    predicted_label = torch.argmax(outputs.logits, dim=1).item()

    # Map the predicted label to the original labels
    label_map = {0: 'neutral', 1: 'positive', 2: 'negative'}
    predicted_sentiment = label_map[predicted_label]

    return predicted_sentiment

# Example usage
# user_input = input("Enter the news text: ")
# predicted_sentiment = predict_sentiment(user_input)
stock_news = [
    "Market analysts predict a stable outlook for the coming weeks.",
    "The market remained relatively flat today, with minimal movement in stock prices.",
    "Investor sentiment improved following news of a potential trade deal.",
    "The stock market closed unchanged after a day of mixed trading.",
    "Economic indicators suggest a period of consolidation in the stock market.",
    "The stock market surged today, with record gains across all sectors.",
    "Investors panicked as stock prices plummeted amidst economic uncertainty.",
    "Market analysts predict a stable outlook for the coming weeks.",
    "Tech stocks rallied after a positive earnings report from major companies.",
    "The market experienced a sharp decline, causing concern among investors.",
    "Investor sentiment improved following news of a potential trade deal.",
    "A bear market seems imminent as stock prices continue to fall.",
    "Positive economic indicators boosted investor confidence in the market.",
    "Shares of XYZ Corporation soared after announcing a new product launch.",
    "The market remained relatively flat today, with minimal movement in stock prices."
    "Investors are awaiting key economic data before making major decisions.",
    "Market volatility remained low as investors adopted a wait-and-see approach.",
    "The market showed signs of indecision as trading volume remained light.",
    "Analysts expect the market to trade sideways in the near term.",
    "Investors remained cautious amid uncertainty in global markets."
]


for i in stock_news:
    predicted_sentiment = predict_sentiment(i)
    print("Predicted Sentiment:", predicted_sentiment)
    


2024-05-25 11:12:42.547346: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/839 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predicted Sentiment: neutral
Predicted Sentiment: neutral
Predicted Sentiment: positive
Predicted Sentiment: neutral
Predicted Sentiment: neutral
Predicted Sentiment: positive
Predicted Sentiment: negative
Predicted Sentiment: neutral
Predicted Sentiment: positive
Predicted Sentiment: negative
Predicted Sentiment: positive
Predicted Sentiment: negative
Predicted Sentiment: positive
Predicted Sentiment: positive
Predicted Sentiment: neutral
Predicted Sentiment: negative
Predicted Sentiment: negative
Predicted Sentiment: neutral
Predicted Sentiment: negative


In [3]:
# Faster way
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("fuchenru/Trading-Hero-LLM")
model = AutoModelForSequenceClassification.from_pretrained("fuchenru/Trading-Hero-LLM")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to preprocess the input text
def preprocess(texts, tokenizer, max_length=128):
    inputs = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    return inputs

# Function to perform batch prediction
def batch_predict_sentiment(texts, model, tokenizer, batch_size=16):
    sentiments = []
    label_map = {0: 'neutral', 1: 'positive', 2: 'negative'}
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = preprocess(batch_texts, tokenizer)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        predicted_labels = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        batch_sentiments = [label_map[label] for label in predicted_labels]
        sentiments.extend(batch_sentiments)
    
    return sentiments

# Example usage
stock_news = [
    "Market analysts predict a stable outlook for the coming weeks.",
    "The market remained relatively flat today, with minimal movement in stock prices.",
    "Investor sentiment improved following news of a potential trade deal.",
    "The stock market closed unchanged after a day of mixed trading.",
    "Economic indicators suggest a period of consolidation in the stock market.",
    "The stock market surged today, with record gains across all sectors.",
    "Investors panicked as stock prices plummeted amidst economic uncertainty.",
    "Market analysts predict a stable outlook for the coming weeks.",
    "Tech stocks rallied after a positive earnings report from major companies.",
    "The market experienced a sharp decline, causing concern among investors.",
    "Investor sentiment improved following news of a potential trade deal.",
    "A bear market seems imminent as stock prices continue to fall.",
    "Positive economic indicators boosted investor confidence in the market.",
    "Shares of XYZ Corporation soared after announcing a new product launch.",
    "The market remained relatively flat today, with minimal movement in stock prices.",
    "Investors are awaiting key economic data before making major decisions.",
    "Market volatility remained low as investors adopted a wait-and-see approach.",
    "The market showed signs of indecision as trading volume remained light.",
    "Analysts expect the market to trade sideways in the near term.",
    "Investors remained cautious amid uncertainty in global markets."
]

predicted_sentiments = batch_predict_sentiment(stock_news, model, tokenizer)
for sentiment in predicted_sentiments:
    print("Predicted Sentiment:", sentiment)


Predicted Sentiment: neutral
Predicted Sentiment: neutral
Predicted Sentiment: positive
Predicted Sentiment: neutral
Predicted Sentiment: neutral
Predicted Sentiment: positive
Predicted Sentiment: negative
Predicted Sentiment: neutral
Predicted Sentiment: positive
Predicted Sentiment: negative
Predicted Sentiment: positive
Predicted Sentiment: negative
Predicted Sentiment: positive
Predicted Sentiment: positive
Predicted Sentiment: neutral
Predicted Sentiment: neutral
Predicted Sentiment: negative
Predicted Sentiment: negative
Predicted Sentiment: neutral
Predicted Sentiment: negative


In [6]:
import pandas as pd
df1 = pd.read_parquet('fingpt_sentiment_train-00000-of-00001-7790814d50128a07.parquet')

df1

Unnamed: 0,text,label
0,"Teollisuuden Voima Oyj , the Finnish utility k...",neutral
1,Sanofi poaches AstraZeneca scientist as new re...,neutral
2,Starbucks says the workers violated safety pol...,negative
3,$brcm raises revenue forecast,positive
4,Google parent Alphabet Inc. reported revenue a...,negative
...,...,...
76767,"BP, Statoil, to Withdraw Staff From Algeria Fo...",negative
76768,NEW YORK — A fire broke out late Wednesday at ...,negative
76769,Operating profit margin increased from 11.2 % ...,positive
76770,$vxx adding to position here !,positive


In [7]:
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}

# Apply the mapping to the 'label' column
df1['label'] = df1['label'].map(label_mapping)

df1

Unnamed: 0,text,label
0,"Teollisuuden Voima Oyj , the Finnish utility k...",0
1,Sanofi poaches AstraZeneca scientist as new re...,0
2,Starbucks says the workers violated safety pol...,2
3,$brcm raises revenue forecast,1
4,Google parent Alphabet Inc. reported revenue a...,2
...,...,...
76767,"BP, Statoil, to Withdraw Staff From Algeria Fo...",2
76768,NEW YORK — A fire broke out late Wednesday at ...,2
76769,Operating profit margin increased from 11.2 % ...,1
76770,$vxx adding to position here !,1


In [8]:
df2 = pd.read_parquet('financial_phrasebank-00000-of-00001-46d5c1a7817abe3d.parquet')

# Apply the mapping to the 'label' column
df2['label'] = df2['label'].map(label_mapping)

df2

Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",0
1,Technopolis plans to develop in stages an area...,0
2,With the new production plant the company woul...,1
3,According to the company 's updated strategy f...,1
4,"For the last quarter of 2010 , Componenta 's n...",1
...,...,...
4212,HELSINKI Thomson Financial - Shares in Cargote...,2
4213,LONDON MarketWatch -- Share prices ended lower...,2
4214,Rinkuskiai 's beer sales fell by 6.5 per cent ...,0
4215,Operating profit fell to EUR 35.4 mn from EUR ...,2


In [9]:
df3= pd.read_parquet('auditor_sentiment-00000-of-00001-7c7834e29b677695.parquet')

# Apply the mapping to the 'label' column
df3['label'] = df3['label'].map(label_mapping)

df3

Unnamed: 0,text,label
0,Altia 's operating profit jumped to EUR 47 mil...,1
1,The agreement was signed with Biohit Healthcar...,1
2,"Kesko pursues a strategy of healthy , focused ...",1
3,"Vaisala , headquartered in Helsinki in Finland...",0
4,"Also , a six-year historic analysis is provide...",0
...,...,...
3872,Dubai Nokia has announced the launch of `` Com...,1
3873,"MADISON , Wis. , Feb. 6 - PRNewswire - -- Fisk...",1
3874,The report provides a comprehensive insight in...,1
3875,Pharmaceuticals - Netherlands This brand-new m...,0


In [10]:
df4 = pd.read_parquet('twitter_financial_news_sentiment-00000-of-00001-e4ff9b93e3f0bcb7.parquet')

label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}

# Apply the mapping to the 'label' column
df4['label'] = df4['label'].map(label_mapping)

df4

Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,2
1,$CCL $RCL - Nomura points to bookings weakness...,2
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",2
3,$ESS: BTIG Research cuts to Neutral https://t....,2
4,$FNKO - Funko slides after Piper Jaffray PT cu...,2
...,...,...
9538,The Week's Gainers and Losers on the Stoxx Eur...,0
9539,Tupperware Brands among consumer gainers; Unil...,0
9540,vTv Therapeutics leads healthcare gainers; Myo...,0
9541,"WORK, XPO, PYX and AMKR among after hour movers",0


In [12]:
# Try reading the file with a different encoding
df5 = pd.read_csv('sentiment_finance.csv', encoding='latin1')

# Apply the label mapping
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
df5['label'] = df5['label'].map(label_mapping)
df5

Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",0
1,Technopolis plans to develop in stages an area...,0
2,The international electronic industry company ...,2
3,With the new production plant the company woul...,1
4,According to the company 's updated strategy f...,1
...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,2
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,0
4843,Operating profit fell to EUR 35.4 mn from EUR ...,2
4844,Net sales of the Paper segment decreased to EU...,2


In [16]:
df6 = pd.read_csv('Financial_News_Sentiment.csv', encoding='latin1')
df6 = df6.rename(columns={'text': 'text', 'sentiment': 'label'})
df6['label'] = df6['label'].replace({0: 2, 1: 0, 2: 1})
df6

Unnamed: 0,text,label
0,Global COVID-19 death toll exceeds 4 million.,2
1,"reports 67,208 new COVID-19 cases, 2,330 deaths.",2
2,China reports 23 new COVID-19 cases versus 19 ...,2
3,"India records 91,702 new COVID-19 cases over p...",2
4,sharply raises COVID-19 death toll prompting c...,2
...,...,...
206,U.S. equity futures were trading higher the mo...,1
207,"China, US commerce chiefs to cooperate on hand...",1
208,Stock investors celebrate red-hot five-quarter...,1
209,Wall Streetâs roaring 2021 is as good as it ...,1


In [13]:
df7 = pd.read_csv('data_2.csv')
df7.dropna(how='all')
df7 = df7.rename(columns={'Sentence': 'text', 'Sentiment': 'label'})
# Apply the label mapping
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
df7['label'] = df7['label'].map(label_mapping)
df7

Unnamed: 0,text,label
0,The GeoSolutions technology will leverage Bene...,1
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",2
2,"For the last quarter of 2010 , Componenta 's n...",1
3,According to the Finnish-Russian Chamber of Co...,0
4,The Swedish buyout firm has sold its remaining...,0
...,...,...
5837,RISING costs have forced packaging producer Hu...,2
5838,Nordic Walking was first used as a summer trai...,0
5839,"According shipping company Viking Line , the E...",0
5840,"In the building and home improvement trade , s...",0


In [17]:
combined_df = pd.concat([df1, df2, df3, df4, df5, df6, df7], ignore_index=True)

In [22]:
import numpy as np
import pandas as pd 
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assuming combined_df is already defined
df_train, df_test = train_test_split(combined_df, stratify=combined_df['label'], test_size=0.1, random_state=42)
df_train, df_val = train_test_split(df_train, stratify=df_train['label'], test_size=0.1, random_state=42)
print(df_train.shape, df_test.shape, df_val.shape)

model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')

dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)
dataset_test = Dataset.from_pandas(df_test)

def tokenize_and_count(examples):
    tokens = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
    num_tokens = [len(input_ids) for input_ids in tokens['input_ids']]
    tokens['num_tokens'] = num_tokens
    return tokens

dataset_train = dataset_train.map(tokenize_and_count, batched=True)
dataset_val = dataset_val.map(tokenize_and_count, batched=True)
dataset_test = dataset_test.map(tokenize_and_count, batched=True)

dataset_train.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_val.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_test.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

# Calculate total number of tokens
total_tokens_train = sum(dataset_train['num_tokens'])
total_tokens_val = sum(dataset_val['num_tokens'])
total_tokens_test = sum(dataset_test['num_tokens'])

print("Total tokens in training set:", total_tokens_train)
print("Total tokens in validation set:", total_tokens_val)
print("Total tokens in test set:", total_tokens_test)


(85299, 2) (10531, 2) (9478, 2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/85299 [00:00<?, ? examples/s]

Map:   0%|          | 0/9478 [00:00<?, ? examples/s]

Map:   0%|          | 0/10531 [00:00<?, ? examples/s]

Total tokens in training set: 10918272
Total tokens in validation set: 1213184
Total tokens in test set: 1347968
