# ABOUT:
- Fi_ATSA is a financial-news ATSA (aspect-term sentiment analysis) dataset:
     - it was created from data retrieved through API calls to the eodhistoricaldata financial API
     - followed by manual annotation using doccano
- this notebook:
    - converts Fi_ATSA into the input format required to train the SATSA model

In [1]:
import os
import jsonlines
import pandas as pd
from utils import *

### read annotated Fi_ATSA dataset

In [2]:
# Join the path from separate components so the OS picks the separator; the
# notebook then runs on any platform, not only where "\\" separates directories.
path = os.path.join(os.getcwd(), "tmp", "Fi_ATSA_train.jsonl")
columns = ["data", "label", "opinion_words"]
with jsonlines.open(path, 'r') as reader:
    # one row per JSONL record, restricted to the three columns listed above
    dataset = pd.DataFrame(list(reader), columns=columns)
dataset.head()

Unnamed: 0,data,label,opinion_words
0,Some of the top stocks to play this rise in in...,"[[98, 113, NEU], [141, 158, NEU]]",[]
1,Major tech stocks like Apple Inc. (NASDAQ:AAPL...,"[[23, 28, POS], [49, 58, POS], [89, 93, POS], ...",[finally trading in the green]
2,"”Major semiconductors companies like Intel, Nv...","[[44, 50, NEU], [37, 42, NEU], [55, 58, NEU]]",[]
3,Here is what the fund said:“NVIDIA Corporation...,"[[55, 59, NEU], [28, 46, NEU]]",[]
4,“Biden has pointedly ignored Tesla at every tu...,"[[29, 34, NEG], [87, 89, NEU]]",[pointedly ignored]


In [3]:
# from datasets import ClassLabel, Sequence
import pandas as pd
from IPython.display import display, HTML
def show_elements(dataset, randomize = True, num_samples = 10):
    """Render up to ``num_samples`` rows of ``dataset`` as an HTML table.

    Parameters
    ----------
    dataset : pandas.DataFrame or dataset-like object
        A DataFrame, or any object offering ``shuffle()``, ``select(indices)``
        and ``len()`` (presumably a Hugging Face ``datasets.Dataset`` -- confirm
        against callers).
    randomize : bool, default True
        Shuffle the rows before taking the preview.
    num_samples : int, default 10
        Maximum number of rows to show; fewer are shown when ``dataset`` is
        shorter than this.

    Returns
    -------
    None
        The table is emitted through ``IPython.display.display``.
    """
    if isinstance(dataset, pd.DataFrame):
        if randomize:
            dataset = dataset.sample(frac=1)          # frac=1 -> all rows, new order
        preview = dataset.iloc[:num_samples]          # slicing tolerates short frames
    else:
        if randomize:
            dataset = dataset.shuffle()
        # Never request more rows than exist: selecting indices past the end
        # would fail instead of showing a shorter preview.
        row_count = min(num_samples, len(dataset))
        preview = pd.DataFrame(dataset.select(range(row_count)))

    display(HTML(preview.to_html()))

In [4]:
# preview a shuffled sample of the annotated rows (num_samples is left at its default of 10)
show_elements(dataset,randomize = True)

Unnamed: 0,data,label,opinion_words
419,"Zoom Video Communications, Inc. (NASDAQ:ZM)’s stock price has fallen by two-thirds from its peak of last October, mainly due to market concerns about the decline in revenue growth after the pandemic, the loss of small and medium-sized business customers, and competition from Microsoft Teams, according to the analyst.","[[276, 285, NEU], [0, 25, NEG]]",[stock price has fallen]
420,"Given Buffett is best known for his holding of stakes in “old school” companies like American Express (NYSE:AXP) and Coca-Cola (NYSE:KO), investing heavily in a tech stock back then would’ve seemed unfathomable.","[[85, 101, NEU], [117, 126, NEU]]",[]
5,"Target Raises Pay, Putting Pressure on Amazon, Walmart.","[[39, 45, NEG], [47, 54, NEG], [0, 6, NEU]]",[Putting Pressure]
267,"In the meantime, there are many other semiconductor stocks that offer superior returns, such as Nvidia and AMD.","[[96, 102, POS], [107, 110, POS]]",[superior returns]
106,"Even without that expansion, analysts see a lot of room to increase sales of the company’s flagship software that lets businesses manage and interact with customers, known as CRM.However, Salesforce also faces competition from large companies such as Microsoft Corp. and up-and-comers like Freshworks Inc.","[[188, 198, NEG], [251, 260, NEU]]",[faces competition]
289,"On January 20, JPMorgan analyst Samik Chatterjee lowered his price target on Cisco Systems, Inc. (NASDAQ:CSCO) to $69 from $70 and maintained an Overweight rating on the shares.","[[77, 90, NEG], [105, 109, NEG]]",[lowered his price target]
295,Alphabet’s Google and Meta face record fines for failing to take down content banned in Russia.,"[[0, 8, NEG], [22, 26, NEG], [11, 17, NEG]]",[face record fines]
405,The 7 Best Web 3.0 Stocks to Buy for March 2022Let’s talk about seven recession stocks to buy for strong returns:Pfizer (NYSE:PFE) Walmart (NYSE:WMT),"[[113, 119, POS], [131, 138, POS], [126, 129, POS], [145, 148, POS]]",[buy for strong returns]
259,Yahoo Finance's Dan Howley discusses how Intel and AMD have reportedly halted some processor sales to Russia and how other tech companies are responding to the Russian attack on Ukraine.,"[[41, 46, NEG], [51, 54, NEG]]",[halted some processor sale]
319,"Five of them are — Marathon Oil Corp. MRO, Occidental Petroleum Corp. OXY, Exxon Mobil Corp. XOM, Tesla Inc.","[[19, 31, NEU], [43, 63, NEU], [98, 103, NEU], [75, 86, NEU]]",[]


In [5]:
# display the frame; pandas abbreviates the output to the first and last rows
dataset

Unnamed: 0,data,label,opinion_words
0,Some of the top stocks to play this rise in in...,"[[98, 113, NEU], [141, 158, NEU]]",[]
1,Major tech stocks like Apple Inc. (NASDAQ:AAPL...,"[[23, 28, POS], [49, 58, POS], [89, 93, POS], ...",[finally trading in the green]
2,"”Major semiconductors companies like Intel, Nv...","[[44, 50, NEU], [37, 42, NEU], [55, 58, NEU]]",[]
3,Here is what the fund said:“NVIDIA Corporation...,"[[55, 59, NEU], [28, 46, NEU]]",[]
4,“Biden has pointedly ignored Tesla at every tu...,"[[29, 34, NEG], [87, 89, NEU]]",[pointedly ignored]
...,...,...,...
487,Wells Fargo & Company (NYSE:WFC) and JPMorgan ...,"[[0, 11, POS], [37, 51, POS], [64, 67, POS], [...",[among the gainers]
488,It could give Intel’s (NASDAQ:INTC) soon-to-be...,"[[14, 19, NEU], [30, 34, NEU]]",[]
489,"Market check: Nasdaq, Russell 2000 turn positi...","[[14, 20, POS], [50, 55, POS], [22, 34, POS]]","[turn positive, stock jumps]"
490,"Western tech companies, including Facebook-own...","[[72, 80, NEU], [49, 63, NEU], [87, 93, NEU]]",[]


### split Fi_ATSA into train and test/dev sets
- the same held-out test set is used with and without EDA augmentation, so the two settings can be compared fairly

In [6]:
from sklearn.model_selection import train_test_split
# fraction of rows held out for the test/dev set (passed to train_test_split as test_size)
test_size = 0.3
# seed passed to train_test_split as random_state so the split is repeatable
random_state=200

In [9]:
# Hold out `test_size` of the rows; `random_state` fixes which rows land in each split.
train, test = train_test_split(dataset,
                               test_size=test_size,
                               random_state=random_state)
# Both splits inherit the shuffled row labels of `dataset`. Renumber each one
# 0..n-1 so later cells can address rows by label or by position interchangeably.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
# sanity check: the two splits partition the dataset
assert len(train) + len(test) == len(dataset)


In [11]:
# number of sentences in each split
len(train),len(test)

(344, 148)

In [13]:
# inspect the training split
train

Unnamed: 0,data,label,opinion_words
0,Today's Research Daily features new research r...,"[[83, 88, NEU], [103, 107, NEU], [134, 151, NEU]]",[]
1,"ETFs offered by Fidelity Investments, Invesco ...","[[38, 45, NEG], [50, 68, NEG]]",[attracted minimal or negative flows]
2,Ken Fisher’s Fisher Asset Management is the mo...,"[[76, 89, NEU], [104, 108, NEU]]",[]
3,"For the quarter, revenue increased 37% year ov...","[[121, 128, POS], [150, 155, NEU]]",[revenue increased 37% year over year to $1.3 ...
4,"Intel, AMD reportedly halt processor sales to ...","[[0, 5, NEG], [7, 10, NEG]]",[halt processor sale]
...,...,...,...
339,Here is what the fund said:“NVIDIA Corporation...,"[[28, 46, POS], [55, 59, POS]]",[dominant supplier of Graphics Processing Units]
340,"As of Q3 2021, Tesla, Inc. (NASDAQ:TSLA) is Wo...","[[15, 20, NEU], [138, 152, NEU], [175, 189, NEU]]",[]
341,Close up of Intel sign at their San Jose campu...,"[[12, 17, NEG], [279, 285, NEU], [335, 338, NE...",[has been struggling to right its own ship]
342,"Etsy, Inc. (NASDAQ:ETSY)Jim Cramer in his show...","[[0, 4, NEU], [19, 23, NEU], [67, 71, POS], [8...",[is a winner of the COVID-era]


### perform EDA (data augmentation) on the training set
- EDA is not applied to the test set, so evaluation stays comparable with the non-augmented run

In [15]:
from eda import eda
# forwarded to eda() as num_aug for every training sentence
num_aug = 16
# single strength value forwarded to eda() as alpha_sr, alpha_ri, alpha_rs and p_rd
alpha = 0.05

In [25]:
# Augment every training sentence and collect the per-sentence results in a list.
# The frame is assembled once at the end: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop re-copies all earlier rows on
# every iteration.
aug_frames = []
for sentence, labels in zip(train["data"], train["label"]):
    aug_samples = eda(sentence, labels, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug, aspect_identifier = "INVULNERABLE_ASPECT")
    # assumes eda() returns rows of (sentence, labels) -- TODO confirm against eda.py
    aug_frames.append(pd.DataFrame(aug_samples))
eda_dataset = pd.concat(aug_frames)
eda_dataset.columns = ["data","label"]
# Shuffle with the notebook-wide seed so the row order is repeatable for a given
# eda() output (eda()'s own randomness is not controlled here), then renumber
# the rows 0..n-1.
eda_dataset = eda_dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)
eda_dataset.sample(5)

Unnamed: 0,data,label
4670,"Like Apple Inc. ( NASDAQ : AAPL ) , Microsoft ...","[[5, 10, POS], [36, 45, POS], [82, 86, POS], [..."
3336,MKM Partners analyst Rohit Kulkarni on Februar...,"[[78, 97, POS], [118, 122, POS]]"
4551,"Johnson & Johnson , while trailing Eli Lilly a...","[[0, 17, POS], [35, 56, POS], [66, 69, POS], [..."
3575,"On November 24 leontyne price , Craig-Hallum a...","[[88, 103, POS], [120, 124, POS]]"
31,Story continuesHere is some of what Microsoft ...,"[[36, 45, NEU], [62, 66, NEU], [186, 195, NEU]]"


### process Fi_ATSA 

In [26]:
# annotation tag -> integer class id stored in the "label" column of the processed samples
label_mapping = {"NEG":0,"NEU":1,"POS":2}

In [27]:
def process(dataset):
    """Expand a sentence-level frame into one row per annotated aspect span.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide a "data" column (sentence text) and a "label" column
        (iterable of ``[start, end, tag]`` entries). Any row index is accepted.

    Returns
    -------
    pandas.DataFrame
        Columns "text", "span" and "label", one row per entry of every "label"
        list, in the row order of ``dataset``. An empty input yields an empty
        frame that still carries the three columns.
    """
    output = []
    # Walk the two columns directly rather than looking rows up by label, so
    # frames whose index is not 0..n-1 (e.g. an un-reset train/test split) are
    # processed in row order instead of raising KeyError.
    for text, span_and_labels in zip(dataset["data"], dataset["label"]):
        output.extend(_generate_samples(text, span_and_labels, label_mapping))
    return pd.DataFrame(output, columns=["text", "span", "label"])

# given annotated ATSA sample, generate a list of SATSA samples
def _generate_samples(text, span_and_labels, label_mapping):
    output = []
    for sal in span_and_labels:
        output.append({"text": text, "span":(int(sal[0]),int(sal[1])), "label": label_mapping[sal[2]] })
    return output

### Input for SATSA 
- text: a sentence e.g "The decor is not special at all but their.."
- span: the span of the aspect e.g (4, 9)	
- label: the sentiment of the aspect e.g 0

note: label_mapping = {"NEG": 0, "NEU": 1, "POS": 2} (negative, neutral, positive)

In [40]:
# expand each sentence-level frame into one row per (sentence, aspect span)
# whole annotated dataset
processed_full = process(dataset)
# training split, no augmentation
processed_train = process(train)
# held-out split
processed_test = process(test)
# augmented training split
processed_eda = process(eda_dataset)

In [42]:
# class balance over all aspect spans (0 = NEG, 1 = NEU, 2 = POS)
processed_full.label.value_counts()

1    724
2    361
0    359
Name: label, dtype: int64

In [35]:
# class balance of the training split (0 = NEG, 1 = NEU, 2 = POS)
processed_train.label.value_counts()

1    486
2    256
0    229
Name: label, dtype: int64

In [36]:
# class balance of the held-out split (0 = NEG, 1 = NEU, 2 = POS)
processed_test.label.value_counts()

1    238
0    130
2    105
Name: label, dtype: int64

In [37]:
# class balance of the augmented training split (0 = NEG, 1 = NEU, 2 = POS)
processed_eda.label.value_counts()

1    8262
2    4352
0    3893
Name: label, dtype: int64

### save 

In [43]:
# Persist the processed frames as pickles.
# TRAIN_PATHS / DEV_PATHS are not defined in this notebook; presumably they come
# from `from utils import *` -- confirm.
# NOTE(review): processed_train is written under both "Fi_ATSA" and
# "Fi_ATSA_train", while processed_full is computed above but never saved.
# Confirm whether "Fi_ATSA" was meant to hold the full dataset or is
# deliberately the training split (which keeps test rows out of it).
processed_train.to_pickle(TRAIN_PATHS["Fi_ATSA"])
processed_train.to_pickle(TRAIN_PATHS["Fi_ATSA_train"])
processed_test.to_pickle(DEV_PATHS['Fi_ATSA_test'])
processed_eda.to_pickle(TRAIN_PATHS["Fi_ATSA_eda"])

### Ignore below

In [None]:
# get opinion words from label column
def _get_opinion_words(data, label):
    output = []
    for l in label:
        if l[2]=="opinion words":
            opinion_words = data[l[0]:l[1]]
            output.append(opinion_words)
    return output
# remove opinion words from label column
def _remove_opinion_words(data, label):
    output = []
    for l in label:
        if l[2] not in ["POS","NEG","NEU"]:
            continue
        output.append(l)
    return output

# read annotated data
# Join the path from separate components so the OS picks the separator; the
# cell then runs on any platform, not only where "\\" separates directories.
path = os.path.join(os.getcwd(), "tmp", "Fi_ATSA_train.jsonl")
columns = ["data", "label", "opinion_words"]
with jsonlines.open(path, 'r') as reader:
    dataset = pd.DataFrame(list(reader), columns=columns)
# Order matters: the opinion-word substrings are read from the raw label list,
# so they must be extracted before that list is reduced to POS/NEG/NEU entries.
dataset['opinion_words'] = dataset.apply(lambda row: _get_opinion_words(row['data'], row['label']),axis=1)
dataset['label'] = dataset.apply(lambda row: _remove_opinion_words(row['data'], row['label']),axis=1)
dataset.head()

In [None]:
# Scratch cell: load the MAMS ATSA training XML and parse it.
# NOTE(review): the path is absolute and specific to one machine, so this cell
# will not run elsewhere.
path = r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\NLPCC 2020 Shared Task 2 Guideline  Multi-Aspect-based Multi Sentiment Analysis (MAMS)\Dataset_MAMS\ATSA\train.xml"
# NOTE(review): no encoding is given, so the platform default is used -- confirm
# it matches the file.
with open(path, 'r') as f:
    data = f.read()
    # `data` is rebound from the raw text to the parsed document.
    # NOTE(review): BeautifulSoup is not imported in any visible cell; presumably
    # it arrives via `from utils import *` -- confirm.
    data = BeautifulSoup(data, "xml")