In [None]:
# Input: CSV file with the already preprocessed articles
# (alternative input: '../data_preparation/whole_dataset.csv').
PATH_TO_DATA = "../data_preparation/testset_extended.csv"
# Output: CSV file the dataframe with the ABSA predictions is written to.
PATH_TO_SAVE_RESULTS = "./test_predicitons_few_models/testset_extended_results_absa_deberta.csv"

If you wish to test it yourself, the only thing you need to specify is the location of the CSV file with the already preprocessed data. How to prepare the preprocessed data from the raw data is described in the README.md file in the data_preparation directory, which also contains all the necessary scripts.

It is important to note that this data already has the named entities (NERs) extracted and saved in a separate column.

# Imports

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, pipeline

from lib.sentiment_analysis_utils import combine_lede_and_text, remove_text_formatting, read_all_news_in_dir
import pandas as pd
import os
import time
import ast
import torch

In [None]:
# Hugging Face checkpoint used for aspect-based sentiment analysis (ABSA).
MODEL_NAME = "yangheng/deberta-v3-large-absa-v1.1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Use the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) instead of failing on machines without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# return_all_scores=True asks the pipeline for the score of every label,
# not only the best one.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=pipeline_device,
)

In [None]:
# Report whether CUDA is usable, then build the matching torch device handle.
print(torch.cuda.is_available())
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# To also show the GPU model name: print(torch.cuda.get_device_name(0))

The computations will be run using GPU in our case.

# Running ABSA (aspect based sentiment analysis)

In [None]:
# Load the preprocessed articles (alternative input: '../data_preparation/testset.csv').
df = pd.read_csv(filepath_or_buffer=PATH_TO_DATA)

Read the data prepared for the ABSA task.

In [None]:
# Preview only the first rows; that is enough to check that the data loaded.
df.head()

In [None]:
# Prepare classifying pipeline.
# NOTE: the lines below are intentionally disabled and kept for reference only;
# the tokenizer, model and classifier for this same checkpoint are already
# created in the model-loading cell earlier in this notebook.
# tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-large-absa-v1.1")
# model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-large-absa-v1.1")
#
# classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True, device=0)

We will be using the DeBERTa model for the ABSA task, which is available off the shelf via HuggingFace.

The code below runs predictions on the provided dataframe with the model specified above. It is run for each aspect separately (article keywords + extracted NERs).

In [None]:
# Number of rows to process. None processes the whole dataframe; set a small
# integer (e.g. 1) for a quick smoke test. This replaces an unconditional
# `break` that silently limited every run to the first row.
MAX_ROWS = None


def parse_aspects(raw):
    """Parse one CSV cell holding a stringified Python list of aspects.

    Args:
        raw: Cell value; expected to be a string such as "['a', 'b ']".
            A missing cell (None or float NaN) is treated as "no aspects".

    Returns:
        List of aspect strings with surrounding whitespace stripped.

    Raises:
        ValueError / SyntaxError: propagated from ast.literal_eval when the
            cell is present but is not a valid Python literal.
    """
    if raw is None or (isinstance(raw, float) and pd.isna(raw)):
        return []
    return [aspect.strip() for aspect in ast.literal_eval(raw)]


def classify_aspects(text, aspects):
    """Run the module-level `classifier` on `text` once per aspect.

    Args:
        text: Article text passed as the first pipeline input.
        aspects: Iterable of aspect strings, each passed as `text_pair`.

    Returns:
        Dict mapping every aspect to the raw output of `classifier`.
    """
    # dict.fromkeys drops repeated aspects while keeping their order; a repeated
    # aspect would end up under the same key anyway, so this only avoids
    # classifying the same (text, aspect) pair more than once.
    return {aspect: classifier(text, text_pair=aspect) for aspect in dict.fromkeys(aspects)}


# Create the result columns up front with object dtype so that a dict can be
# stored in a single cell. Rows that are not processed keep NaN.
for result_column in ('keywords_sentiment', 'ner_sentiment'):
    df[result_column] = pd.Series(index=df.index, dtype=object)

start = time.time()
for n_done, (i, row) in enumerate(df.iterrows()):
    if MAX_ROWS is not None and n_done >= MAX_ROWS:
        break

    # `.at` writes exactly one cell, so the dict is stored as-is; the earlier
    # `.loc[i, col] = [dict]` form depended on pandas unwrapping a one-element
    # list and on up-casting a freshly created NaN column.
    df.at[i, 'keywords_sentiment'] = classify_aspects(row.whole_text, parse_aspects(row.keywords_lower))
    df.at[i, 'ner_sentiment'] = classify_aspects(row.whole_text, parse_aspects(row.ner_list))
stop = time.time()
print('Took', (stop - start)/60, 'minutes')

4.15 minutes deberta

The whole analysis of slightly more than 1700 articles took 150 minutes on a GPU. This is because many NERs were extracted for a lot of the articles, so a single article had to be analyzed multiple times, once for each aspect (each keyword or NER).

In [None]:
# Show every column and allow wide lines so printed frames are not wrapped early.
pd.options.display.max_columns = None
pd.options.display.width = 1000

In [None]:
# Print the keyword-sentiment cell of the row labelled 0 as a one-row, one-column frame.
print(df[['keywords_sentiment']].loc[[0]])

In [None]:
# Preview only the first rows, including the sentiment columns, instead of the whole frame.
df.head()

In [None]:
# Make sure the target directory exists before writing, so that a long
# prediction run is not lost to a missing-directory error at save time.
results_dir = os.path.dirname(PATH_TO_SAVE_RESULTS)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# The index is written as the first column, exactly as before.
df.to_csv(PATH_TO_SAVE_RESULTS)