# Set-up and data preparation

In [1]:
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, util, models
import os

In [2]:
path_data = 'C:\\Users\\tnguyen10\\OneDrive - Deloitte (O365D)\\Documents\\GitHub\\Thesis\\data_structured'

## Data loading

In [6]:
df_report = pd.read_csv(os.path.join(path_data,'report_sentences.csv'))

In [14]:
df_pdf = pd.read_csv(os.path.join(path_data,'article_sentences_pdf.csv'))
df_gnews = pd.read_csv(os.path.join(path_data,'article_sentences_gnews.csv'))
df_article = pd.concat([df_pdf,df_gnews])

In [19]:
df_report = df_report[df_report["word count"] > 5]
df_report = df_report[df_report["word count"] < 100]

In [39]:
df_report.rename(columns = {'fname':'company'},inplace = True)

In [20]:
df_article = df_article[df_article["word count"] > 5]
df_article = df_article[df_article["word count"] < 100]

# Applying the pre-trained models

In [6]:
claim_checker = pipeline(model = "climatebert/environmental-claims")
sem_search = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
nli_model = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"

The claim verification model consists of three stages - claim identification, evidence sentence selection and finally inference analysis. The three models above will help us achieve these three tasks. The ClimateBERT model is pre-trained to detect environmental and climate claims, semantic search will help us identify the 5 most relevant sentences from the corpus and finally the actual model can be used to check the entailment.

First, we apply the ClimateBERT model to identify environmental claims:

In [21]:
df_report['claim'] = df_report['sentence'].map(lambda x: claim_checker(x)[0]['label'])
df_report['claim_score'] = df_report['sentence'].map(lambda x: claim_checker(x)[0]['score'])

In [22]:
df_claims = df_report[df_report['claim']=='yes']

Now we create the sentence embeddings using the semantic search model. These embeddings will be used by the sentence transformers package to find the top 5 most similar sentences from the article corpus.

In [23]:
claims_sent = df_claims['sentence'].tolist()
claims_embeddings = sem_search.encode(claims_sent)
df_claims['embeddings'] = list(claims_embeddings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_claims['embeddings'] = list(claims_embeddings)


In [24]:
article_sent = df_article['sentence'].tolist()
article_embeddings = sem_search.encode(article_sent)
df_article['embeddings'] = list(article_embeddings)

Since this took a while I will also pickle these to save my progress.

In [3]:
# df_article.to_pickle('art.pkl')
# df_claims.to_pickle('claims.pkl')
df_claims = pd.read_pickle('claims.pkl')
df_article = pd.read_pickle('art.pkl')

Sentence transformers has a utility called semantic search which can be used to do this:

In [10]:
for i,row in df_claims.iterrows():
    query_embedding = row['embeddings']
    company = row['company']
    # search only the article embeddings/sentences of the specific company
    corpus_embeddings = df_article[df_article['company']==company]['embeddings'].values
    top_5 = util.semantic_search(torch.Tensor(query_embedding), torch.Tensor(list(corpus_embeddings)), top_k = 5)
    break

Let us now create a new dataframe based on df_claims, which will store the same information as this dataframe, but will also additionally hold the top 5 most similar sentences in a separate column, as well as whether these sentences entail, contradict or are neutral towards each other. I use the MoritzLaurer NLI model for this purpose as it states that it is the best performing NLI model as of June 2022. The code used for the classification is mostly copied from the HuggingFace transformers website and modified for our purposes.

In [4]:
df_entailment = df_claims.copy()
df_entailment.reset_index(inplace = True, drop = True)

In [14]:
#df_entailment = df_entailment.reindex(df_entailment.columns.tolist() + ['top_sentences','predictions','probabilities'], axis=1)  # version > 0.20.0

In [172]:
# df_sample = df_entailment[:5]

We repeat the same code as above but expand upon it further:

In [7]:
tokenizer = AutoTokenizer.from_pretrained(nli_model)
model = AutoModelForSequenceClassification.from_pretrained(nli_model)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [8]:
# making lists to store values for the new columns
top_sentences_column = []
predictions = []
probabilities = []

# we run a for loop for each claim in the df_entailment dataset and check the validity of the claim
for i,row in df_entailment.iterrows():
    # define our query (i.e. claim) and the company it's related to 
    query_embedding = row['embeddings']
    company = row['company']
    # search only the article embeddings/sentences of the specific company
    corpus_embeddings = df_article[df_article['company']==company]['embeddings'].values
    top_5 = util.semantic_search(torch.Tensor(query_embedding), torch.Tensor(list(corpus_embeddings)), top_k = 5)
    # define a list to hold our top sentences and predictions to add these as a new variable after the loop
    hard_predictions = []
    top_sentences = []
    soft_predictions =[]
    for sentence in top_5[0]:
        # the premise is the claim
        premise = row['sentence']
        # the hypothesis is the sentence from the article(identified using the corpus id, which gives us the index of the sentence)
        hypothesis = df_article[df_article['company']==company]['sentence'].values[sentence['corpus_id']]
        tokens = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
        output = model(tokens["input_ids"].to(device))  # device = "cuda:0" or "cpu"
        soft_prediction = torch.softmax(output["logits"][0], -1)
        label_names = ["entailment", "neutral", "contradiction"]
        hard_prediction = label_names[torch.argmax(output["logits"][0], -1).item()]
        # append the different values to the correct list
        top_sentences.append(hypothesis)
        soft_predictions.append(max(torch.softmax(output["logits"][0], -1).tolist()))
        hard_predictions.append(hard_prediction)
    # # now add the different lists as new variables
    # df_sample.at[i,'top_sentences'] = str(top_sentences)
    # df_sample.at[i,'predictions'] = str(hard_predictions)
    # df_sample.at[i,'probabilities'] = str(soft_predictions)
    top_sentences_column.append(top_sentences)
    predictions.append(hard_predictions)
    probabilities.append(soft_predictions)

df_entailment['top_sentences'] = top_sentences_column
df_entailment['predictions'] = predictions
df_entailment['probabilities'] = probabilities

  top_5 = util.semantic_search(torch.Tensor(query_embedding), torch.Tensor(list(corpus_embeddings)), top_k = 5)


KeyboardInterrupt: 

In [None]:
from collections import Counter

def most_frequent_category(categories_list):
    counter = Counter(categories_list)
    return counter.most_common(1)[0][0]

In [None]:
df_entailment['consensus'] = df_entailment['predictions'].apply(most_frequent_category)
df_entailment['consensus'].value_counts()

In [177]:
# eval lets us access it as a list

Unnamed: 0,doc_type,company,sentence,word count,claim,claim_score,embeddings,top_sentences,predictions,probabilities
0,report,abb,customers to deliver annual savings of 100 meg...,13,yes,0.993644,"[-0.6807976, 0.4772148, -0.26226345, -0.029244...",['A key part of our 2030 sustainability strate...,"['neutral', 'neutral', 'neutral', 'contradicti...","[0.9967696666717529, 0.9962621331214905, 0.995..."
1,report,abb,We found that 36 percent of our revenue in 202...,19,yes,0.949681,"[0.23194747, 0.085391074, -0.19298553, 0.04695...",['SN: Company report shows that ABBs greenhous...,"['neutral', 'neutral', 'neutral', 'neutral', '...","[0.9966341853141785, 0.9991693496704102, 0.998..."
2,report,abb,We consider this to be a significant underesti...,33,yes,0.989664,"[-0.14990805, 0.17751908, -0.098325394, 0.1222...",['Theres a whole range of solutions that we ca...,"['entailment', 'neutral', 'neutral', 'neutral'...","[0.9093318581581116, 0.9912990927696228, 0.998..."
3,report,abb,A second goal of our 2030 sustainability strat...,19,yes,0.98979,"[-0.24475533, 0.4377824, -0.21117015, -0.03091...",['A key part of our 2030 sustainability strate...,"['neutral', 'neutral', 'neutral', 'neutral', '...","[0.9479023814201355, 0.9969388246536255, 0.999..."
4,report,abb,"In December 2021, we unveiled a new company-wi...",21,yes,0.992168,"[-0.49132273, -0.4556106, -0.45783857, -0.3235...","['At ABB, weve set a target to take a circular...","['neutral', 'neutral', 'neutral', 'neutral', '...","[0.9995922446250916, 0.8719384670257568, 0.955..."


In [87]:
# tokenizer = AutoTokenizer.from_pretrained(nli_model)
# model = AutoModelForSequenceClassification.from_pretrained(nli_model)

# for sentence in top_5[0]:
#     premise = row['sentence']
#     hypothesis = df_article[df_article['company']==company]['sentence'].values[sentence['corpus_id']]
#     tokens = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
#     output = model(tokens["input_ids"].to(device))  # device = "cuda:0" or "cpu"
#     prediction = torch.softmax(output["logits"][0], -1).tolist()
#     label_names = ["entailment", "neutral", "contradiction"]
#     prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
#     print(prediction)

{'entailment': 0.3, 'neutral': 99.7, 'contradiction': 0.1}
{'entailment': 0.1, 'neutral': 99.6, 'contradiction': 0.3}
{'entailment': 0.1, 'neutral': 99.6, 'contradiction': 0.3}
{'entailment': 0.2, 'neutral': 1.7, 'contradiction': 98.1}
{'entailment': 0.2, 'neutral': 99.2, 'contradiction': 0.6}


In [None]:
# # in case the input sentence is too long:
# input_id_chunks = tokens_plus['input_ids'][0].split(510)
# mask_chunks = tokens_plus['attention_mask'][0].split(510)

# input_id_chunks = list(input_id_chunks)
# mask_chunks = list(mask_chunks)


# chunksize = 512
# for i in range(len(input_id_chunks)):
#     input_id_chunks[i] = torch.cat([
#         torch.Tensor([101]), input_id_chunks[i], torch.Tensor([102])
#     ])
#     mask_chunks[i] = torch.cat([
#         torch.Tensor([1]), mask_chunks[i], torch.Tensor([1])
#     ])
#     pad_len = chunksize - input_id_chunks[i].shape[0]
    
#     if pad_len > 0:
#         input_id_chunks[i] = torch.cat([
#             input_id_chunks[i], torch.Tensor([0]*pad_len)
#         ])
#         mask_id_chunks[i] = torch.cat([
#             mask_id_chunks[i], torch.Tensor([0]*pad_len)
#         ])

In [None]:
# input_ids = torch.stack(input_id_chunks)
# attention_mask = torch.stack(mask_chunks)

# input_dict = {
#     'input_ids':input_ids.long(),
#     'attention_mask': attention_mask.int()
# }
# input_dict

In [None]:
# outputs = model(**input_dict)

# probs = torch.nn.functional.softmax(outputs[0], dim = -1)
# probs

In [None]:
# mean = probs.mean(dim = 0)