## Text Augmentation Using an LLM

In [4]:
import re
import os
import json
import torch
import pickle
import random
import requests
import numpy as np
import pandas as pd
import torch
import transformers
from accelerate import Accelerator
from tqdm import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer

In [5]:
# Load the FinESG training set. The encoding is given explicitly so the news
# text is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open('data/FinESG-2023_Train_EN.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One row per training record; later cells read the 'news_title',
# 'news_content' and 'ESG_label' columns.
df = pd.DataFrame(data)

# Spreadsheet of label descriptions; a later cell converts it to a dict with
# dict(des.values).
des = pd.read_excel('./label_description.xlsx')

In [6]:
# Distinct values of the ESG_label column in the training frame.
labels = df.loc[:, 'ESG_label'].unique()

In [7]:
# Turn the description sheet into a lookup table. Each row of `des` must hold
# exactly two cells: the first becomes the key, the second the value.
des2 = {key: value for key, value in des.values}

In [8]:
# Display the label -> description lookup table built in the previous cell.
des2

{'Access to Finance': 'It is about their efforts to expand financial services to historically underserved markets, including small-business lending and the development of innovative distribution channels.',
 'Health & Safety': 'It is about their management of workplace safety and the workplace safety standards in the industries and regions in which they operate.',
 'Tax Transparency': 'It is about their estimated corporate tax gap, revenue-reporting transparency and their involvement in tax-related controversies.',
 'Business Ethics': 'It is about their oversight and management of business ethics issues such as fraud, executive misconduct, corrupt practices, money laundering, or antitrust violations.',
 'Electronic Waste': 'It is about their production of electronic waste, their potential exposure to e-waste regulations as well as their efforts at product collection and recycling.',
 'Opportunities in Renewable Energy': 'It is about their efforts to develop renewable power generation c

In [9]:
def get_label_desc(label, des2):
    """Look up the description stored for ``label``.

    Args:
        label: Key to look up.
        des2: Mapping from label to description text.

    Returns:
        The value stored under ``label`` in ``des2``.
    """
    return des2[label]

def get_random_samples(df):
    """Pick two distinct random rows of ``df`` and return their fields.

    Rows are selected by position, so the function works for any index
    (for example a filtered frame whose index was not reset), not only for a
    default 0..n-1 index.

    Args:
        df: DataFrame with 'news_title', 'news_content' and 'ESG_label'
            columns and at least two rows.

    Returns:
        A 6-tuple ``(title1, content1, label1, title2, content2, label2)``.

    Raises:
        ValueError: if ``df`` has fewer than two rows (raised by
            ``random.sample``).
    """
    # Two different row positions, drawn without replacement.
    s1, s2 = random.sample(range(len(df)), 2)
    fields = ('news_title', 'news_content', 'ESG_label')
    # .iloc is positional, matching how s1/s2 were drawn; label-based
    # df[col][i] would fail or pick the wrong row on a non-default index.
    return tuple(df[col].iloc[pos] for pos in (s1, s2) for col in fields)

def get_prompt(title1, text1, label1,title2, text2, label2, des2):
    """Build the few-shot prompt given to the language model for completion.

    The prompt consists of an instruction line (which includes the name and
    description of ``label1``), two worked examples, and an open
    "// #ESG News headline:" stub for the model to continue.

    Args:
        title1, text1, label1: Headline, body and key issue of the first example.
        title2, text2, label2: Headline, body and key issue of the second example.
        des2: Mapping from key-issue label to its description. Only the
            description of ``label1`` is looked up.

    Returns:
        The prompt as a single string containing three newline characters of
        its own (titles and bodies are inserted verbatim).

    Raises:
        KeyError: if ``label1`` is not a key of ``des2``.
    """
    description = "Each item in the following list should contain a #ESG News headline, #ESG news and the related #ESG key issues. #ESG key issues are based on MSCI ESG rating guidelines."
    # Same lookup that get_label_desc(label1, des2) performs.
    label_desc = f"#ESG Key issue is {label1}. " + des2[label1]
    # Separate the two sentences with a space; they were previously run
    # together as "...guidelines.#ESG Key issue is ...".
    description = description + " " + label_desc
    prompt = (f"{description}\n"
            f"// #ESG News headline: {title1} // #ESG News: {text1} // #(ESG key issue: {label1})\n"
            f"// #ESG News headline: {title2} // #ESG News: {text2} // #(ESG key issue: {label2})\n"
            f"// #ESG News headline:")
    return prompt

In [10]:
# For every label, 100 minus the number of training rows that carry it; the
# generation loop uses this as the number of texts to create for that label.
label_counts = df['ESG_label'].value_counts()
labels_and_numbers = dict(100 - label_counts)

In [11]:
# Display the per-label value (100 minus the label's row count).
labels_and_numbers

{'Packaging Material & Waste': 19,
 'Board': 26,
 'Carbon Emissions': 27,
 'Financing Environmental Impact': 29,
 'Responsible Investment': 30,
 'Opportunities in Renewable Energy': 32,
 'Opportunities in Clean Tech': 38,
 'Human Capital Development': 38,
 'Product Carbon Footprint': 45,
 'Biodiversity & Land Use': 47,
 'Consumer Financial Protection': 53,
 'Opportunities in Green Building': 56,
 'Ownership & Control': 58,
 'Community Relations': 60,
 'Business Ethics': 66,
 'Climate Change Vulnerability': 70,
 'Raw Material Sourcing': 71,
 'Water Stress': 73,
 'Toxic Emissions & Waste': 74,
 'Pay': 78,
 'Opportunities in Nutrition & Health': 78,
 'Health & Demographic Risk': 78,
 'Access to Finance': 80,
 'Access to Health Care': 81,
 'Accounting': 82,
 'Chemical Safety': 82,
 'Privacy & Data Security': 86,
 'Access to Communications': 87,
 'Electronic Waste': 88,
 'Supply Chain Labor Standards': 89,
 'Product Safety & Quality': 90,
 'Labor Management': 94,
 'Controversial Sourcing': 

In [12]:
# Checkpoint identifier shared by the tokenizer and the model so the two
# cannot drift apart.
MODEL_NAME = 'EleutherAI/pythia-12b'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# Hand the model to an Accelerator instance and keep the prepared object under
# the same name `model`.
# NOTE(review): presumably this places the model on the GPU, since the
# generation cell moves its inputs there with .cuda() - confirm.
# NOTE(review): re-running this cell prepares the already-prepared model again.
accelerate = Accelerator()
model = accelerate.prepare(model)

In [14]:
import warnings

# Suppress every warning for the rest of the session (no category or module
# filter is given), which keeps the generation loop's output readable.
warnings.filterwarnings(action="ignore")

# clear_output is used by the generation loop to wipe the previous progress line.
from IPython.display import clear_output

# os is already imported in the first cell; this repeat is redundant but harmless.
import os

# Generate synthetic ESG news items for every label and write one parquet file
# per label. Labels whose file already exists are skipped, so an interrupted
# run can be resumed by re-running this cell.
for label_name in tqdm(labels_and_numbers):
    file_dir = './data/Pythia/' + label_name + '_augmented.parquet'
    if os.path.exists(file_dir):
        continue

    # Make sure the target directory exists before spending time on
    # generation; otherwise the final to_parquet call could fail and discard
    # everything produced for this label.
    os.makedirs(os.path.dirname(file_dir), exist_ok=True)

    n = labels_and_numbers[label_name]
    df2 = df[df['ESG_label'] == label_name].reset_index(drop = True)
    ESG_text = []
    # Counter of accepted generations (named so it does not shadow the
    # builtin `iter` for the rest of the notebook).
    n_generated = 0
    while n_generated < n:
        print('***', label_name, n_generated/n*100, '% ***')
        # select two random samples of this label from the training set
        title1, text1, label1,title2, text2, label2 = get_random_samples(df2)
        # create the prompt
        prompt = get_prompt(title1, text1, label1, title2, text2, label2,des2)

        # generate a continuation with the causal language model
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
        # NOTE(review): max_length presumably counts the prompt tokens as well
        # - confirm that prompts stay well below 500 tokens.
        gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=500,)

        # Decode only the tokens that follow the prompt. Picking a fixed line
        # index out of the full decoded text returns prompt text instead of
        # the generated item whenever a title or body contains a newline.
        prompt_length = input_ids.shape[1]
        continuation = tokenizer.decode(gen_tokens[0][prompt_length:])
        first_line = continuation.split('\n')[0]

        # Same record layout as before: the open headline stub from the prompt
        # followed by the model's first generated line.
        esg_text = '// #ESG News headline:' + first_line + '\n'
        ESG_text.append(esg_text)
        n_generated += 1
        torch.cuda.empty_cache()
        clear_output(wait=True)

    to_save = pd.DataFrame(columns = ['ESG_text'])
    to_save['ESG_text'] = ESG_text
    to_save.to_parquet(file_dir)

   

100%|██████████| 33/33 [3:41:37<00:00, 402.95s/it]
