Pipeline for classifying the social media (SM) data with BERT models.

In [None]:
# Colab-only setup: mount Google Drive so the project folder becomes visible to the runtime.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory. The rest of the notebook uses
# relative paths ('./data/...', 'CB_sentence', 'full_soc', 'full_gov') that are
# resolved against it.
%cd /content/drive/MyDrive/Master Thesis
# List the folder contents as a quick check that the mount worked.
%ls

Mounted at /content/drive
/content/drive/MyDrive/Master Thesis
candidate_sentences.csv               FindCandidates_V2_onlyESG.ipynb
candidate_sentences_pred_CLUSTER.csv  [0m[01;34mfull_gov[0m/
candidate_sentences_pred.csv          [01;34mfull_soc[0m/
[01;34mCB_sentence[0m/                          PreProcessing.ipynb
ClassificationPipeline_AD.ipynb       regressionAnalysis.ipynb
[01;34mdata[0m/                                 SP500.ipynb


Preprocess Instagram data

In [None]:
!pip install transformers
!pip install transformers datasets
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
Inst

In [None]:
import pandas as pd
import numpy as np
import glob
from datetime import date, datetime
import os

import datasets
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification

In [None]:
######## PREDICT ESG and Sentiment

# Helper for the transformation of the output
def label_to_num(inp):
  """Translate a raw classifier label into the code stored in the result tables.

  Negative-type labels map to -1, neutral-type labels (and 'LABEL_0') to 0,
  positive-type labels (and 'LABEL_1') to 1, and the three ESG category names to
  their initial letter. Any other label yields None.
  """
  label_groups = (
      (('risk', 'Negative', 'negative'), -1),
      (('LABEL_0', 'neutral', 'Neutral'), 0),
      (('LABEL_1', 'opportunity', 'Positive', 'positive'), 1),
      (('Environmental',), 'E'),
      (('Social',), 'S'),
      (('Governance',), 'G'),
  )

  # First matching group wins; the groups are disjoint, so order does not matter.
  for labels, code in label_groups:
    if inp in labels:
      return code

  # Unknown label: no code available.
  return None

def insert_predictions(pips, tasks, sentences, types):
  """Run every pipeline on one text column and store the coded labels as new columns.

  Args:
    pips: sequence of callables; each is called as
      pip(texts, batch_size=64, padding=True, truncation=True) and must return one
      dict with a "label" key per text.
    tasks: column names, parallel to `pips`; tasks[i] receives the output of pips[i].
    sentences: DataFrame holding the text column. It is modified IN PLACE.
    types: name of the text column to classify (e.g. 'Description').

  Returns:
    The same `sentences` object, with one added (or overwritten) column per task.

  Raises:
    ValueError: if there are fewer task names than pipelines. This is checked
      before any column is written, so the frame is never left half-updated.
  """
  if len(tasks) < len(pips):
    raise ValueError(
        "insert_predictions needs one task name per pipeline (got %d pipelines, %d tasks)"
        % (len(pips), len(tasks)))

  # The input texts are the same for every pipeline, so build the list once.
  texts = [str(x) for x in sentences[str(types)].to_numpy()]

  for cb_trained, task in zip(pips, tasks):
    # classify and insert in dataframe
    classifications = cb_trained(texts, batch_size=64, padding=True, truncation=True)
    sentences[task] = [label_to_num(x["label"]) for x in classifications]

  return sentences

def _build_pipeline(task, model, tokenizer, device):
  """Wrap an already loaded model/tokenizer pair in a pipeline with max_length=512."""
  return pipeline(task, model=model, tokenizer=tokenizer, max_length = 512, device=device)

def pipelines(device=0):
  """Load the six classification pipelines used by this notebook.

  Args:
    device: forwarded unchanged to every `transformers.pipeline` call. The default 0
      is the value that used to be hard-coded (first CUDA device); pass -1 to run
      on CPU when no GPU runtime is attached.

  Returns:
    (pips, tasks): two parallel lists. tasks[i] is the column name under which the
    output of pips[i] is stored:
    ["env", "soc", "gov", "ESGFin", "sentiment", "sentimentGen"].
  """
  # Tokenizer of the ClimateBERT base model, shared by the three local checkpoints below.
  model_name = 'climatebert/distilroberta-base-climate-f'
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  # Env / Soc / Gov: models stored in local folders, resolved against the working directory.
  model_E = AutoModelForSequenceClassification.from_pretrained('CB_sentence')
  pip_E = _build_pipeline('sentiment-analysis', model_E, tokenizer, device)
  model_S = AutoModelForSequenceClassification.from_pretrained('full_soc')
  pip_S = _build_pipeline('sentiment-analysis', model_S, tokenizer, device)
  model_G = AutoModelForSequenceClassification.from_pretrained('full_gov')
  pip_G = _build_pipeline('sentiment-analysis', model_G, tokenizer, device)

  # ESG category (task "ESGFin"): FinBERT-ESG, loaded with four output labels.
  PATH = 'yiyanghkust/finbert-esg'
  tokenizer = BertTokenizer.from_pretrained(PATH)
  model_senF = BertForSequenceClassification.from_pretrained(PATH, num_labels = 4)
  pip_ESGF = _build_pipeline('text-classification', model_senF, tokenizer, device)

  # Sentiment (task "sentiment"): ClimateBERT sentiment model.
  # NOTE(review): the tokenizer is loaded from `model_name` (the base model), not from
  # PATH. Presumably both share one vocabulary - confirm.
  PATH = 'climatebert/distilroberta-base-climate-sentiment'
  model_sen = AutoModelForSequenceClassification.from_pretrained(PATH)
  tokenizer = AutoTokenizer.from_pretrained(model_name, max_len=512)
  pip_sen = _build_pipeline('sentiment-analysis', model_sen, tokenizer, device)

  # General sentiment (task "sentimentGen"): Twitter RoBERTa sentiment model.
  PATH = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
  tokenizer = AutoTokenizer.from_pretrained(PATH)
  model_senE = AutoModelForSequenceClassification.from_pretrained(PATH)
  pip_senE = _build_pipeline('sentiment-analysis', model_senE, tokenizer, device)

  pips = [pip_E, pip_S, pip_G, pip_ESGF, pip_sen, pip_senE]
  tasks = ["env", "soc", "gov", "ESGFin", "sentiment", "sentimentGen"]
  return pips, tasks

def predictESG(raws):
  """Classify every raw CSV and store the predictions per company.

  For each path in `raws`:
    1. read the CSV (retrying with lineterminator='\n' if the first read fails),
    2. score the 'Description' column with all six pipelines,
    3. score the 'PostsComments' column with the two sentiment pipelines (tasks[4:]),
    4. write ./data/Results/<name>/predDescript.csv and the per-post mean of the
       comment rows to ./data/Results/<name>/predCmmtsMean.csv.

  Args:
    raws: iterable of CSV paths. <name> is the file name without its last nine
      characters ('Insta.csv' for the files globbed in this notebook).

  Files that cannot be read by either attempt are reported and skipped.
  """
  pips, tasks = pipelines()

  # Comments are only scored with the two sentiment pipelines.
  pipsCmmts = pips[4:]
  tasksCmmts = tasks[4:]

  # analyse every raw
  for raw in raws:
    # Use the file name rather than a fixed-offset slice of the whole path, so the
    # result no longer depends on the './data/' prefix being exactly 7 characters.
    name = os.path.basename(raw)[:-9]
    print("--------------- Starting with with %s ---------------"%name)

    try:
      sentences = pd.read_csv(raw, index_col=0)
    except Exception:
      # `except Exception` instead of a bare except, so Ctrl-C still interrupts the run.
      print("Used lineterminator")
      try:
        sentences = pd.read_csv(raw, lineterminator='\n', index_col=0)
      except Exception:
        print(f"Did not work for {raw}!")
        continue

    # insert_predictions writes into `sentences` in place and returns that same
    # object. Without the copy, the comment pass below would overwrite the
    # description-based 'sentiment'/'sentimentGen' columns before they are saved.
    sentencesDescript = insert_predictions(pips, tasks, sentences, 'Description').copy()
    sentencesCmmts = insert_predictions(pipsCmmts, tasksCmmts, sentences, 'PostsComments')

    # store file (create the company folder on first use)
    out_dir = "./data/Results/%s"%name
    os.makedirs(out_dir, exist_ok=True)
    sentencesDescript.to_csv("./data/Results/%s/predDescript.csv"%name)

    # numeric_only=True: the frame also holds text columns (e.g. 'ESGFin' with
    # 'E'/'S'/'G'), for which a mean is undefined and raises on pandas >= 2.0.
    f = sentencesCmmts.groupby(['PostIndex', 'Description', 'PostDate']).mean(numeric_only=True)
    f.to_csv("./data/Results/%s/predCmmtsMean.csv"%name)
    print("--------------- Done with %s ---------------"%name)

In [None]:
# All raw CSV exports directly under ./data, in alphabetical order.
files = sorted(glob.glob('./data/*.csv'))

In [None]:
# Show the sorted list of input CSVs that is passed to predictESG below.
files

['./data/DENTSPLY SIRONA INCInsta.csv',
 './data/INSULET CORPInsta.csv',
 './data/PENTAIR PLCInsta.csv',
 './data/PNC FINANCIAL SERVICES GROUPInsta.csv',
 './data/TEXAS INSTRUMENTS INCInsta.csv',
 './data/TRAVELERS COS INCInsta.csv',
 './data/TYSON FOODS INC-CL AInsta.csv',
 './data/UNION PACIFIC CORPInsta.csv',
 './data/UNITED RENTALS INCInsta.csv',
 './data/US BANCORPInsta.csv',
 './data/VISA INC-CLASS A SHARESInsta.csv',
 './data/WASTE MANAGEMENT INCInsta.csv',
 './data/WESTERN DIGITAL CORPInsta.csv',
 './data/WHIRLPOOL CORPInsta.csv']

In [None]:
# NOTE(review): this suppresses every Python warning from here on, not just the
# ones raised during prediction - narrow the filter if a specific warning is the target.
import warnings
warnings.filterwarnings("ignore")
# Run the full prediction pipeline on every CSV collected in `files`.
predictESG(files)

Set sentiment to 0 for posts with no comments

In [None]:
import pandas as pd
import numpy as np
import glob
from datetime import date, datetime
import os

In [None]:
# Collect the processed per-company CSVs in alphabetical order.
companies = glob.glob('./data/ProcessedComps/*.csv')
comps = sorted(companies)
# Preview the entries from position 130 onwards - the same slice the next cell iterates over.
comps[130:]

In [None]:
companies = glob.glob('./data/ProcessedComps/*.csv')
comps = sorted(companies)

# Position in the alphabetically sorted list at which processing starts.
# Companies before it are not touched by this cell; set to 0 to process all of them.
START_AT = 130

for c in comps[START_AT:]:

  # Company name = file name without its last nine characters. Taking the basename
  # avoids depending on the exact length of the './data/ProcessedComps/' prefix.
  name = os.path.basename(c)[:-9]
  print('--- starting for %s ---'%name)

  # Posts without comments carry the literal text 'None' in 'PostsComments'.
  # The converter hands the raw cell text through unchanged, so that marker is not
  # turned into NaN by read_csv's default NA handling (pandas >= 2.0 treats 'None'
  # as missing, which would make the comparison below match nothing).
  df = pd.read_csv(c, converters={'PostsComments': str})
  dfNone = df[df['PostsComments']=='None']
  noneIndex = dfNone.PostIndex.tolist()

  # Overwrite both sentiment scores with 0 for those posts and store the result
  # next to the original file.
  sent = pd.read_csv('./data/Results/%s/predCmmtsMean.csv'%name)
  sent.loc[sent['PostIndex'].isin(noneIndex),['sentiment','sentimentGen']] = 0
  sent.to_csv('./data/Results/%s/predCmmtsMean_neutralized.csv'%name)
  print('--- done for %s ---'%name)

--- starting for PENTAIR PLC ---
--- done for PENTAIR PLC ---
--- starting for PEPSICO INC ---
--- done for PEPSICO INC ---
--- starting for PFIZER INC ---
--- done for PFIZER INC ---
--- starting for PNC FINANCIAL SERVICES GROUP ---
--- done for PNC FINANCIAL SERVICES GROUP ---
--- starting for PPG INDUSTRIES INC ---
--- done for PPG INDUSTRIES INC ---
--- starting for PROCTER & GAMBLE CO ---
--- done for PROCTER & GAMBLE CO ---
--- starting for PULTEGROUP INC ---
--- done for PULTEGROUP INC ---
--- starting for QUALCOMM INC ---
--- done for QUALCOMM INC ---
--- starting for REGENERON PHARMACEUTICALS ---
--- done for REGENERON PHARMACEUTICALS ---
--- starting for REGIONS FINANCIAL CORP ---
--- done for REGIONS FINANCIAL CORP ---
--- starting for REPUBLIC SERVICES INC ---
--- done for REPUBLIC SERVICES INC ---
--- starting for ROBERT HALF INC ---
--- done for ROBERT HALF INC ---
--- starting for ROCKWELL AUTOMATION INC ---
--- done for ROCKWELL AUTOMATION INC ---
--- starting for ROSS 

In [None]:
def moveFile(files,names, types):
  """Rename each existing file to the result path of the company at the same position.

  files[i] is renamed to ./data/Results/<names[i]>/predDescript.csv when `types` is
  'Description', otherwise to ./data/Results/<names[i]>/predCmmtsMean.csv.
  Source paths that do not exist are skipped.
  """
  if types == 'Description':
    target = "./data/Results/%s/predDescript.csv"
  else:
    target = "./data/Results/%s/predCmmtsMean.csv"

  for idx, src in enumerate(files):
    company = names[idx]
    if not os.path.exists(src):
      continue
    os.rename(src, target%company)

names = []
for c in cmts:
  names.append(c[15:-27])