_Does size matter? The effect of Instagram influencer account size on post sentiment and resulting marketing outcomes_

_Master's thesis by Thomas A. Frost_

# Part 3: SiEBERT

This file is nearly identical to the GitHub template proposed by Siebert et al.: https://github.com/chrsiebert/sentiment-roberta-large-english/blob/main/sentiment_roberta_prediction_example.ipynb

**ATTENTION! As this file is a Python script, it is not executable on myBinder / GESIS Notebooks in the R environment. To execute the code, download the file and run it locally.**

## 01 - Import libraries

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

from cleantext import remove_emoji, normalize_whitespace
from unidecode import unidecode

## 02 - Create class for data preparation

In [None]:
class SimpleDataset:
    """Index-able wrapper around a mapping of tokenizer outputs.

    `tokenized_texts` is any mapping that has an "input_ids" entry and whose
    values can be indexed by position; item `idx` of the dataset is the
    mapping restricted to position `idx` of every value.
    """

    def __init__(self, tokenized_texts):
        """Keep a reference to the tokenized inputs (no copy is made)."""
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        """Number of samples, taken from the length of the "input_ids" entry."""
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return a dict holding element `idx` of every field."""
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample

## 03 - Import data from Instagram Scraper

In [None]:
# Columns of the scraper export that are not used for the sentiment prediction.
unused_columns = [
    'Date UTC',
    'View Count',
    'URL of Video',
    'URL of Picture-/ Video-Thumbnail',
    'Local Filename of Picture / Video Thumbnail',
    'Location of Post',
]

# Read the tab-separated export, discard the unused columns and every post
# without a caption, then cast the captions to str and turn the literal
# two-character sequence "\n" into a real line break.
posts = (
    pd.read_table("../data/Instagram__Posts_corrected_v4.tsv", sep = "\t")
    .drop(columns = unused_columns)
    .dropna(subset = ['Text'])
    .assign(
        Text = lambda frame: frame['Text']
        .astype("str")
        .map(lambda caption: caption.replace("\\n", "\n"))
    )
)

## 04 - Remove emojis / convert to ASCII

In [None]:
# Pass every caption through unidecode so that the text handed to the
# tokenizer has been transliterated to ASCII.
posts['Text'] = posts['Text'].map(unidecode)

## 05 - Load tokenizer and model, create trainer

In [None]:
# Captions to classify, as a plain Python list in DataFrame row order.
pred_texts = posts['Text'].tolist()

# Hugging Face identifier of the pre-trained checkpoint; the tokenizer and the
# classification model are both loaded from it.
model_name = "siebert/sentiment-roberta-large-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The Trainer is created with the model only and is used below for prediction.
trainer = Trainer(model=model)

## 06 - Tokenize texts and create prediction data set

In [None]:
# Tokenize all captions in a single call, with truncation and padding enabled.
tokenized_texts = tokenizer(
    pred_texts,
    padding=True,
    truncation=True,
)

# Wrap the tokenizer output so it can be indexed sample by sample.
pred_dataset = SimpleDataset(tokenized_texts)

## 07 - Run predictions

In [None]:
# Run the model over every sample of pred_dataset through the Trainer.
# The result is read in the next step via `predictions.predictions` and
# `predictions[0]` (presumably the raw per-class model outputs — confirm
# against the Trainer.predict documentation).
predictions = trainer.predict(pred_dataset)

## 08 - Transform predictions to labels

In [None]:
# Raw model outputs, one row per text and one column per class
# (presumably logits of shape (n_texts, n_labels) — confirm against Trainer.predict).
# The same array is used for both the class ids and the scores instead of
# mixing `predictions.predictions` and `predictions[0]`.
logits = predictions.predictions

# Predicted class id = column holding the largest value in each row.
preds = logits.argmax(-1)

# Translate the class ids into the label names stored in the model config.
labels = pd.Series(preds).map(model.config.id2label)

# Softmax probability of the predicted class. Subtracting the row-wise maximum
# before exponentiating does not change the softmax mathematically, but it
# keeps np.exp from overflowing to inf (which would yield nan scores) when the
# raw outputs are large.
shifted_logits = logits - logits.max(-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
scores = (exp_logits / exp_logits.sum(-1, keepdims=True)).max(1)

## 09 - Create DataFrame with texts, predictions, labels, and scores

In [None]:
# One (text, class id, label, score) tuple per classified caption.
result_rows = list(zip(pred_texts, preds, labels, scores))
result_columns = ['text', 'pred', 'label', 'score']

# Assemble the final result table from the row tuples.
df = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
# Persist the prediction table as CSV in the data directory. No `index`
# argument is passed, so the pandas default applies (presumably the row index
# is written as the first column — confirm before reading the file back).
df.to_csv('../data/predicted-full.csv')