### Load Data

In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import pandas as pd
import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
import os
from scipy.special import softmax


def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


# OpenSource (HuggingFace)

### Translation

In [24]:
pipe = pipeline("translation", model="facebook/nllb-200-distilled-600M", src_lang="swe_Latn", tgt_lang="eng_Latn")
#https://huggingface.co/facebook/nllb-200-distilled-600M


# Define a function to translate text
def translate_text(text):
    result = pipe(text)
    return result[0]['translation_text']

In [25]:
df = pd.read_csv('./translate.csv')

for i in tqdm.tqdm(range(len(df)), desc="Translating"):
    df.at[i, 'en_translated_text'] = translate_text(df.at[i, 'sw_text'])
df.to_csv('./translated.csv', index = False)

Translating:  20%|█▉        | 3805/19441 [4:29:09<18:35:46,  4.28s/it]Your input_length: 233 is bigger than 0.9 * max_length: 200. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Translating: 100%|██████████| 19441/19441 [22:58:57<00:00,  4.26s/it]   


### English Sentiment Scoring

In [None]:
df = pd.read_csv("./translated.csv")
# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


In [27]:
for i in tqdm.tqdm(range(len(df)), desc="Sentiment scoring"):
    text = preprocess(df.en_translated_text[i])
    df.at[i, 'model_text'] = text
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    df.at[i, 'negative_score_en'], df.at[i, 'neutral_score_en'], df.at[i, 'positive_score_en'] = softmax(scores)

df.to_csv("./sentiment_scored_en.csv", index = False)


Sentiment scoring: 100%|██████████| 19441/19441 [34:07<00:00,  9.49it/s]


### Swedish sentiment scoring

In [9]:
df =  pd.read_csv("./translated.csv")
sw = pipeline('sentiment-analysis', model='KBLab/robust-swedish-sentiment-multiclass')
#https://huggingface.co/KBLab/robust-swedish-sentiment-multiclass


In [10]:
for i in tqdm.tqdm(range(len(df)), desc="Sentiment scoring (swedish)"):
    text = preprocess(df.sw_text[i])
    output = sw(text)
    df.at[i, 'sentiment_sw'] = output[0]["label"]
    df.at[i, 'score_sw'] = output[0]["score"]

df.to_csv("./sentiment_scored_sw_KBL.csv", index = False)

Sentiment scoring (swedish): 100%|██████████| 19441/19441 [2:28:06<00:00,  2.19it/s]    


# OpenAI - GPt 4o

In [4]:
import openai
from api_keys import openai_key


client = openai.OpenAI(
    api_key= openai_key
,
)


def sentiment_analysis_sw(transcription):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": "Som en AI med expertis inom språk och känsloanalys är din uppgift att analysera sentimentet i följande text. Returnera endast sentimentklassen antingen POSITIVE, NEUTRAL eller NEGATIVE."
            },
            {
                "role": "user",
                "content": transcription
            }
        ]
    )
    return response

def sentiment_analysis_en(transcription):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": "As an AI with expertise in language and emotion analysis, your task is to analyze the sentiment of the following text. Return only the class of the sentiment either POSITIVE, NEUTRAL or NEGATIVE."
            },
            {
                "role": "user",
                "content": transcription
            }
        ]
    )
    return response


In [5]:
df = pd.read_csv("./translated.csv")
for i in tqdm.tqdm(range(len(df)- 19439), desc=f"Sentiment scoring (gpt Swedish & English)"):
    text = preprocess(df.sw_text[i])
    output = sentiment_analysis_sw(text)
    df.at[i, 'sentiment_openai_sw'] = output.choices[0].message.content
    text = preprocess(df.en_translated_text[i])
    output = sentiment_analysis_en(text)
    df.at[i, 'sentiment_openai_en'] = output.choices[0].message.content


df.to_csv("./sentiment_scored_gpt.csv", index = False)

Sentiment scoring (gpt Swedish & English): 100%|██████████| 2/2 [00:03<00:00,  1.59s/it]
