In [1]:
!pip install transformers



In [2]:
!pip install sentencepiece



In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fi-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fi-en")

Downloading (…)olve/main/source.spm:   0%|          | 0.00/832k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [7]:
# Input sentence in Finnish
input_text = "Ainoastaan Alexander Stubb tai hänen avustajansa eivät vastanneet yhteydenottoihin."

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt")

# Generate translation
translation_ids = model.generate(inputs["input_ids"], max_length=50, num_return_sequences=1, early_stopping=True)

# Decode the generated translation
translation = tokenizer.decode(translation_ids[0], skip_special_tokens=True)

# Print the translation
print("Finnish: ", input_text)
print("English: ", translation)

Finnish:  Ainoastaan Alexander Stubb tai hänen avustajansa eivät vastanneet yhteydenottoihin.
English:  Only Alexander Stubb or his assistants did not respond to the calls.


In [8]:
!pip install numpy scipy



In [9]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [10]:
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)
text = preprocess(translation)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) negative 0.7241
2) neutral 0.2654
3) positive 0.0105


In [13]:

def translate_and_get_sentiment(input_text):
    # Translation from Finnish to English
    translation_model = "Helsinki-NLP/opus-mt-fi-en"
    translation_tokenizer = AutoTokenizer.from_pretrained(translation_model)
    translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model)

    # Tokenize the input text
    inputs = translation_tokenizer(input_text, return_tensors="pt")

    # Generate translation
    translation_ids = translation_model.generate(inputs["input_ids"], max_length=50, num_return_sequences=1, early_stopping=True)

    # Decode the generated translation
    translation = translation_tokenizer.decode(translation_ids[0], skip_special_tokens=True)

    # Sentiment Analysis
    sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model)
    sentiment_config = AutoConfig.from_pretrained(sentiment_model)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model)

    translated_text = preprocess(translation)
    encoded_input = sentiment_tokenizer(translated_text, return_tensors='pt')
    output = sentiment_model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    # Determine sentiment based on scores
    positive_score = scores[sentiment_config.label2id['positive']]
    negative_score = scores[sentiment_config.label2id['negative']]

    if positive_score > negative_score:
        sentiment = "positive"
    else:
        sentiment = "negative"

    return sentiment

def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [16]:
# Example usage:
input_text_finnish = "Hjallis Harkimo kertoi 8 minuuttia -ohjelmassa omasta kampanjastaan."
sentiment = translate_and_get_sentiment(input_text_finnish)
print("Sentiment:", sentiment)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Sentiment: positive


In [18]:
!pip install nltk



In [21]:
!pip install nltk.tokenize

[31mERROR: Could not find a version that satisfies the requirement nltk.tokenize (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for nltk.tokenize[0m[31m
[0m

In [22]:
import nltk
from nltk.tokenize import sent_tokenize
import re
nltk.download('punkt')

with open("article.txt", "r", encoding='utf-8') as file:
    content = file.read()

dates_str, textes_str = content.split('\n')
dates = dates_str.split('#')
textes = textes_str.split('#')

dates_textes = list(zip(dates, textes))

dataset = []

for date, text in dates_textes:
    sentences = sent_tokenize(text)
    dataset.append((date, sentences))


name_data = {
    "Pekka Haavisto": [],
    "Alexander Stubb": [],
    "Olli Rehn": [],
    "Mika Aaltola": [],
    "Jussi Halla-aho": [],
    "Li Andersson": [],
    "Jutta Urpilainen": [],
    "Sari Essayah": [],
    "Harry Harkimo": [],
}

for d, s in dataset:
    for sentence in s:
        for name, dates_sentences in name_data.items():
            pattern = re.compile(r'\b' + re.escape(name) + r'\b|\b' + re.escape(name.split()[0]) + r'\b|\b' + re.escape(name.split()[1]) + r'\b', re.IGNORECASE)
            if pattern.search(sentence):
                data_list = []
                sentence_list = []
                dates_sentences_draft2 =[]
                dates_sentences_draft3 =[]
                data_list.append(d)
                sentence_list.append(sentence)
                dates_sentences_draft = [list(data_list), list(sentence_list)]
                dates_sentences.append(dates_sentences_draft)


def extract_month_and_sentence_count(data):
    result = {}
    for item in data:
        date = item[0][0]
        year, month, _ = date.split('-')
        year = int(year)
        month = int(month)
        sentences = item[1][0]
        sentence_count = len(re.split(r'[.!?]', sentences))

        if (year, month) in result:
            result[(year, month)] += sentence_count
        else:
            result[(year, month)] = sentence_count
    sorted_result = {k: v for k, v in sorted(result.items(), key=lambda item: item[0])}

    return sorted_result


#input_data = name_data["Alexander Stubb"]

#output = extract_month_and_sentence_count(input_data)


#print(output)

print(name_data["Alexander Stubb"])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[[['2023-09-14'], ['Haavisto ja Stubb mittauksen kärjessäYlen presidenttikyselyssä selvästi eniten kannatusta saa valitsijayhdistyksen ehdokkaaksi lupautunut Pekka Haavisto (vihr.).']], [['2023-09-14'], ['Häntä tukee 31 prosenttia kyselyyn vastanneista.Toiseksi nousee kokoomuksen ehdokkaaksi lupautunut Alexander Stubb, jota äänestäisi 19 prosenttia.Heidän perässään tulee tasainen kolmikko Olli Rehn (kesk.']], [['2023-09-14'], [').Haavisto ja Stubb innostavat korkeasti koulutettujaWestinen arvioi, että mitä korkeampi kyselyyn vastaajien koulutus on, sitä kovemmaksi nousevat nimenomaan Haaviston ja Stubbin kannatukset.Westisen mukaan molemmilla on arvoliberaalit arvot ja he innostavat korkeasti koulutettuja sekä hyvin toimeentulevia kaupunkilaisia äänestämään heitä.']], [['2023-09-14'], ['Pekka Haavisto vie kyselyssä ensimmäisellä kierroksella keskustavasemmiston äänet.– Sen sijaan keskustaoikeiston äänet hajautuvat ensimmäisellä kierroksella, mutta toisella kierroksella keskustaoikeisto

In [27]:
name_list = [
    "Pekka Haavisto",
    "Alexander Stubb",
    "Olli Rehn",
    "Mika Aaltola",
    "Jussi Halla-aho",
    "Li Andersson",
    "Jutta Urpilainen",
    "Sari Essayah",
    "Harry Harkimo",
]

# Data for each name (replace with your actual data)
sentiment_counts = {}

for name in name_list:
    sentiment_counts[name] = {"positive": 0, "negative": 0}
    for data_point in name_data.get(name, []):
        if len(data_point) == 2:
            text = data_point[1][0]
            print(text)
            sentiment = translate_and_get_sentiment(text)
            if sentiment == "positive":
                print(sentiment)
                sentiment_counts[name]["positive"] += 1
            elif sentiment == "negative":
                print(sentiment)
                sentiment_counts[name]["negative"] += 1

# Print the dictionary with sentiment counts for each name
for name, counts in sentiment_counts.items():
    print(name)
    print("Positive:", counts["positive"])
    print("Negative:", counts["negative"])
    print("---")


Valtuuskunnan kunniapuheenjohtajana toimii entinen puolustusministeri ja RKP:n presidenttiehdokas Elisabeth Rehn.Kannatuskyselyjä johtava Pekka Haavisto (vihr.)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


positive
Molemmilla kerroilla hän sai toiseksi eniten ääniä ja hävisi Sauli Niinistölle.Pekka Haavisto on vihreiden kansanedustaja.


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


negative
Ulkoministeri hän oli vuosina 2019–2023.Haavisto on työskennellyt myös useissa, muun muassa YK:n kansainvälisissä tehtävissä.Esitimme Pekka Haavistolle viisi kysymystä.Mikä on Suomen tulevan presidentin tärkein haaste, Pekka Haavisto?– Tärkein haaste on Euroopan rauha.


KeyboardInterrupt: ignored

In [25]:
additionals = sentiment_counts

In [26]:
print(additionals)

{'Pekka Haavisto': {'positive': 136, 'negative': 28}, 'Alexander Stubb': {'positive': 80, 'negative': 25}, 'Olli Rehn': {'positive': 106, 'negative': 23}, 'Mika Aaltola': {'positive': 148, 'negative': 37}, 'Jussi Halla-aho': {'positive': 69, 'negative': 26}, 'Li Andersson': {'positive': 26, 'negative': 6}, 'Jutta Urpilainen': {'positive': 40, 'negative': 16}, 'Sari Essayah': {'positive': 15, 'negative': 1}, 'Harry Harkimo': {'positive': 22, 'negative': 5}}
