In [36]:
import os
import pandas as pd

# Load every .txt article found directly under data_dir into a DataFrame,
# one row per file with its file name and full text.
data_dir = 'ada/dataset'
articles = []
# os.listdir returns entries in arbitrary order; sorting keeps the row indices
# (and any positional sample taken from df later) identical across runs and machines.
for file_name in sorted(os.listdir(data_dir)):
    if file_name.endswith('.txt'):
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as file:
            content = file.read()
            articles.append({'file_name': file_name, 'content': content})

# Pin the columns so an empty directory still yields a frame with the expected
# 'file_name' / 'content' columns instead of a column-less DataFrame.
df = pd.DataFrame(articles, columns=['file_name', 'content'])
print(df.head())

                              file_name  \
0  %C3%81ed%C3%A1n_mac_Gabr%C3%A1in.txt   
1                        %C3%85land.txt   
2                %C3%89douard_Manet.txt   
3                         %C3%89ire.txt   
4        %C3%93engus_I_of_the_Picts.txt   

                                             content  
0     #copyright\n\nÁedán mac Gabráin\n\n2007 Sch...  
1     #copyright\n\nÅland\n\n2007 Schools Wikiped...  
2     #copyright\n\nÉdouard Manet\n\n2007 Schools...  
3     #copyright\n\nÉire\n\n2007 Schools Wikipedi...  
4     #copyright\n\nÓengus I of the Picts\n\n2007...  


In [46]:
import torch

# Pick the device index used when building the pipeline:
# 0 when CUDA is available, -1 for CPU.
device = 0 if torch.cuda.is_available() else -1
print("GPU is available, using CUDA..." if device == 0 else "GPU not available, using CPU...")

GPU is available, using CUDA...


In [60]:
from transformers import AutoTokenizer, pipeline
def truncate_text_with_tokenizer(text, tokenizer, max_length=512):
    """Shorten `text` by round-tripping it through `tokenizer`.

    The text is encoded with truncation enabled and `max_length` as the cap,
    then the resulting token ids are decoded back into a string with special
    tokens skipped.

    Args:
        text: The string to shorten.
        tokenizer: Object exposing `encode(text, truncation=..., max_length=...)`
            and `decode(ids, skip_special_tokens=...)`.
        max_length: Cap handed to `tokenizer.encode` (default 512).

    Returns:
        The decoded string produced from the truncated token ids.
    """
    token_ids = tokenizer.encode(text, max_length=max_length, truncation=True)
    return tokenizer.decode(token_ids, skip_special_tokens=True)

In [62]:
# Tokenizer for the "SamLowe/roberta-base-go_emotions" checkpoint
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")

# Shorten each article with the tokenizer (max_length=512) so its token count
# stays within the maximum, and store the result in a new column
df['truncated_content'] = df['content'].map(
    lambda article_text: truncate_text_with_tokenizer(article_text, tokenizer, max_length=512)
)

df

Unnamed: 0,file_name,content,truncated_content
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in.txt,#copyright\n\nÁedán mac Gabráin\n\n2007 Sch...,#copyright\n\nÁedán mac Gabráin\n\n2007 Sch...
1,%C3%85land.txt,#copyright\n\nÅland\n\n2007 Schools Wikiped...,#copyright\n\nÅland\n\n2007 Schools Wikiped...
2,%C3%89douard_Manet.txt,#copyright\n\nÉdouard Manet\n\n2007 Schools...,#copyright\n\nÉdouard Manet\n\n2007 Schools...
3,%C3%89ire.txt,#copyright\n\nÉire\n\n2007 Schools Wikipedi...,#copyright\n\nÉire\n\n2007 Schools Wikipedi...
4,%C3%93engus_I_of_the_Picts.txt,#copyright\n\nÓengus I of the Picts\n\n2007...,#copyright\n\nÓengus I of the Picts\n\n2007...
...,...,...,...
4599,Zirconium.txt,#copyright\n\nZirconium\n\n2007 Schools Wik...,#copyright\n\nZirconium\n\n2007 Schools Wik...
4600,Zoroaster.txt,#copyright\n\nZoroaster\n\n2007 Schools Wik...,#copyright\n\nZoroaster\n\n2007 Schools Wik...
4601,Zuid-Gelders.txt,#copyright\n\nZuid-Gelders\n\n2007 Schools ...,#copyright\n\nZuid-Gelders\n\n2007 Schools ...
4602,Zulu.txt,#copyright\n\nZulu\n\n2007 Schools Wikipedi...,#copyright\n\nZulu\n\n2007 Schools Wikipedi...


In [64]:
# Load the emotion classifier; with top_k=None the pipeline returns a score
# for every label rather than only the best one.
emotion_analyzer = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None, device=device)

# Apply the model to a sample of articles
sample_texts = df['truncated_content'][:5].tolist()  # Taking first 5 articles as sample

# The pipeline re-tokenizes the text, and text that was decoded from truncated
# ids is not guaranteed to re-encode to the same number of tokens (decode may
# clean up spacing or emit replacement characters for a split multi-byte
# character). Truncating again at inference time guarantees the input never
# exceeds the 512-token cap.
goemotions_results = [
    emotion_analyzer(text, truncation=True, max_length=512) for text in sample_texts
]

# Print sample results
for idx, result in enumerate(goemotions_results):
    print(f"Article {idx+1}: {result}")

Article 1: [[{'label': 'neutral', 'score': 0.9329707622528076}, {'label': 'approval', 'score': 0.039439521729946136}, {'label': 'realization', 'score': 0.02662878856062889}, {'label': 'disappointment', 'score': 0.005620713345706463}, {'label': 'annoyance', 'score': 0.00509575754404068}, {'label': 'disapproval', 'score': 0.0043046437203884125}, {'label': 'confusion', 'score': 0.004264521412551403}, {'label': 'admiration', 'score': 0.004086356144398451}, {'label': 'sadness', 'score': 0.0038565266877412796}, {'label': 'optimism', 'score': 0.0033053208608180285}, {'label': 'disgust', 'score': 0.0016887536039575934}, {'label': 'amusement', 'score': 0.0014552776701748371}, {'label': 'fear', 'score': 0.0012672347947955132}, {'label': 'joy', 'score': 0.0011649888474494219}, {'label': 'curiosity', 'score': 0.0011400567600503564}, {'label': 'desire', 'score': 0.000999607378616929}, {'label': 'love', 'score': 0.0009969371603801847}, {'label': 'gratitude', 'score': 0.0009945124620571733}, {'label'