In [2]:
import pandas as pd
import re

# Read the sentence table; the first CSV column is used as the row index
df = pd.read_csv('extracted_sentences.csv', index_col=0, low_memory=False)


def _drop_dot_suffix(label):
    """Return the part of a string label before its first '.'; pass other values through."""
    if not isinstance(label, str):
        return label
    return label.split('.')[0]


# Keep only the portion of each index label that precedes the first '.'
df.index = df.index.map(_drop_dot_suffix)

# Strip '[text] ( URL )' link markup from the data columns, keeping the link text
def remove_url(text):
    """Remove '[...] ( URL )' link markup from a string, keeping the bracketed text.

    Only a bracket pair that is directly followed by a '( URL )' marker
    (optional whitespace allowed) is rewritten. Brackets that are not part of
    such a link are left untouched, so ordinary text like '[1]' stays balanced.

    Parameters
    ----------
    text : Any
        Cell value to clean. Non-string values (e.g. NaN) are returned as-is.

    Returns
    -------
    Any
        The cleaned string, or the original value when it is not a string.
    """
    if isinstance(text, str):
        # Match the whole link construct at once; an alternation such as
        # r'\[|\]...' would delete every '[' in the text, not just link brackets.
        return re.sub(r'\[([^\[\]]*)\]\s*\(\s*URL\s*\)', r'\1', text)
    return text

# Run the link-markup cleanup over every data column (the index is not in df.columns)
for column in df.columns:
    cleaned_column = df[column].apply(remove_url)
    df[column] = cleaned_column

# Write the cleaned table to a new CSV file
cleaned_csv_path = 'cleaned_extracted_sentences.csv'
df.to_csv(cleaned_csv_path)

print("CSV processing complete. The cleaned file is saved as 'cleaned_extracted_sentences.csv'.")

CSV processing complete. The cleaned file is saved as 'cleaned_extracted_sentences.csv'.


In [3]:
# Display the row index to inspect the labels after the '.'-suffix removal
df.index

Index(['Royal_Marines', 'Recycling', 'Retreat_of_glaciers_since_1850',
       'Remember_Girls_Like_Rajpura', 'Rutherfordium', 'Rule_of_St_Benedict',
       'Red_dwarf', 'Romeo_and_Juliet', 'Rudyard_Kipling', 'Race',
       ...
       'Witold_Pilecki', 'What_Next_For_Tsunami_Orphans', 'Walt_Disney',
       'Winter2005_Brick', 'Winter2005_Stop_Press', 'Wikispecies',
       'William_Butler_Yeats', 'Western_painting', 'William_Ewart_Gladstone',
       'World_War_II'],
      dtype='object', length=5462)

In [4]:
import torch

# Choose the device index handed to pipeline(device=...): 0 when CUDA is
# available, -1 (CPU) otherwise
cuda_ready = torch.cuda.is_available()
print("GPU is available, using CUDA..." if cuda_ready else "GPU not available, using CPU...")
device = 0 if cuda_ready else -1

GPU is available, using CUDA...


In [5]:
from transformers import AutoTokenizer, pipeline
def truncate_text_with_tokenizer(text, tokenizer, max_length=512):
    """Shorten `text` via the tokenizer and return the result as a string.

    The text is encoded with truncation enabled (limit `max_length`), and the
    resulting token ids are decoded again with special tokens skipped.
    """
    token_ids = tokenizer.encode(text, max_length=max_length, truncation=True)
    return tokenizer.decode(token_ids, skip_special_tokens=True)

In [6]:
# Fetch the tokenizer published with the emotion model
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")


def _truncate_cell(cell):
    """Truncate string cells with a 512-token limit; return any other value unchanged."""
    if not isinstance(cell, str):
        return cell
    return truncate_text_with_tokenizer(cell, tokenizer, max_length=512)


# Shorten every text cell so its content stays within the 512-token limit
for column in df.columns:
    df[column] = df[column].apply(_truncate_cell)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,721,722,723,724,725,726,727,728,729,730
Royal_Marines,Royal Marines Royal Marines 2007 Schools Wiki...,Related subject : Military History and War N...,A core component of the country 's Rapid Deplo...,"The high level of training and competence , co...",The Royal Marines are a component part of the ...,Role Corps of Royal Marines Cap Badge of the R...,As the United Kingdom Armed Forces ' specialis...,In common with the other armed force the Royal...,"Command , Control and Organisation Command of ...",The operational capability of the Corps compri...,...,,,,,,,,,,
Recycling,Recycling Recycling 2007 Schools Wikipedia Se...,Related subject : Environment The intern...,Environmental science Environmental technolo...,Recycling prevents useful material resource be...,Recycling is a key concept of modern waste ma...,"Recyclable material , also called `` recyclabl...","They include glass , paper , aluminium , a...","Biodegradable waste , such a food waste or gar...",Recyclates need to be sorted and separated int...,Contamination of the recylates with other mate...,...,,,,,,,,,,
Retreat_of_glaciers_since_1850,Retreat of glacier since 1850 Retreat of glaci...,Related subject : Climate and the Weather ...,"Studied by glaciologists , the temporal coinci...",Mid-latitude mountain range such a the Himala...,The Little Ice Age wa a period from about 1550...,"Subsequently , until about 1940 glacier around...","Glacial retreat slowed and even reversed , in ...","However , since 1980 a significant global warm...",In location such a the Andes of South America...,"The retreat of mountain glacier , notably in w...",...,,,,,,,,,,
Remember_Girls_Like_Rajpura,How Remembering a Charity help SOS Chidren 's...,She share with u her success story .,My Success Story I came to SOS Children 's Vil...,At that point I could have never thought that ...,My SOS mother wa very supportive and always en...,"After getting grade ' A ' in class eight , my ...",At school I played badminton and also particip...,"Looking at my performance in badminton , my sp...","After class twelve , I joined Guru Nanak Khals...",Hostel life wa a different experience all toge...,...,,,,,,,,,,
Rutherfordium,Rutherfordium Rutherfordium 2007 Schools Wiki...,Related subject : Chemical element 104 lawr...,This is a highly radioactive synthetic element...,This element therefore ha no application and l...,Rutherfordium is the first transactinide eleme...,History Rutherfordium ( named in honour of not...,Researchers there bombarded 242 Pu with acce...,In 1969 researcher at the University of Califo...,The UC group also stated that they could not r...,This resulted in an element naming controversy...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wikispecies,Wikispecies Wikispecies 2007 Schools Wikipedi...,Related subject : Websites and the Internet ...,No Type of site Species directory Registration...,"It is an open , free directory of specie .",Typical specie page A typical specie page woul...,Vernacular name are link to Wikipedia articl...,Community Portal The Wikispecies Community Por...,The Wikispecies Village Pump is an area of Wik...,It is like a Wikipedia article 's talk page .,Currently it is the largest page on Wikispecies .,...,,,,,,,,,,
William_Butler_Yeats,William Butler Yeats William Butler Yeats 200...,Related subject : Writers and critic W.B .,Yeats in Dublin on 24 January 1908 .,William Butler Yeats ( IPA : /jeɪts/ ) ( 13 Ju...,"Yeats , though born to an Anglo-Saxon Protesta...",Yeats also served a an Irish Senator .,He wa awarded the Nobel Prize in Literature in...,"Early life and work When Yeats wa young , his ...","At first , the Yeats child were educated at ho...","Their mother , who wa homesick for Sligo , ent...",...,,,,,,,,,,
Western_painting,Western painting Western painting 2007 School...,"Related subject : Art Jan Vermeer , Girl...",Until the early 20th century it relied primari...,Developments in Western painting historically ...,"African art , Islamic art , Indian art , Chine...","Initially serving religious patronage , West...",From the Middle Ages through the Renaissanc...,Beginning with the Baroque era artist receiv...,By the 19th century painter became liberated f...,The idea `` art for art 's sake '' began to fi...,...,,,,,,,,,,
William_Ewart_Gladstone,William Ewart Gladstone William Ewart Gladston...,Related subject : Historical figure ; Polit...,"He wa a notable political reformer , known for...",Gladstone wa famously at odds with Queen Vict...,She once complained `` He always address me a ...,"Early life Born in Liverpool , England at ...",The final `` s '' wa later dropped from the fa...,Although Gladstone wa born and brought up in L...,"Gladstone wa educated at Eton College , and in...",In December 1831 after sitting for his final e...,...,,,,,,,,,,


In [7]:
# Build the emotion classifier on the device selected earlier; top_k=None is
# passed so that scores are not limited to the single best label
emotion_analyzer = pipeline(
    "text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
    device=device,
)


def _classify_cell(cell):
    """Classify one text cell; return every non-string value unchanged.

    The string check mirrors the guard used by the earlier cleaning and
    truncation steps. It keeps NaN padding and any non-text value away from
    the model, and makes re-running this cell on already-classified data a
    no-op instead of an error.
    """
    if not isinstance(cell, str):
        return cell
    # Truncate at inference time as well: text that was truncated and decoded
    # earlier is not guaranteed to re-tokenize to the same number of tokens.
    return emotion_analyzer(cell, truncation=True)


# NOTE(review): the pipeline is still invoked once per cell, which is what
# triggers the "using the pipelines sequentially on GPU" warning. Passing lists
# of texts would be faster -- confirm the nesting of the returned results before
# switching, since the stored output format must stay the same.
for column in df.columns:
    df[column] = df[column].apply(_classify_cell)

# Save the classifier output for every sentence
df.to_csv('processed_extracted_sentences.csv')

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
