In [1]:
from transformers import pipeline
import torch
import pandas as pd
import numpy as np
import polars as pl

```bibtex
@inproceedings{camacho-collados-etal-2022-tweetnlp,
    title = "{T}weet{NLP}: Cutting-Edge Natural Language Processing for Social Media",
    author = "Camacho-collados, Jose  and
      Rezaee, Kiamehr  and
      Riahi, Talayeh  and
      Ushio, Asahi  and
      Loureiro, Daniel  and
      Antypas, Dimosthenis  and
      Boisson, Joanne  and
      Espinosa Anke, Luis  and
      Liu, Fangyu  and
      Mart{\'\i}nez C{\'a}mara, Eugenio  and others",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-demos.5",
    pages = "38--49"
}
```

- F1 (micro): 0.6931034482758621
- F1 (macro): 0.692628774202147
- Accuracy: 0.6931034482758621

In [2]:
# Choose the accelerator for inference: Apple-silicon MPS first, then CUDA,
# then plain CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    # The second GPU stays the preferred CUDA device, but "cuda:1" only exists
    # on hosts with at least two GPUs; fall back to the first one otherwise so
    # the pipeline does not fail with an invalid device ordinal.
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"


# The same checkpoint name is used for both the model weights and the tokenizer.
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
sentiment_task = pipeline(
    "sentiment-analysis",
    model=model_path,
    tokenizer=model_path,
    device=device,
    batch_size=128,
)

print(device)

mps


In [3]:
# Smoke-test the pipeline on a sentence followed by keyboard-mash noise.
probe_text = "The worlds greatest event in footbal is today aoisdjfioqjawepofjqoiwjepi jqoipwjfpoi jqwoeifj opiqjefoip jqweoifj oiwejfo iqwjeo ijqwoiepjf oiwejf ioqjwefoi jw"
sentiment_task(probe_text)

[{'label': 'positive', 'score': 0.44229432940483093}]

In [5]:
# Read the sports-category metadata (description column already removed)
# into a Polars DataFrame.
filtered_df_sport_category = pl.read_parquet(source='filtered_sport_category_without_description_column_metadata.parquet')

In [6]:
BATCH_SIZE = 1024  # Rows sliced from the frame per outer-loop iteration
MAX_LENGTH = 512  # Maximum length for the model


def _label_texts(texts, classifier, max_length):
    """Return one sentiment label per entry of ``texts``; null entries map to ``None``."""
    labels = [None] * len(texts)

    # Null cells arrive as None from ``to_list()``; they cannot be sliced or
    # tokenised, so they are skipped and keep a null label.
    present = [pos for pos, text in enumerate(texts) if text is not None]
    if not present:
        return labels

    # The character cut only bounds the amount of text handed to the tokenizer.
    # The token-level limit is enforced by ``truncation``/``max_length``, because
    # ``max_length`` characters plus special tokens can still exceed the limit.
    clipped = [texts[pos][:max_length] for pos in present]
    results = classifier(clipped, truncation=True, max_length=max_length)

    for pos, result in zip(present, results):
        labels[pos] = result["label"]
    return labels


def analyze_sentiment_in_batches(df, column_names, batch_size=BATCH_SIZE,
                                 max_length=MAX_LENGTH, classifier=None):
    """Add a ``<column>_sentiment`` label column for each requested text column.

    The frame is walked in slices of ``batch_size`` rows. For every slice and
    every requested column the texts are classified and the predicted labels
    collected; progress is printed after each slice.

    Args:
        df: Polars DataFrame holding the text columns.
        column_names: Name, or iterable of names, of the columns to classify.
        batch_size: Number of rows per slice; must be positive.
        max_length: Upper bound applied both as a character cut and as the
            tokenizer's ``max_length``.
        classifier: Callable taking a list of strings plus tokenizer keyword
            arguments and returning one ``{"label": ...}`` dict per string.
            Defaults to the notebook-level ``sentiment_task`` pipeline.

    Returns:
        A new DataFrame with one extra string column per entry of
        ``column_names``. Rows whose text is null get a null label.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # A bare string would otherwise be iterated character by character.
    if isinstance(column_names, str):
        column_names = [column_names]
    else:
        column_names = list(column_names)

    if classifier is None:
        classifier = sentiment_task

    total_rows = len(df)
    sentiments = {col: [] for col in column_names}

    for start in range(0, total_rows, batch_size):
        stop = min(start + batch_size, total_rows)
        batch = df[start:stop]

        for col in column_names:
            texts = batch[col].to_list()
            sentiments[col].extend(_label_texts(texts, classifier, max_length))

        print(f"Processed {stop}/{total_rows} rows...")

    for col in column_names:
        # Pin the dtype so an empty or all-null result is still a string column.
        df = df.with_columns(
            pl.Series(f"{col}_sentiment", sentiments[col], dtype=pl.Utf8)
        )

    return df


# Classify every video title; the returned frame carries an extra
# `title_sentiment` column and replaces the loaded frame under the same name.
filtered_df_sport_category = analyze_sentiment_in_batches(
    df=filtered_df_sport_category,
    column_names=["title"],
)

# Plain-text preview of the first rows of the augmented frame.
print(filtered_df_sport_category.head())

Processed 1024/4354412 rows...
Processed 2048/4354412 rows...
Processed 3072/4354412 rows...
Processed 4096/4354412 rows...
Processed 5120/4354412 rows...
Processed 6144/4354412 rows...
Processed 7168/4354412 rows...
Processed 8192/4354412 rows...
Processed 9216/4354412 rows...
Processed 10240/4354412 rows...
Processed 11264/4354412 rows...
Processed 12288/4354412 rows...
Processed 13312/4354412 rows...
Processed 14336/4354412 rows...
Processed 15360/4354412 rows...
Processed 16384/4354412 rows...
Processed 17408/4354412 rows...
Processed 18432/4354412 rows...
Processed 19456/4354412 rows...
Processed 20480/4354412 rows...
Processed 21504/4354412 rows...
Processed 22528/4354412 rows...
Processed 23552/4354412 rows...
Processed 24576/4354412 rows...
Processed 25600/4354412 rows...
Processed 26624/4354412 rows...
Processed 27648/4354412 rows...
Processed 28672/4354412 rows...
Processed 29696/4354412 rows...
Processed 30720/4354412 rows...
Processed 31744/4354412 rows...
Processed 32768/4

In [10]:
# Show the frame as the cell's rich output to inspect the added
# `title_sentiment` column next to the original metadata columns.
filtered_df_sport_category

categories,channel_id,crawl_date,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count,title_sentiment
str,str,str,f64,str,i64,f64,str,str,str,f64,str
"""Sports""","""UCzWn_gTaXyH5Idyo8Raf7_A""","""2019-11-03 16:39:57.427254""",35.0,"""JOeSxtcNdHQ""",8620,1673.0,"""catfishing,fishing,fishing cha…","""Catching 100 lbs of Catfish 🔴L…","""2019-10-01 00:00:00""",48737.0,"""neutral"""
"""Sports""","""UCzWn_gTaXyH5Idyo8Raf7_A""","""2019-11-03 16:39:58.108323""",15.0,"""EPMLTw2zINw""",355,1297.0,"""""","""big cat""","""2019-10-01 00:00:00""",19999.0,"""positive"""
"""Sports""","""UCzWn_gTaXyH5Idyo8Raf7_A""","""2019-11-03 16:39:58.773085""",78.0,"""Y1_pK68iSYQ""",603,3305.0,"""Catfishing,how to catch catfis…","""Classy Catfishing - How to Cat…","""2019-09-28 00:00:00""",58518.0,"""neutral"""
"""Sports""","""UCzWn_gTaXyH5Idyo8Raf7_A""","""2019-11-03 16:39:59.465346""",70.0,"""jF8TSo3ZfTc""",1426,1889.0,"""Fishing,Fishing uk,Angling,Sea…","""2 Day Saltwater Fishing Catch …","""2019-09-21 00:00:00""",71998.0,"""neutral"""
"""Sports""","""UCzWn_gTaXyH5Idyo8Raf7_A""","""2019-11-03 16:40:00.188768""",73.0,"""Gp00dNaVouo""",990,2699.0,"""Fishing,catfish,wels catfish,h…","""How to Catch Wels Catfish - Fi…","""2019-09-14 00:00:00""",101924.0,"""neutral"""
…,…,…,…,…,…,…,…,…,…,…,…
"""Sports""","""UCrwEMKhsjY8P9-GuIKMYVrQ""","""2019-11-17 22:39:14.232693""",7.0,"""Q9H_fk6uHDk""",1121,89.0,"""hypnosis,progressive hypnosis,…","""Play Better Golf Part 4 ★ Putt…","""2017-02-14 00:00:00""",20430.0,"""neutral"""
"""Sports""","""UCrwEMKhsjY8P9-GuIKMYVrQ""","""2019-11-17 22:39:14.843290""",13.0,"""3lwXzOboOzk""",1341,91.0,"""hypnosis,progressive hypnosis,…","""Play Better Golf Part 3 ★ Owni…","""2017-02-14 00:00:00""",25817.0,"""neutral"""
"""Sports""","""UCrwEMKhsjY8P9-GuIKMYVrQ""","""2019-11-17 22:39:15.484430""",19.0,"""242JzJuuG78""",1098,115.0,"""hypnosis,progressive hypnosis,…","""Play Better Golf Part 2 ★ Fair…","""2017-02-14 00:00:00""",29909.0,"""neutral"""
"""Sports""","""UCrwEMKhsjY8P9-GuIKMYVrQ""","""2019-11-17 22:39:16.111873""",28.0,"""CpMWSgoRwNI""",1245,372.0,"""hypnosis,progressive hypnosis,…","""Play Better Golf Part 1 ★ Gett…","""2017-02-14 00:00:00""",61980.0,"""neutral"""


In [8]:
# Persist the sentiment-augmented frame under a new file name so the input
# parquet file is left untouched.
filtered_df_sport_category.write_parquet(
    file="filtered_sport_category_with_sentiment_and_without_description_column_metadata.parquet"
)