# This was originally run in Colab Enterprise

In [1]:
import pandas as pd
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import string

stop_words = set(stopwords.words('german'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Extract the most important words from title, h1, abstract, url with NLP (e.g. TF-IDF )

In [2]:
df_nlp = pd.read_csv('/data_nlp_A.csv')

In [3]:
df_nlp.columns

Index(['page_id', 'n_days', 'date_min', 'n_urls', 'date_max', 'age', 'url',
       'no_versions', 'last_publish_date', 'word_count',
       'classification_product', 'classification_type', 'page_name', 'title',
       'author_list', 'external_clicks', 'external_impressions',
       'likes_n_days', 'dislikes_n_days', 'video_play', 'page_impressions',
       'clickouts', 'ctr', 'h1', 'scraped_author', 'date_scraped', 'abstract',
       'scraped_word_count', 'meta_title', 'meta_description',
       'meta_image_url', 'media_type', 'page_img_size', 'merged_url',
       'meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len',
       'merged_url_len', 'sentiment_abstract', 'confidence_abstract',
       'sentiment_meta_title', 'confidence_meta_title'],
      dtype='object')

In [4]:
# df_nlp = df[['page_id', 'external_clicks', 'external_impressions', 'ctr', 'h1',
#        'abstract', 'meta_title', 'meta_description', 'merged_url']]

In [5]:
df_nlp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6815 entries, 0 to 6814
Data columns (total 43 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   page_id                 6815 non-null   int64  
 1   n_days                  6815 non-null   int64  
 2   date_min                6815 non-null   object 
 3   n_urls                  6815 non-null   int64  
 4   date_max                6815 non-null   object 
 5   age                     6815 non-null   int64  
 6   url                     6815 non-null   object 
 7   no_versions             6815 non-null   int64  
 8   last_publish_date       6815 non-null   object 
 9   word_count              6815 non-null   float64
 10  classification_product  6815 non-null   object 
 11  classification_type     6815 non-null   object 
 12  page_name               6815 non-null   object 
 13  title                   6815 non-null   object 
 14  author_list             6815 non-null   

In [6]:
df_nlp.head()

Unnamed: 0,page_id,n_days,date_min,n_urls,date_max,age,url,no_versions,last_publish_date,word_count,...,merged_url,meta_title_len,meta_desc_len,h1_len,abstract_len,merged_url_len,sentiment_abstract,confidence_abstract,sentiment_meta_title,confidence_meta_title
0,1037,6,2024-03-13,2,2024-03-18,22,https://efahrer.chip.de/e-wissen/aufladen_1037...,0,2024-03-10,827.0,...,"['ladestation', 'elektrofahrzeug']",55,64,61,217.0,2,positive,0.891435,neutral,0.994909
1,1039,1,2024-03-17,2,2024-03-17,697,https://efahrer.chip.de/e-wissen/elektroauto-f...,0,2022-05-05,1066.0,...,"['und', 'satt', 'absahnen', 'koennen', 'sie', ...",46,141,75,486.0,11,neutral,0.950322,neutral,0.940748
2,1040,10,2024-02-22,1,2024-03-02,40,https://efahrer.chip.de/news/haetten-sie-es-ge...,0,2024-02-21,466.0,...,"['haetten', 'gibt', 'autos', 'sie', 'e', 'scho...",53,155,53,264.0,10,negative,0.980951,neutral,0.967907
3,10245,3,2023-03-25,1,2023-03-27,2282,https://efahrer.chip.de/news/kleinste-auto-der...,0,2018-01-01,0.0,...,"['bauen', 'selber', 'kleinste', 'kommt', 'koen...",71,156,71,314.0,12,neutral,0.999933,neutral,0.962445
4,10273,24,2023-12-01,4,2024-03-06,2282,https://efahrer.chip.de/news/mehr-reichweite-i...,4,2024-02-28,530.5,...,"['alles', 'aus', 'mehr', 'sie', 'akku', 'reich...",72,152,72,262.0,14,neutral,0.899187,negative,0.946896


### Scale target variable

In [7]:
scaler = StandardScaler()
scaler.fit(df_nlp[['external_impressions']])
# Transform the target variable
df_nlp['external_impressions_scaled'] = scaler.transform(df_nlp[['external_impressions']])

### Remove stop words

In [8]:
def remove_stopwords(text):
    if isinstance(text, str):
        words = word_tokenize(text)
        # Remove punctuation and special characters
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Remove stopwords
        return ' '.join([word for word in words if word.lower() not in stop_words])
    else:
        return text

def remove_stopwords_from_columns(df, columns):
    for col in columns:
        df[col] = df[col].apply(remove_stopwords)
    return df

columns_to_clean = ['h1','abstract','meta_title','meta_description','merged_url']
df_nlp = remove_stopwords_from_columns(df_nlp, columns_to_clean)

### Use pretrained model

In [9]:
from transformers import pipeline
import pandas as pd
import numpy as np
import tensorflow
import torch
from tqdm import tqdm
import sentencepiece
import os

In [10]:
data = df_nlp

print(f'There are {data.shape[0]} rows in the dataset')

There are 6815 rows in the dataset


In [25]:
# Set your Hugging Face token
os.environ["HF_TOKEN"] = "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"

pipe = pipeline("text-classification", model="Stremie/roberta-base-clickbait")
#pipe = pipeline("text-classification", model="valurank/distilroberta-clickbait")

config.json:   0%|          | 0.00/885 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [26]:
def classify_headline(headline):
    result = pipe(headline)[0]
    label = result['label']
    score = result['score']
    return label, score

# Apply classification function to 'h1' column
data[['label', 'score']] = df_nlp['h1'].apply(classify_headline).apply(pd.Series)

print(data.head())

   page_id  n_days    date_min  n_urls    date_max   age  \
0     1037       6  2024-03-13       2  2024-03-18    22   
1     1039       1  2024-03-17       2  2024-03-17   697   
2     1040      10  2024-02-22       1  2024-03-02    40   
3    10245       3  2023-03-25       1  2023-03-27  2282   
4    10273      24  2023-12-01       4  2024-03-06  2282   

                                                 url  no_versions  \
0  https://efahrer.chip.de/e-wissen/aufladen_1037...            0   
1  https://efahrer.chip.de/e-wissen/elektroauto-f...            0   
2  https://efahrer.chip.de/news/haetten-sie-es-ge...            0   
3  https://efahrer.chip.de/news/kleinste-auto-der...            0   
4  https://efahrer.chip.de/news/mehr-reichweite-i...            4   

  last_publish_date  word_count  ... h1_len abstract_len merged_url_len  \
0        2024-03-10       827.0  ...     61        217.0              2   
1        2022-05-05      1066.0  ...     75        486.0             11   

In [28]:
data[['h1','label']].query("label == 'Clickbait'")

Unnamed: 0,h1,label
2,Hätten gewusst ? lange gibt schon E-Autos,Clickbait
41,Solaranlage-Mythen : 10 Gerüchte stimmen,Clickbait
50,"schwer , teuer & unhandlich : 7 Argumente Kauf...",Clickbait
73,Verkehrsschild Schneeketten : blaue Schild ver...,Clickbait
74,Streusalz benutzen : erlaubt ? Alternative leg...,Clickbait
...,...,...
6667,Akkus verlieren Energie : Spritze leistungsfähig,Clickbait
6677,Wer fährt eigentlich Roller Deutschland ? sieh...,Clickbait
6747,15 größten Solarfirmen Welt : Gibt deutsches U...,Clickbait
6774,Öffentliche Ladestationen : Zugangsbeschränkun...,Clickbait


In [29]:
df_nlp.to_csv('/clickbait.csv', encoding='utf-8', index=False)