In [21]:
import pandas as pd
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, PowerTransformer
import string

# German stopword list from the NLTK corpus, stored as a set for fast
# membership checks when filtering tokens.
stop_words = set(stopwords.words('german'))

[nltk_data] Downloading package stopwords to /Users/clara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/clara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/clara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Extract the most important words from title, h1, abstract, and url with NLP (e.g. TF-IDF).

In [41]:
# Read the input CSV into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# on a machine where this exact location exists.
df = pd.read_csv(
    filepath_or_buffer='/Users/clara/Desktop/neuefische/d-drivers/data/data_nlp_A.csv',
)

In [42]:
df.columns

Index(['page_id', 'n_days', 'date_min', 'n_urls', 'date_max', 'age', 'url',
       'no_versions', 'last_publish_date', 'word_count',
       'classification_product', 'classification_type', 'page_name', 'title',
       'author_list', 'external_clicks', 'external_impressions',
       'likes_n_days', 'dislikes_n_days', 'video_play', 'page_impressions',
       'clickouts', 'ctr', 'mean_version_lifetime', 'publ_freq',
       'ext_impr_norm', 'h1', 'scraped_author', 'date_scraped', 'abstract',
       'scraped_word_count', 'meta_title', 'meta_description',
       'meta_image_url', 'media_type', 'page_img_size', 'merged_url',
       'meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len',
       'merged_url_len', 'title_has_colon', 'clickbait_label',
       'clickbait_prob', 'google_trend_prob', 'google_trend_label',
       'google_trend_score', 'video_player_types', 'sentiment_abstract',
       'confidence_abstract', 'sentiment_meta_title', 'confidence_meta_title'],
      dtype='object')

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6815 entries, 0 to 6814
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   page_id                 6815 non-null   int64  
 1   n_days                  6815 non-null   int64  
 2   date_min                6815 non-null   object 
 3   n_urls                  6815 non-null   int64  
 4   date_max                6815 non-null   object 
 5   age                     6815 non-null   int64  
 6   url                     6815 non-null   object 
 7   no_versions             6815 non-null   int64  
 8   last_publish_date       6815 non-null   object 
 9   word_count              6815 non-null   float64
 10  classification_product  6815 non-null   object 
 11  classification_type     6815 non-null   object 
 12  page_name               6815 non-null   object 
 13  title                   6815 non-null   object 
 14  author_list             6815 non-null   

In [45]:
df.head()

Unnamed: 0,page_id,n_days,date_min,n_urls,date_max,age,url,no_versions,last_publish_date,word_count,...,clickbait_label,clickbait_prob,google_trend_prob,google_trend_label,google_trend_score,video_player_types,sentiment_abstract,confidence_abstract,sentiment_meta_title,confidence_meta_title
0,1037,6,2024-03-13,2,2024-03-18,22,https://efahrer.chip.de/e-wissen/elektrofahrze...,0,2024-03-10,827.0,...,Not Clickbait,0.742681,0.534224,e-auto vergleich,15,,positive,0.891435,neutral,0.994909
1,1039,1,2024-03-17,2,2024-03-17,697,https://efahrer.chip.de/e-wissen/elektroauto-f...,0,2022-05-05,1066.0,...,Not Clickbait,0.856208,0.247981,e-auto prämie,13,1 - Standard,neutral,0.950322,neutral,0.940748
2,1040,10,2024-02-22,1,2024-03-02,40,https://efahrer.chip.de/news/haetten-sie-es-ge...,0,2024-02-21,466.0,...,Clickbait,0.773124,0.306558,elektroauto,34,3 - Widget,negative,0.980951,neutral,0.967907
3,10245,3,2023-03-25,1,2023-03-27,2282,https://efahrer.chip.de/news/kleinste-auto-der...,0,2018-01-01,0.0,...,Not Clickbait,0.827747,0.394159,elektro auto,13,,neutral,0.999933,neutral,0.962445
4,10273,24,2023-12-01,4,2024-03-06,2282,https://efahrer.chip.de/news/mehr-reichweite-i...,4,2024-02-28,530.5,...,Not Clickbait,0.780902,0.136335,elektroauto,34,1 - Standard,neutral,0.899187,negative,0.946896


### Scale target variable

In [46]:
# Raw target column -> name of its power-transformed copy. The insertion order
# determines the order in which the new columns are appended to df.
target_columns = {
    'external_impressions': 'external_impressions_scaled',
    'external_clicks': 'external_clicks_scaled',
    'ctr': 'ctr_scaled',
    'likes_n_days': 'likes_scaled',
}

# Fit one PowerTransformer per target and keep every fitted instance, so values
# on the transformed scale can be mapped back later with
# target_scalers[<raw column>].inverse_transform(...). Re-fitting a single
# shared instance would retain only the parameters of the last column.
target_scalers = {}
for raw_col, scaled_col in target_columns.items():
    scaler = PowerTransformer()
    # Double brackets: the transformer expects a 2-D (n_samples, 1) input.
    df[scaled_col] = scaler.fit_transform(df[[raw_col]])
    target_scalers[raw_col] = scaler
# After the loop `scaler` is the transformer fitted on 'likes_n_days'.

### Remove stop words

In [47]:
def remove_stopwords(text):
    """Strip punctuation and stopwords from a single text value.

    Parameters
    ----------
    text : object
        Value to clean. Only ``str`` values are processed.

    Returns
    -------
    object
        For a string: the remaining tokens joined by single spaces.
        Any non-string value is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Strip punctuation BEFORE tokenizing, so the tokens that are filtered and
    # joined below no longer contain it. Only the ASCII characters in
    # string.punctuation are removed; this includes hyphens, so a hyphenated
    # word such as "E-Auto" becomes "EAuto".
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    # Case-insensitive comparison against the module-level stop_words set.
    return ' '.join(word for word in words if word.lower() not in stop_words)

def remove_stopwords_from_columns(df, columns):
    """Run ``remove_stopwords`` over every value of the given columns.

    Each listed column of ``df`` is overwritten with its cleaned version, so
    the frame passed in is modified in place; the same object is returned.

    Parameters
    ----------
    df : DataFrame
        Frame holding the text columns.
    columns : iterable of str
        Names of the columns to clean.

    Returns
    -------
    DataFrame
        The frame that was passed in, with the listed columns replaced.
    """
    for name in columns:
        cleaned = df[name].apply(remove_stopwords)
        df[name] = cleaned
    return df

# Text columns to run through the stopword removal.
columns_to_clean = [
    'h1',
    'abstract',
    'meta_title',
    'meta_description',
]
df = remove_stopwords_from_columns(df=df, columns=columns_to_clean)

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6815 entries, 0 to 6814
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   page_id                      6815 non-null   int64  
 1   n_days                       6815 non-null   int64  
 2   date_min                     6815 non-null   object 
 3   n_urls                       6815 non-null   int64  
 4   date_max                     6815 non-null   object 
 5   age                          6815 non-null   int64  
 6   url                          6815 non-null   object 
 7   no_versions                  6815 non-null   int64  
 8   last_publish_date            6815 non-null   object 
 9   word_count                   6815 non-null   float64
 10  classification_product       6815 non-null   object 
 11  classification_type          6815 non-null   object 
 12  page_name                    6815 non-null   object 
 13  title             

### One-hot encode categorical variables

In [49]:
df.columns

Index(['page_id', 'n_days', 'date_min', 'n_urls', 'date_max', 'age', 'url',
       'no_versions', 'last_publish_date', 'word_count',
       'classification_product', 'classification_type', 'page_name', 'title',
       'author_list', 'external_clicks', 'external_impressions',
       'likes_n_days', 'dislikes_n_days', 'video_play', 'page_impressions',
       'clickouts', 'ctr', 'mean_version_lifetime', 'publ_freq',
       'ext_impr_norm', 'h1', 'scraped_author', 'date_scraped', 'abstract',
       'scraped_word_count', 'meta_title', 'meta_description',
       'meta_image_url', 'media_type', 'page_img_size', 'merged_url',
       'meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len',
       'merged_url_len', 'title_has_colon', 'clickbait_label',
       'clickbait_prob', 'google_trend_prob', 'google_trend_label',
       'google_trend_score', 'video_player_types', 'sentiment_abstract',
       'confidence_abstract', 'sentiment_meta_title', 'confidence_meta_title',
       'external_impres

In [53]:
# Columns to expand into indicator (dummy) columns.
categorical = [
    'sentiment_abstract',
    'sentiment_meta_title',
    'video_player_types',
    'clickbait_label',
    'title_has_colon',
    'media_type',
]

# drop_first=True drops the first level of every column, which then serves as
# the implicit baseline category.
# NOTE(review): rows with a missing value get all-zero indicators by default
# and so look identical to the dropped baseline level. Confirm whether any of
# these columns (video_player_types looks like a candidate) contain NaN.
df_encoded = pd.get_dummies(
    df,
    columns=categorical,
    prefix=categorical,
    drop_first=True,
)

In [54]:
df_encoded.columns

Index(['page_id', 'n_days', 'date_min', 'n_urls', 'date_max', 'age', 'url',
       'no_versions', 'last_publish_date', 'word_count',
       'classification_product', 'classification_type', 'page_name', 'title',
       'author_list', 'external_clicks', 'external_impressions',
       'likes_n_days', 'dislikes_n_days', 'video_play', 'page_impressions',
       'clickouts', 'ctr', 'mean_version_lifetime', 'publ_freq',
       'ext_impr_norm', 'h1', 'scraped_author', 'date_scraped', 'abstract',
       'scraped_word_count', 'meta_title', 'meta_description',
       'meta_image_url', 'page_img_size', 'merged_url', 'meta_title_len',
       'meta_desc_len', 'h1_len', 'abstract_len', 'merged_url_len',
       'clickbait_prob', 'google_trend_prob', 'google_trend_label',
       'google_trend_score', 'confidence_abstract', 'confidence_meta_title',
       'external_impressions_scaled', 'external_clicks_scaled', 'ctr_scaled',
       'likes_scaled', 'sentiment_abstract_neutral',
       'sentiment_abstract

In [55]:
# Write the encoded frame to disk as UTF-8 CSV, without the index column.
# NOTE(review): this is an absolute, machine-specific output path.
df_encoded.to_csv(
    path_or_buf='/Users/clara/Desktop/neuefische/d-drivers/data/preprocessing_nlp_v3.csv',
    index=False,
    encoding='utf-8',
)