### NLP

In [1]:
import pandas as pd
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import string

stop_words = set(stopwords.words('german'))

[nltk_data] Downloading package stopwords to /Users/clara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/clara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/clara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Extract the most important words from title, h1, abstract, url with NLP (e.g. TF-IDF )

In [7]:
df = pd.read_csv('/Users/clara/Desktop/neuefische/d-drivers/data/data_nlp.csv')

In [10]:
df.columns

Index(['page_id', 'n_days', 'url', 'n_urls', 'no_versions', 'age',
       'last_publish_date', 'publish_date_min', 'word_count',
       'classification_product', 'classification_type', 'page_name', 'title',
       'last_author', 'external_clicks', 'external_impressions',
       'likes_n_days', 'dislikes_n_days', 'video_play', 'page_impressions',
       'clickouts', 'ctr', 'h1', 'scraped_author', 'date_scraped', 'abstract',
       'scraped_word_count', 'meta_title', 'meta_description',
       'meta_image_url', 'media_type', 'page_img_size', 'merged_url',
       'meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len',
       'merged_url_len', 'sentiment_abstract', 'confidence_abstract',
       'sentiment_meta_title', 'confidence_meta_title'],
      dtype='object')

In [11]:
df_nlp = df[['page_id', 'external_clicks', 'external_impressions', 'ctr', 'h1', 
        'abstract', 'meta_title', 'meta_description', 'merged_url']]

In [12]:
df_nlp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6815 entries, 0 to 6814
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   page_id               6815 non-null   int64  
 1   external_clicks       6815 non-null   float64
 2   external_impressions  6815 non-null   float64
 3   ctr                   6815 non-null   float64
 4   h1                    6815 non-null   object 
 5   abstract              6808 non-null   object 
 6   meta_title            6815 non-null   object 
 7   meta_description      6815 non-null   object 
 8   merged_url            6815 non-null   object 
dtypes: float64(3), int64(1), object(5)
memory usage: 479.3+ KB


In [13]:
df_nlp.head()

Unnamed: 0,page_id,external_clicks,external_impressions,ctr,h1,abstract,meta_title,meta_description,merged_url
0,1037,256.0,5444.0,4.702425,Elektrofahrzeug-Ladestation: Kosten und Anbiet...,"Im Grunde kann man sein E-Auto überall laden, ...","Elektrofahrzeug-Ladestation: Anbieter, Kosten,...",Ladestationen für Elektroautos. Kosten und Anb...,"['elektrofahrzeug', 'ladestation']"
1,1039,124.0,1728.0,7.175926,Elektroauto-Förderung 2024: Wo Sie noch satt a...,Die große staatliche Förderung für Elektroauto...,E-Auto Förderung 2024: Alle Infos im Überblick,E-Auto Prämie 2024- so können Sie beim Kauf Ge...,"['absahnen', 'koennen', 'satt', '2024', 'sie',..."
2,1040,83.0,2575.0,3.223301,Hätten Sie es gewusst? So lange gibt es schon ...,Noch vor wenigen Jahren wurden Elektroautos al...,Hätten Sie es gewusst? So lange gibt es schon ...,Noch vor wenigen Jahren wurden Elektroautos al...,"['so', 'gewusst', 'e', 'haetten', 'autos', 'si..."
3,10245,80.0,845.0,9.467456,Kleinstes Auto der Welt kommt zurück: Fans kön...,"Der P.50, auch bekannt als das kleinste Auto d...",Kleinstes Auto der Welt kommt zurück: Fans kön...,"Der P.50, auch bekannt als das kleinste Auto d...","['zurueck', 'welt', 'fans', 'koennen', 'kommt'..."
4,10273,3384.0,94132.0,3.594952,Mehr Reichweite im Winter: So holen Sie alles ...,Wer freut sich schon über eisige Kälte? E-Auto...,Mehr Reichweite im Winter: So holen Sie alles ...,Wer freut sich schon über eisige Kälte? E-Auto...,"['so', 'e', 'ihrem', 'raus', 'reichweite', 'au..."


### Scale target variable

In [14]:
scaler = StandardScaler()
scaler.fit(df_nlp[['external_impressions']])
# Transform the target variable
df_nlp['external_impressions_scaled'] = scaler.transform(df_nlp[['external_impressions']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nlp['external_impressions_scaled'] = scaler.transform(df_nlp[['external_impressions']])


### Remove stop words

In [15]:
def remove_stopwords(text):
    if isinstance(text, str):        
        words = word_tokenize(text)
        # Remove punctuation and special characters
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Remove stopwords
        return ' '.join([word for word in words if word.lower() not in stop_words])
    else:
        return text

def remove_stopwords_from_columns(df, columns):
    for col in columns:
        df[col] = df[col].apply(remove_stopwords)
    return df

columns_to_clean = ['h1','abstract','meta_title','meta_description','merged_url']
df_nlp = remove_stopwords_from_columns(df_nlp, columns_to_clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(remove_stopwords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(remove_stopwords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(remove_stopwords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [16]:
df_nlp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6815 entries, 0 to 6814
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   page_id                      6815 non-null   int64  
 1   external_clicks              6815 non-null   float64
 2   external_impressions         6815 non-null   float64
 3   ctr                          6815 non-null   float64
 4   h1                           6815 non-null   object 
 5   abstract                     6808 non-null   object 
 6   meta_title                   6815 non-null   object 
 7   meta_description             6815 non-null   object 
 8   merged_url                   6815 non-null   object 
 9   external_impressions_scaled  6815 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 532.6+ KB


In [17]:
df_nlp.to_csv('/Users/clara/Desktop/neuefische/d-drivers/data/preprocessing_nlp.csv', encoding='utf-8', index=False)


### Use pretrained model

In [18]:
from transformers import pipeline
import pandas as pd
import numpy as np
import tensorflow
import torch
from tqdm import tqdm
import sentencepiece

In [19]:
data = df_nlp

print(f'There are {data.shape[0]} rows in the dataset')

There are 6815 rows in the dataset


In [20]:
#Preparing the pipeline in one-line of code!
#classifier = pipeline("zero-shot-classification",device = 0)
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Define the function get_predictions_score
def get_predictions_score(prediction):
    pred_labels = prediction['labels']
    pred_scores = prediction['scores']
    
    # Find the index of the label with the highest probability
    max_index = pred_scores.index(max(pred_scores))
    
    # Extract the label and its corresponding probability
    max_label = pred_labels[max_index]
    max_probability = pred_scores[max_index]
    
    return max_label, max_probability

# Define the sequence, candidate_labels, and make predictions
sequence = df_nlp['h1'].iloc[1]  # Assuming 'h1' is the column you want to process
candidate_labels = ["e auto",
                    "e-auto laden",
                    "leasing e-auto",
                    "förderung e-auto",
                    "elektroauto",
                    "e-auto kaufen",
                    "e-auto ladestation",
                    "vw",
                    "e-auto reichweite",
                    "vw e-auto",
                    "e auto laden",
                    "tesla",
                    "e-auto kleinwagen",
                    "auto leasing",
                    "wallbox",
                    "e-auto gebraucht",
                    "bmw e-auto",
                    "förderung e-auto 2023",
                    "e-auto vergleich",
                    "leasing e auto",
                    "skoda",
                    "opel",
                    "skoda e-auto",
                    "opel e-auto",
                    "e-auto prämie",
                    "auto kaufen",
                    "auto mieten",
                    "auto leasing",
                    "mein auto",
                    "android auto",
                    "auto mobile",
                    "auto versicherung",
                    "wir kaufen dein auto",
                    "auto abmelden",
                    "auto verkaufen",
                    "auto batterie",
                    "auto kennzeichen",
                    "gebrauchtwagen",
                    "auto kaufvertrag",
                    "ebay kleinanzeigen auto",
                    "auto abo",
                    "auto gebraucht kaufen",
                    "auto ummelden",
                    "elektro auto",
                    "hybrid auto",
                    "auto waschen",
                    "auto bild",
                    "auto werkstatt",
                    "mg auto",
                    "auto teile",
                    "camping zubehör",
                    "vw zubehör",
                    "dyson zubehör",
                    "kärcher zubehör",
                    "tm6 zubehör",
                    "monsieur cuisine",
                    "vorwerk thermomix zubehör",
                    "bmw",
                    "bmw motorrad",
                    "motorrad louis",
                    "honda motorrad",
                    "louis",
                    "honda",
                    "führerschein motorrad",
                    "führerschein",
                    "motorrad kaufen",
                    "125 motorrad",
                    "polo",
                    "yamaha motorrad",
                    "yamaha",
                    "suzuki",
                    "suzuki motorrad",
                    "kawasaki motorrad",
                    "kawasaki",
                    "motorrad unfall",
                    "motorrad gebraucht",
                    "ktm",
                    "ktm motorrad",
                    "triumph",
                    "triumph motorrad",
                    "motorrad batterie",
                    "motorrad reifen",
                    "cottbus",
                    "maingau energie",
                    "maingau",
                    "erneuerbare energie",
                    "new energie",
                    "rhein energie stadion",
                    "kinetische energie",
                    "team energie",
                    "montana energie",
                    "energie sparen",
                    "energie bkk",
                    "grünwelt energie",
                    "eon",
                    "sachsen energie",
                    "lekker energie",
                    "potentielle energie",
                    "energie einheit",
                    "elektroniker für energie und gebäudetechnik",
                    "energie cottbus tickets",
                    "gebäude energie gesetz",
                    "jump verkehr",
                    "maps verkehr",
                    "ndr 2 verkehr",
                    "welche bereiche müssen bei stockendem verkehr freigehalten werden",
                    "rpr1 verkehr",
                    "photovoltaik",
                    "solaranlagen kosten",
                    "solaranlagen mit speicher",
                    "wechselrichter",
                    "solaranlagen rechner",
                    "premium solaranlagen gmbh",
                    "enpal erfahrungen",
                    "e bike",
                    "fahrrad",
                    "e-bike damen",
                    "cube",
                    "e-bike cube",
                    "e-bike test",
                    "bosch e-bike",
                    "ebike",
                    "e-bike kaufen",
                    "e-bike gebraucht",
                    "e-bike herren",
                    "e-bike fully",
                    "e-bike trekking",
                    "fischer e-bike",
                    "e bike damen",
                    "fahrradträger",
                    "ktm e-bike",
                    "cube e bike",
                    "prophete e-bike",
                    "e-bike mountainbike",
                    "mountainbike",
                    "klapprad e-bike",
                    "kalkhoff e-bike",
                    "e-bike leasing",
                    "pedelec",
                    "fahrrad xxl",
                    "fahrrad fahren",
                    "cube fahrrad",
                    "cube",
                    "kinder fahrrad",
                    "fahrrad kaufen",
                    "damen fahrrad",
                    "e bike",
                    "fahrrad stadler",
                    "stadler",
                    "28 zoll fahrrad",
                    "decathlon fahrrad",
                    "26 zoll fahrrad",
                    "24 zoll fahrrad",
                    "20 zoll fahrrad",
                    "fahrrad de",
                    "puky fahrrad",
                    "woom fahrrad",
                    "woom",
                    "ebike",
                    "gepäckträger fahrrad",
                    "rennrad",
                    "kinderfahrrad",
                    "mountainbike",
                    "bulls fahrrad",
                    "scooter",
                    "e scooter",
                    "e-scooter straßenzulassung",
                    "e-scooter mit straßenzulassung",
                    "xiaomi",
                    "e-scooter xiaomi",
                    "roller",
                    "e-scooter ninebot",
                    "ninebot",
                    "e-scooter kaufen",
                    "e-scooter test",
                    "e roller",
                    "e-scooter versicherung",
                    "segway",
                    "e scooter mit straßenzulassung",
                    "xiaomi e scooter",
                    "soflow",
                    "e-roller",
                    "segway ninebot",
                    "escooter",
                    "e scooter kaufen",
                    "xiaomi pro 2",
                    "e scooter ninebot",
                    "e scooter test",
                    "kinder e-scooter",
                    "solarspeicher balkonkraftwerk",
                    "balkonkraftwerk",
                    "wechselrichter",
                    "solarspeicher 10 kw",
                    "solarspeicher kaufen",
                    "byd",
                    "huawei",
                    "solarspeicher kosten",
                    "solaranlage",
                    "solarspeicher förderung",
                    "photovoltaik",
                    "stromspeicher",
                    "solarspeicher selber bauen",
                    "solar speicher",
                    "pv speicher",
                    "solarspeicher nachrüsten",
                    "batteriespeicher",
                    "solarspeicher 24",
                    "growatt",
                    "balkonkraftwerk speicher",
                    "anker solarspeicher",
                    "solarspeicher für balkonkraftwerk",
                    "anker solix solarbank e1600",
                    "speicher balkonkraftwerk",
                    "balkonkraftwerk mit speicher",
                    "balkonkraftwerk 800 watt",
                    "förderung balkonkraftwerk",
                    "balkonkraftwerk anmelden",
                    "solar balkonkraftwerk",
                    "wechselrichter",
                    "wechselrichter balkonkraftwerk",
                    "600w balkonkraftwerk",
                    "balkonkraftwerk 600w",
                    "pv balkonkraftwerk",
                    "balkonkraftwerk test",
                    "netto balkonkraftwerk",
                    "halterung balkonkraftwerk",
                    "balkonkraftwerk kaufen",
                    "solaranlage",
                    "800w balkonkraftwerk",
                    "pv anlage",
                    "balkonkraftwerk testsieger",
                    "speicher für balkonkraftwerk",
                    "photovoltaik",
                    "balkonkraftwerk 2024",
                    "balkonkraftwerk steckdose",
                    "anker balkonkraftwerk",
                    "anker",
                    "kosten wärmepumpe",
                    "wärmepumpe förderung",
                    "luft luft wärmepumpe",
                    "luft wärmepumpe",
                    "wärmepumpe heizung",
                    "wasser wärmepumpe",
                    "wasser wasser wärmepumpe",
                    "wärmepumpe viessmann",
                    "viessmann",
                    "vaillant wärmepumpe",
                    "vaillant",
                    "wärmepumpen",
                    "warmwasser wärmepumpe",
                    "wärmepumpe altbau",
                    "wärmepumpe pool",
                    "stromverbrauch wärmepumpe",
                    "luft wasser wärmepumpe",
                    "was kostet wärmepumpe",
                    "bosch wärmepumpe",
                    "wärmepumpe preis",
                    "daikin wärmepumpe",
                    "daikin",
                    "wie funktioniert wärmepumpe",
                    "buderus wärmepumpe",
                    "buderus",
                    "versicherung kfz",
                    "versicherung auto",
                    "allianz",
                    "allianz versicherung",
                    "ergo versicherung",
                    "huk",
                    "huk versicherung",
                    "ergo",
                    "adac versicherung",
                    "adac",
                    "vhv versicherung",
                    "check24 versicherung",
                    "axa versicherung",
                    "versicherung kündigen",
                    "lvm versicherung",
                    "württembergische versicherung",
                    "devk versicherung",
                    "devk",
                    "württembergische",
                    "kfz versicherung vergleich",
                    "wgv versicherung",
                    "hdi versicherung",
                    "wgv",
                    "nürnberger versicherung",
                    "hdi",
                    "thg prämie",
                    "thg preetz",
                    "förderung wallbox",
                    "11kw wallbox",
                    "wallbox kosten",
                    "kfw",
                    "22kw wallbox",
                    "kfw wallbox",
                    "tesla",
                    "tesla wallbox",
                    "wallbox test",
                    "test wallbox",
                    "abl wallbox",
                    "abl",
                    "keba wallbox",
                    "keba",
                    "wallbox huawei",
                    "wallbox kaufen",
                    "wallbox heidelberg",
                    "wallbox 11 kw",
                    "wallbox easee",
                    "ladestation",
                    "kfw förderung",
                    "wallbox anmelden",
                    "überschussladen wallbox",
                    "mobile wallbox",
                    "wallbox installation",
                    "bidirektionales laden wallbox",
                    "tesla",
                    "bidirektionale wallbox"
                ]
pred = classifier(sequence, candidate_labels)

# Apply the function to the DataFrame column and save the result in new columns
df_nlp['predicted_label'], df_nlp['predicted_probability'] = zip(*df_nlp['h1'].apply(lambda x: get_predictions_score(classifier(x, candidate_labels))))

# Display the DataFrame with the new columns
print(df_nlp)

In [None]:
df_nlp.head()

Unnamed: 0,page_id,external_clicks,external_impressions,ctr,h1,abstract,meta_title,meta_description,merged_url,predicted_label,predicted_probability
0,1037,256.0,5444.0,4.702425,Elektrofahrzeug-Ladestation : Kosten Anbieter ...,"Grunde E-Auto überall laden , Strom gibt . Tat...","Elektrofahrzeug-Ladestation : Anbieter , Koste...",Ladestationen Elektroautos . Kosten Anbieter V...,"[ 'elektrofahrzeug ' , 'ladestation ' ]",Ladeinfrastruktur für Elektroautos,0.431657
1,1039,124.0,1728.0,7.175926,Elektroauto-Förderung 2024 : satt absahnen -,große staatliche Förderung Elektroautos beende...,E-Auto Förderung 2024 : Infos Überblick,E-Auto Prämie 2024- beim Kauf Geld sparen . Fö...,"[ 'elektroauto ' , 'und ' , '2024 ' , 'koennen...",Vorschriften in der Elektrofahrzeugindustrie,0.303427
2,1040,83.0,2575.0,3.223301,Hätten gewusst ? lange gibt schon E-Autos,wenigen Jahren wurden Elektroautos absolute Ne...,Hätten gewusst ? lange gibt schon E-Autos,wenigen Jahren wurden Elektroautos absolute Ne...,"[ 'schon ' , 'so ' , 'lange ' , 'autos ' , 'ha...",Technologien für Elektroautos,0.296509
3,10245,80.0,845.0,9.467456,Kleinstes Auto Welt kommt zurück : Fans selber...,"P.50 , bekannt kleinste Auto Welt , gilt Samml...",Kleinstes Auto Welt kommt zurück : Fans selber...,"P.50 , bekannt kleinste Auto Welt , gilt Samml...","[ 'auto ' , 'koennen ' , 'bauen ' , 'jetzt ' ,...",Elektroauto-Bewertungen,0.128451
4,10273,3384.0,94132.0,3.594952,Mehr Reichweite Winter : holen E-Auto-Akku raus,Wer freut schon eisige Kälte ? E-Autos sicherl...,Mehr Reichweite Winter : holen E-Auto-Akku raus,Wer freut schon eisige Kälte ? E-Autos sicherl...,"[ 'mehr ' , 'auto ' , 'holen ' , 'so ' , 'wint...",Batterietechnologie für Elektroautos,0.295967


In [None]:
df_nlp.to_csv('/Users/clara/Desktop/neuefische/d-drivers/data/matching_trends.csv', encoding='utf-8', index=False)

### Vertex 

### Stemming

In [None]:
# tbd

### Vectorizing

In [None]:
df_nlp_vec = df_nlp.copy()
df_nlp_vec.fillna('', inplace=True)
df_nlp_vec.isna().sum() #info()

In [None]:
def vectorize_text(column, df):
    col = df[column]
    vect = CountVectorizer().fit(col)
    transformed = vect.transform(col)
    for i, feature_name in enumerate(vect.get_feature_names_out()):
        df[f'{column}_{feature_name}'] = transformed[:, i].toarray().flatten()  # Convert to dense array
    return df

for item in ['meta_description']:
    df = vectorize_text(column=item, df=df_nlp_vec)
    filename = f'../data/nlp_features_{item}.csv'
    df.to_csv(filename, encoding='utf-8', index=False)

In [None]:
def vectorize_text(column, df):
    col = df[column]
    vect = CountVectorizer().fit(col)
    transformed = vect.transform(col)
    for i, feature_name in enumerate(vect.get_feature_names_out()):
        df[f'{column}_{feature_name}'] = transformed[:, i].toarray().flatten()  # Convert to dense array
    return df

for item in ['merged_url']:
    df = vectorize_text(column=item, df=df_nlp_vec)
    filename = f'../data/nlp_features_{item}.csv'
    df.to_csv(filename, encoding='utf-8', index=False)

In [None]:
def vectorize_text(column, df):
    col = df[column]
    vect = CountVectorizer().fit(col)
    transformed = vect.transform(col)
    
    # Create a DataFrame from the transformed array
    df_transformed = pd.DataFrame(transformed.toarray(), columns=vect.get_feature_names_out(), index=df.index)
    
    # Concatenate the new DataFrame with the original DataFrame
    df_concatenated = pd.concat([df, df_transformed], axis=1)
    
    # Drop the original column
    df_concatenated.drop(columns=[column], inplace=True)
    
    return df_concatenated

# Iterate over columns to clean
for item in columns_to_clean:
    df_nlp_vec = vectorize_text(column=item, df=df_nlp_vec)
    filename = f'../data/nlp_features_{item}.csv'
    df_nlp_vec.to_csv(filename, encoding='utf-8', index=False)

In [None]:
df_full_vec = pd.merge('nlp_features_h1','nlp_features_abstract',how='left',left_index=True)

In [None]:
df_full_vec = pd.merge(nlp_features_h1, nlp_features_abstract, how='left', left_index=True, right_index=True)

In [13]:
from transformers import pipeline
import pandas as pd
import numpy as np
from tqdm import tqdm
data = pd.read_csv(
    "data/SMSSpamCollection.txt",
    encoding="utf-8",
    header=None,
    delimiter="\t",
    names=["target", "text"],
)
data.head(5)
print(f'There are {data.shape[0]} rows in the dataset')
Preparing the pipeline in one-line of code!
classifier = pipeline("zero-shot-classification",device = 0)

SyntaxError: invalid syntax (3410687870.py, line 14)