# Notebook for labeling Articles with ranking of related google searches (extracted from google trends)

1. Load Data
2. Labeling Script

In [2]:
from transformers import pipeline
import pandas as pd
from tqdm import tqdm

2024-04-15 22:41:22.688070: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Preprocessing (one-time job): Concatenate the "Wallbox Laden" and "Solargenerator" query files with the original related-queries file

In [None]:
# # Read original data with missing rows
# df_labels_orig = pd.read_csv('../data/related_queries_orig.csv')
# df_labels_orig.drop(['Unnamed: 0'], axis=1, inplace=True)    # file contains unnecessary column

# # Read data for "Wallbox/Laden" and "Solargenerator" 
# df_wall = pd.read_csv('../data/relatedQueries_Wallbox-Laden.csv')
# df_wall['classification_product'] = 'Wallbox/Laden'  # the Google search term was "Wallbox Laden" and has to be changed to "Wallbox/Laden" to enable joining with the real classification_product
# df_sol = pd.read_csv('../data/relatedQueries_Solargenerator.csv')
# df_sol['classification_product'] = 'Solargenerator' 

# # concatenate all data and export it as CSV
# df_labels_new = pd.concat([df_labels_orig, df_wall, df_sol], axis=0).reset_index(drop=True)
# df_labels_new.to_csv('../data/related_queries.csv', encoding='utf-8', index=False)

## 1. Load data for labeling process

In [38]:
# Locations of the two input CSVs: the feature table and the related-queries
# table that provides the candidate labels.
file_path_features = '../data/data_features.csv'
file_path_labels = '../data/related_queries.csv'

# Read both files in one pass; the order of the tuple matches the targets.
df, df_labels = (pd.read_csv(csv_path) for csv_path in (file_path_features, file_path_labels))

## 2. Enrich page_ids with google score

In [4]:
# Zero-shot classification pipeline backed by the
# joeddav/xlm-roberta-large-xnli checkpoint, placed on device 0.
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    device=0,
)

2024-04-15 22:41:35.891100: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-15 22:41:35.891135: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

All the layers of TFXLMRobertaForSequenceClassification were initialized from the model checkpoint at joeddav/xlm-roberta-large-xnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.


In [5]:
# Build the working frame: keep only the columns needed for labeling and add
# one free-text column that joins abstract, meta description and meta title.
relevant_columns = ['page_id', 'classification_product', 'abstract', 'meta_description', 'meta_title']
df_gscore = df[relevant_columns].copy()

# Missing text is treated as an empty string so the concatenation never yields NaN.
filled_text = df_gscore[['abstract', 'meta_description', 'meta_title']].fillna('')
df_gscore['text_to_classify'] = (
    filled_text['abstract']
    + ' '
    + filled_text['meta_description']
    + ' '
    + filled_text['meta_title']
)

# Sanity check: frame dimensions and remaining missing values per column.
display(df_gscore.shape)
display(df_gscore.isna().sum())

(6815, 6)

page_id                   0
classification_product    0
abstract                  7
meta_description          0
meta_title                0
text_to_classify          0
dtype: int64

In [17]:
def get_predictions_score(prediction):
    """Return the best label of a classification result together with its score.

    ``prediction`` is a mapping holding two parallel sequences under the keys
    ``'labels'`` and ``'scores'``. The label at the position of the highest
    score is returned; when several scores tie, the earliest position wins.

    Returns:
        A ``(label, score)`` tuple.
    """
    labels = prediction['labels']
    scores = prediction['scores']

    # max() over (position, score) pairs keeps the first maximum on ties.
    best_position, best_score = max(enumerate(scores), key=lambda pair: pair[1])

    return labels[best_position], best_score

In [18]:
def trends_classify(filter, testrun=False, df_labels=df_labels, df_gscore=df_gscore, classifier=classifier):
    """Label the articles of one product category with their best-matching search query.

    The candidate labels are the ``query`` values of the ``df_labels`` rows whose
    ``classification_product`` equals ``filter``. Every ``text_to_classify`` value
    of the matching ``df_gscore`` rows is passed to ``classifier`` together with
    those candidates, and the top label/score is stored per row.

    Note: the default values of ``df_labels``, ``df_gscore`` and ``classifier``
    are bound to the notebook globals at the moment this cell is executed.

    Args:
        filter: ``classification_product`` value to process. The name shadows
            the builtin but is kept so existing callers keep working.
        testrun: If True, only the first two matching rows are classified.
        df_labels: Frame with ``classification_product`` and ``query`` columns.
        df_gscore: Frame with ``classification_product`` and
            ``text_to_classify`` columns.
        classifier: Callable invoked as ``classifier(text, candidate_labels)``;
            its result is handed to ``get_predictions_score``.

    Returns:
        An independent copy of the selected ``df_gscore`` rows with the added
        columns ``predicted_query_label`` and ``predicted_probability``.

    Raises:
        ValueError: If rows have to be classified but the category has no
            candidate labels in ``df_labels``.
    """
    labels_for_category = df_labels[df_labels['classification_product'] == filter]
    candidate_labels = labels_for_category['query'].astype(str).tolist()

    selected_rows = df_gscore[df_gscore['classification_product'] == filter]
    if testrun:
        selected_rows = selected_rows.iloc[0:2]

    # Work on an independent copy: adding columns to a slice of df_gscore
    # raises pandas' SettingWithCopyWarning and is not guaranteed to stick.
    df_gscore_iter = selected_rows.copy()

    if df_gscore_iter.empty:
        # Nothing to classify: unpacking zip(*[]) would fail, so return the
        # empty frame with the two result columns already in place.
        df_gscore_iter['predicted_query_label'] = pd.Series(dtype=object)
        df_gscore_iter['predicted_probability'] = pd.Series(dtype=float)
        return df_gscore_iter

    if not candidate_labels:
        # Fail early with the category name instead of an error from deep
        # inside the classifier call.
        raise ValueError(
            f"No candidate queries found in df_labels for classification_product {filter!r}"
        )

    tqdm.pandas(desc=f"Googel search related keyword classification for {filter}")
    predictions = df_gscore_iter['text_to_classify'].progress_apply(
        lambda text: get_predictions_score(classifier(text, candidate_labels))
    )

    # Each prediction is a (label, probability) tuple; split it into two columns.
    df_gscore_iter['predicted_query_label'] = [label for label, _ in predictions]
    df_gscore_iter['predicted_probability'] = [probability for _, probability in predictions]

    return df_gscore_iter

In [19]:
# Distinct product categories found in the feature table.
class_product = df['classification_product'].unique().tolist()

# Empty result frame that fixes the column layout of the classified output.
df_gscore_out = pd.DataFrame(
    columns=[*relevant_columns, 'text_to_classify', 'predicted_query_label', 'predicted_probability']
)


In [20]:
# Classify every product category and collect the per-category frames.
# NOTE(review): testrun=True limits each category to its first two rows, so
# everything derived from df_gscore_out below is test-run data only. Switch it
# off for a full (slow) labeling run.
classified_frames = [trends_classify(cp, testrun=True) for cp in tqdm(class_product)]

# Concatenate once instead of growing df_gscore_out inside the loop: this
# avoids repeated copying, keeps the cell idempotent on re-run (no duplicated
# rows) and avoids concatenating onto an empty frame.
# With no categories, df_gscore_out is left as it was before this cell.
if classified_frames:
    df_gscore_out = pd.concat(classified_frames, axis=0).reset_index(drop=True)

Googel search related keyword classification for E-Auto: 100%|██████████| 2/2 [00:31<00:00, 15.68s/it]
Googel search related keyword classification for Auto: 100%|██████████| 2/2 [00:29<00:00, 14.55s/it]
Googel search related keyword classification for Zubehör: 100%|██████████| 2/2 [00:09<00:00,  4.83s/it]
Googel search related keyword classification for Motorrad: 100%|██████████| 2/2 [00:30<00:00, 15.20s/it]
Googel search related keyword classification for Energie: 100%|██████████| 2/2 [00:24<00:00, 12.14s/it]
Googel search related keyword classification for Verkehr: 100%|██████████| 2/2 [00:06<00:00,  3.45s/it]
Googel search related keyword classification for Wallbox/Laden: 100%|██████████| 2/2 [00:31<00:00, 15.75s/it]
Googel search related keyword classification for Solaranlagen: 100%|██████████| 2/2 [00:09<00:00,  4.85s/it]
Googel search related keyword classification for E-Bike: 100%|██████████| 2/2 [00:30<00:00, 15.31s/it]
Googel search related keyword classification for Fahrrad:

In [39]:
# Give the query column the same name as the prediction column so both frames
# share their join keys.
df_labels = df_labels.rename({'query': 'predicted_query_label'}, axis='columns')

In [40]:
# Left join: keep every classified row and attach the remaining df_labels
# columns for its (category, predicted query) pair.
df_gscore_new = pd.merge(
    df_gscore_out,
    df_labels,
    how='left',
    on=['classification_product', 'predicted_query_label'],
)

In [41]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the output.
df_labels.info()

# Row counts before and after the left merge; they should match.
display(df_gscore_out.shape)
display(df_gscore_new.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   predicted_query_label   351 non-null    object
 1   value                   351 non-null    int64 
 2   classification_product  351 non-null    object
dtypes: int64(1), object(2)
memory usage: 8.4+ KB


None

(34, 8)

(34, 9)

In [46]:
# Rename the merged 'value' column to 'query_score'.
df_gscore_new = df_gscore_new.rename({'value': 'query_score'}, axis='columns')

In [47]:
# index=False keeps the meaningless RangeIndex out of the file; otherwise it is
# written as an unnamed first column that shows up as 'Unnamed: 0' on re-read.
# This matches the related_queries.csv export earlier in this notebook.
df_gscore_new.to_csv('../data/google_trends/data_trends_classified.csv', index=False)