# Keywords extraction with [KeyBERT](https://maartengr.github.io/KeyBERT/)
This notebook demonstrates keyword extraction using KeyBERT. The article titles per day and per website are consolidated, and keywords are extracted from this text. To process the German texts, the [distiluse-base-multilingual-cased-v1](https://www.sbert.net/docs/pretrained_models.html) Sentence Transformers model was selected.

Since the model has a maximum sequence length of 128, the idea was to divide the text into shorter sequences and extract the keywords for each sequence. It was determined that for each day/website, 5 sequences would be created, from which 2 keywords would be extracted. This approach should cover the majority of the text.


In [1]:
from keybert import KeyBERT
import pandas as pd
import numpy as np
import math
import os
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sentence_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
kw_model = KeyBERT(model=sentence_model)

In [3]:
df_articles = os.listdir('./data/articles')

In [4]:
df_articles

['23_11_18_23_blick.csv',
 '23_11_18_23_srf.csv',
 '23_11_19_23_blick.csv',
 '23_11_19_23_srf.csv',
 '23_11_20_23_blick.csv',
 '23_11_20_23_srf.csv',
 '23_11_21_23_blick.csv',
 '23_11_21_23_srf.csv',
 '23_11_22_23_blick.csv',
 '23_11_22_23_srf.csv',
 '23_11_23_23_blick.csv',
 '23_11_23_23_srf.csv',
 '23_11_24_23_blick.csv',
 '23_11_24_23_srf.csv']

In [6]:
# DataFrame for all keywords
df_keywords = pd.DataFrame(columns=['file', 'section', 'keyword'])
# Number of sections to split the content into
num_sections = 5

for df_name in df_articles:
    df_file = pd.read_csv(f'./data/articles/{df_name}')
    
    # Convert all titles to list
    all_content = df_file['title'].tolist()
    # all_content to one string
    all_content = ' '.join(all_content)
    
    # Split the content into sections
    section_length = math.ceil(len(all_content) / num_sections)
    content_sections = [all_content[i:i+section_length] for i in range(0, len(all_content), section_length)]

    # Number of sections in the article
    num_sections_actual = len(content_sections)
    
    # Create a temporary DataFrame
    df_temp = pd.DataFrame(columns=['file', 'section', 'keyword'])
    
    # Extract keywords for each section
    for i, section in enumerate(content_sections):
        keywords = kw_model.extract_keywords(section, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=2)
        
        # Extract only the words from the keywords
        keywords_without_prob = [word for word, _ in keywords]
        
        # Add the keywords to the temporary DataFrame
        if keywords_without_prob:
            df_temp = pd.concat([df_temp, pd.DataFrame({'file': [df_name], 'section': [i+1], 'keyword': [keywords_without_prob]})], ignore_index=True)
    
    # Add the temporary DataFrame to the main DataFrame
    df_keywords = pd.concat([df_keywords, df_temp], ignore_index=True)


In [7]:
# Combine the keywords for each section
combined_df = df_keywords.groupby('file').agg({'section': list, 'keyword': sum}).reset_index()
combined_df.columns = ['file', 'sections_combined', 'keywords_combined']

In [8]:
combined_df['keywords_combined']

0     [schweiz, gefahr, unfall, feuer, ukrainischen,...
1     [politik, wofür, thailand, energieversorger, w...
2     [blumen, mobilisierung, crash, golfbälle, wolf...
3     [wahlsiegerin, kandidat, linke, linkem, schwei...
4     [tunnelwand, flugzeugunglück, neuenkirch, kant...
5     [reichsten, wahrheit, einwanderung, neue, weib...
6     [schweizer, schnee, verkehrsunfall, minivan, n...
7     [recycling, rapperswil, bundeshaus, orban, pos...
8     [tragische, tragischer, schweizern, ukraine, i...
9     [rechtspopulisten, kinderspital, elefantenkuh,...
10    [flughafenpolizei, polizei, schweiz, sexualdel...
11    [schweizerinnen, schweizer, schweiz, rabbiner,...
12    [sterben, spanier, ostschweizer, reparaturarbe...
13    [atommacht, schweizer, medizinstudium, studier...
Name: keywords_combined, dtype: object

In [10]:
combined_df = combined_df.drop(columns=['sections_combined'])

In [12]:
combined_df.head()

Unnamed: 0,file,keywords_combined
0,23_11_18_23_blick.csv,"[schweiz, gefahr, unfall, feuer, ukrainischen,..."
1,23_11_18_23_srf.csv,"[politik, wofür, thailand, energieversorger, w..."
2,23_11_19_23_blick.csv,"[blumen, mobilisierung, crash, golfbälle, wolf..."
3,23_11_19_23_srf.csv,"[wahlsiegerin, kandidat, linke, linkem, schwei..."
4,23_11_20_23_blick.csv,"[tunnelwand, flugzeugunglück, neuenkirch, kant..."


In [22]:
# store dataframe as csv file
combined_df.to_csv('./data/articles_keywords_23_11_18-23.csv', index=False)