In [42]:
import string 

import pandas as pd
import nltk
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
nlp = spacy.load('de_core_news_sm') 

#Definition für top keywords bzw. Ermittlung TF-IDF Werte
def get_top_n_kw(document, vectorizer, n):
    matrix = vectorizer.transform([document])
    scores = pd.DataFrame(matrix[0].T.todense(), index=vectorizer.get_feature_names_out(), columns=["TF-IDF"])
    scores = scores[scores["TF-IDF"]!=0]
    return {k: v["TF-IDF"] for k,v in scores.sort_values("TF-IDF", ascending=False)[:n].to_dict("index").items()}
    


In [141]:
#Preprocessing mit Lemmatisierung, Umlauten und stemming
with open ("../analysis/custom-replacements.json", "r") as file:
    replacements = json.load(file)

def preprocess(article):
    document = " ".join([article["text"], article["title"], article["surtitle"]]).lower()
    document.replace("sammads", "sammad")
    for sw in ["\n"]:
        document = document.replace(sw, " ")
    # part of speech tagging und lemmatization
    document = " ".join([t.lemma_ for t in nlp(document) if t.pos_ in ["NOUN", "PROPN", "VERB", "X"]]).lower()
    for k,v in replacements.items():
        document = document.replace(k, v)
    for n in range(10):
        document = document.replace(str(n), "")
    return document

In [142]:
import json
with open("../data/sz/all-relevant-fulltext.json", "r", encoding="utf-8") as file:
    s = file.read()
    articles_sz = json.loads(s)

with open("../data/faz/all-relevant-fulltext.json", "r", encoding="utf-8") as file:
    s = file.read()
    articles_faz = json.loads(s)

corpus = [preprocess(article) for article in articles_faz + articles_sz]


In [143]:
nltk.download('stopwords')
german_stop_words = stopwords.words('german')

tfidf_vect = TfidfVectorizer(stop_words=german_stop_words)
tfidf_vect = tfidf_vect.fit(corpus)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ddoro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [144]:

final_keywords = []

for doc in corpus:
    keywords = get_top_n_kw(doc, tfidf_vect, 10).keys()
    final_keywords += list(keywords)

print(f"A total of {len(final_keywords)} keywords were produced" )
unique_keywords = list(set(final_keywords))
print(f"An {len(unique_keywords)} unique ones")


A total of 6830 keywords were produced
An 3350 unique ones


In [147]:
from collections import Counter
c = Counter(final_keywords)
[(x,v) for x,v in dict(c).items() if v>1]

[('houthi', 96),
 ('rebellen', 25),
 ('sammad', 2),
 ('führer', 4),
 ('tod', 10),
 ('militärkoalition', 17),
 ('luftangriff', 20),
 ('äußern', 2),
 ('gesundheitsministerium', 4),
 ('gehen', 5),
 ('militär', 4),
 ('terrorismus', 2),
 ('extremistengruppe', 3),
 ('töten', 25),
 ('aqap', 11),
 ('militäratz', 4),
 ('terrormiliz', 4),
 ('alqaida', 31),
 ('opposition', 2),
 ('versichern', 2),
 ('luftwaffe', 2),
 ('sold', 3),
 ('soldat', 14),
 ('selbstmordanschlag', 4),
 ('attentäter', 3),
 ('sicherheitskreis', 3),
 ('tat', 2),
 ('bekennen', 3),
 ('amt', 7),
 ('saudis', 10),
 ('iran', 29),
 ('huthis', 37),
 ('einfluss', 4),
 ('stellvertreterkrieg', 2),
 ('staat', 10),
 ('saudiarabien', 35),
 ('jemer', 8),
 ('kaserne', 3),
 ('anschlag', 12),
 ('treue', 3),
 ('zahlung', 2),
 ('selbstmordattentäter', 4),
 ('sprengen', 2),
 ('attentat', 2),
 ('lösung', 2),
 ('eu', 4),
 ('krise', 4),
 ('jemen', 14),
 ('krieg', 32),
 ('krankheit', 4),
 ('kind', 25),
 ('bedrohen', 3),
 ('konfliktland', 2),
 ('attacke

In [132]:
with open("tmp.txt", "w", encoding="utf-8") as f:
    json.dump(unique_keywords, f, indent=2, ensure_ascii=False)

In [110]:
unique_keywords


['laufend',
 'jugendliche',
 'berbera',
 'familie',
 'carter',
 'vereint',
 '325',
 'verdsfäll',
 'emiratisch',
 'computer',
 'verhalten',
 'bombenteil',
 'anderslautend',
 'pünktlich',
 'menschengedenke',
 'petra',
 'behandlungsmöglichkeit',
 'emirate',
 'indisch',
 'beistand',
 'zusammentreffen',
 'häftling',
 'wasserleitung',
 'verbünden',
 'extrem',
 'verständigen',
 'doha',
 'vortragen',
 'amt',
 'shiban',
 'nonn',
 'zusagen',
 'gespräche',
 '2600',
 'weniger',
 'direktor',
 'liga',
 'beladen',
 'fremd',
 'tor',
 'vergessen',
 'friedensgesprächen',
 'regierungssitz',
 'tatendrang',
 'krisensitzung',
 'pca',
 'durchtrieben',
 'mhabers',
 'sprecher',
 'ortsansässig',
 'befreien',
 'washington',
 'gespalten',
 'beiderseitig',
 'ramadan',
 'kontrollierte',
 'küstenprovinz',
 '6000',
 '00',
 'neu',
 '161',
 'ölraffinerie',
 'kriegsschiffe',
 'umschlagplatz',
 'york',
 'militärintervention',
 'menschenrechtsbeauftragt',
 'organisieren',
 'almandab',
 'provinzhauptstadt',
 'aufständisch'

{'die': 0.22606718257288827,
 'der': 0.19833280394310387,
 'rebellen': 0.18224222693436887}

{'machine': 0.7559289460184544, 'learning': 0.3779644730092272}

In [14]:
df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names_out(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

            TF-IDF
machine   0.513720
learning  0.513720
about     0.256860
subject   0.256860
phd       0.256860
and       0.256860
my        0.256860
is        0.256860
reading   0.195349
enjoy     0.195349
library   0.000000
park      0.000000
in        0.000000
the       0.000000
walk      0.000000
was       0.000000
would     0.000000


In [19]:
get_values_with_feature_names(tfIdfVectorizer.transform([dataset[0]]), tfIdfVectorizer)


Unnamed: 0,TF-IDF
about,0.25686
and,0.25686
enjoy,0.195349
in,0.0
is,0.25686
learning,0.51372
library,0.0
machine,0.51372
my,0.25686
park,0.0
