# Clean Data from newspaper

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('maxent_treebank_pos_tagger')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import pandas as pd
import numpy as np
from string import digits
from gensim import corpora, models, similarities
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/celsadiaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /Users/celsadiaz/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/celsadiaz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/celsadiaz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### 1. Extract topics, remove duplicates and get training data ready

In [3]:
news_23_08 = pd.read_pickle('../../data/news_complete_23_08_17.pkl')
news_24_08 = pd.read_pickle('../../data/news_complete_24_08_17.pkl')
news_12_09 = pd.read_pickle('../../data/news_complete_12_09_17.pkl')
news = pd.concat([news_23_08,news_24_08,news_12_09])
news.drop('keywords', axis=1, inplace=True)

In [4]:
news.head(2)

Unnamed: 0,title,topic,url,meta_description,text
0,Champions-League-Qualifikation: Liverpool läss...,"(Sport, 22:34)",http://spiegel.de/sport/fussball/champions-lea...,Starker Auftritt des FC Liverpool: Auch im Rüc...,Der Traum von der Champions League ist für die...
1,Niederlande: Terrorwarnung - Rockkonzert in Ro...,"(Politik, 22:28)",http://spiegel.de/politik/ausland/rotterdam-ko...,In Rotterdam hat die Polizei das Konzert der U...,Die Band verließ den Saal unter Polizeischutz:...


In [5]:
print("Number of rows: {} and number of unique urls: {}, dropping duplicates: {}".format(len(news), 
                                                    len(news['url'].unique()), len(news.drop_duplicates(subset=['url']))))

Number of rows: 866 and number of unique urls: 662, dropping duplicates: 662


In [6]:
news_clean = news.copy()
news_clean = news_clean.drop_duplicates()

#### 1.1. Extract topic

In [7]:
topics_de = ['Politik', 'Meinung', 'Wirtschaft', 'Panorama', 'Sport', 'Kultur', 'Netzwelt', 'Wissenschaft', 'Gesundheit']
topics_en = ['politics', 'opinion', 'economy', 'society', 'sport', 'culture', 'technology', 'science', 'health']

In [8]:
news_clean['topic'] = news_clean['topic'].apply(lambda x: x.split(',')[0].split('(')[1])

In [9]:
news_clean = news_clean[news_clean['topic'].isin(topics_de)]

In [10]:
n_topics = len(news_clean['topic'].unique())
print("Found {} topics.".format(len(news_clean['topic'].unique())))

Found 8 topics.


In [31]:
pd.Series(news_clean['topic']).value_counts()

Politik         175
Panorama        130
Sport            89
Wirtschaft       76
Kultur           54
Netzwelt         29
Wissenschaft     28
Gesundheit       20
Name: topic, dtype: int64

#### 1.2. Balance train data by topic

In [84]:
def pick_random_index_n_times(source, n_times):
    indices = source.index.tolist()
    indices = np.random.choice(indices, n_times)
    return source.loc[indices]

In [109]:
n_Panorama = pd.Series(news_clean['topic']).value_counts()['Politik']-pd.Series(news_clean['topic']).value_counts()['Panorama']
n_Sport = pd.Series(news_clean['topic']).value_counts()['Politik']-pd.Series(news_clean['topic']).value_counts()['Sport']
n_Wirtschaft = pd.Series(news_clean['topic']).value_counts()['Politik']-pd.Series(news_clean['topic']).value_counts()['Wirtschaft']
n_Kultur = pd.Series(news_clean['topic']).value_counts()['Politik']-pd.Series(news_clean['topic']).value_counts()['Kultur']
n_Netzwelt = pd.Series(news_clean['topic']).value_counts()['Politik']-pd.Series(news_clean['topic']).value_counts()['Netzwelt']
n_Wissenschaft = pd.Series(news_clean['topic']).value_counts()['Politik']-pd.Series(news_clean['topic']).value_counts()['Wissenschaft']
n_Gesundheit = pd.Series(news_clean['topic']).value_counts()['Politik']-pd.Series(news_clean['topic']).value_counts()['Gesundheit']

In [110]:
df_Panorama = pick_random_index_n_times(news_clean[news_clean['topic'] == 'Panorama'], n_Panorama)
df_Sport = pick_random_index_n_times(news_clean[news_clean['topic'] == 'Sport'], n_Sport)
df_Wirtschaft = pick_random_index_n_times(news_clean[news_clean['topic'] == 'Wirtschaft'], n_Wirtschaft)
df_Kultur = pick_random_index_n_times(news_clean[news_clean['topic'] == 'Kultur'], n_Kultur)
df_Netzwelt = pick_random_index_n_times(news_clean[news_clean['topic'] == 'Netzwelt'], n_Netzwelt)
df_Wissenschaft = pick_random_index_n_times(news_clean[news_clean['topic'] == 'Wissenschaft'], n_Wissenschaft)
df_Gesundheit = pick_random_index_n_times(news_clean[news_clean['topic'] == 'Gesundheit'], n_Gesundheit)
complete_news = pd.concat([news_clean,df_Panorama,df_Sport,df_Wirtschaft,df_Kultur,df_Netzwelt,df_Wissenschaft,df_Gesundheit],ignore_index=True)

In [111]:
pd.Series(complete_news['topic']).value_counts()

Wissenschaft    197
Kultur          195
Sport           193
Wirtschaft      189
Panorama        184
Netzwelt        175
Politik         175
Gesundheit      175
Name: topic, dtype: int64

#### 1.3. Text to Vectors

In [112]:
def clean_texts(text):
    remove_digits = str.maketrans('', '', digits)
    text = text.replace('\n','').replace('(','').replace(')','').replace(':','').replace('@','').replace(';','').replace('\'','').replace("\"",'').replace('?','').replace('!','').replace('/','').replace('-','').replace('.','').replace(',','').translate(remove_digits)
    text = text.lower()
    text = " ".join(text.split())
    return text

In [129]:
def build_dictionary_and_vectorize_texts(list_of_texts):
    # to avoid messing things up, I'll call all objects relating to our first model _m1
    ldainput = [text.split() for text in list_of_texts]           # convert all strings to list of words
    id2word = corpora.Dictionary(ldainput)                       # assign a token_id to each word
    ldacorpus = [id2word.doc2bow(doc) for doc in ldainput] 
    return id2word, ldacorpus # represent each speech by (token_id, token_count) tuples

def build_dictionary_and_vectorize_texts_wo_stopwords(list_of_texts, language):
    # get stopwords for the given language
    mystopwords = set(stopwords.words(language))
    list_of_texts = [" ".join([w for w in text.split() if w not in mystopwords]) for text in list_of_texts]
    
    # to avoid messing things up, I'll call all objects relating to our first model _m1
    ldainput = [text.split() for text in list_of_texts]           # convert all strings to list of words
    id2word = corpora.Dictionary(ldainput)                       # assign a token_id to each word
    ldacorpus = [id2word.doc2bow(doc) for doc in ldainput] 
    return id2word, ldacorpus # represent each speech by (token_id, token_count) tuples

def build_dictionary_and_vectorize_texts_wo_random_wo_common_words(list_of_texts, min_count, max_freq):
    # to avoid messing things up, I'll call all objects relating to our first model _m1
    ldainput = [text.split() for text in list_of_texts]           # convert all strings to list of words
    id2word = corpora.Dictionary(ldainput)
    id2word.filter_extremes(no_below=min_count, no_above=max_freq) 
    ldacorpus = [id2word.doc2bow(doc) for doc in ldainput] 
    return id2word, ldacorpus# represent each speech by (token_id, token_count) tuples

def build_dictionary_and_vectorize_texts_w_stemmer(list_of_texts, min_count, max_freq, stemmer):
    texts_stemmed = [" ".join([stemmer.stem(word) for word in text.split()]) for text in list_of_texts]
    ldainput = [text.split() for text in texts_stemmed]           # convert all strings to list of words
    id2word = corpora.Dictionary(ldainput)
    id2word.filter_extremes(no_below=min_count, no_above=max_freq) 
    ldacorpus = [id2word.doc2bow(doc) for doc in ldainput] 
    return id2word, ldacorpus# represent each speech by (token_id, token_count) tuples

def build_dictionary_and_vectorize_texts_w_ngrams_features(list_of_texts, language, n_grams,  min_count, max_freq):
    # get stopwords for the given language
    enstopwords = list(set(stopwords.words('english')))
    mystopwords = list(set(stopwords.words(language)))
    allstopwords = enstopwords + enstopwords
    list_of_texts = [" ".join([w for w in text.split() if w not in allstopwords]) for text in list_of_texts]
    
    # get text with n_gram features
    text_n_grams = [["_".join(tup) for tup in nltk.ngrams(text.split(),n_grams)] for text in list_of_texts]
    
    # get combined text
    text_combined = []
    for a,b in zip([text.split() for text in list_of_texts],text_n_grams):
        text_combined.append(a + b)
    
    id2word = corpora.Dictionary(text_combined)                       
    id2word.filter_extremes(no_below=min_count, no_above=max_freq)
    ldacorpus = [id2word.doc2bow(doc) for doc in text_combined] 
    return id2word, ldacorpus # represent each speech by (token_id, token_count) tuples



def build_dictionary_and_vectorize_texts_w_ngrams_features_only_noun_adj(list_of_texts, language, n_grams,  min_count, max_freq):
    # get stopwords for the given language
    enstopwords = list(set(stopwords.words('english')))
    mystopwords = list(set(stopwords.words(language)))
    allstopwords = enstopwords + enstopwords
    list_of_texts = [" ".join([w for w in text.split() if w not in allstopwords]) for text in list_of_texts]
    
    # get text with n_gram features
    text_n_grams = [["_".join(tup) for tup in nltk.ngrams(text.split(),n_grams)] for text in list_of_texts]
    
    # get combined text NOT HELPING!!!
    text_combined = []
    for a,b in zip([text.split() for text in list_of_texts],text_n_grams):
        text_combined.append(a + b)
        
    # consider only nouns and adjectives NOT WORKING! DO IT BEFORE DOING ANYTHING ELSE!
    text_combined = [" ".join(text) for text in text_combined]
    texts_nounsadj=[]
    for text in text_combined:
        tokens = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokens)
        cleantext = ""
        for element in tagged:
            if element[1] in ('NN','JJ'):
                cleantext=cleantext+element[0]+" "
        texts_nounsadj.append(cleantext)
    
    # INCREASE MIN_COUNT MAX_FREQ BOUNDARIES
    texts_nounsadj = [text.split() for text in texts_nounsadj]
    id2word = corpora.Dictionary(texts_nounsadj)                       
    id2word.filter_extremes(no_below=min_count, no_above=max_freq)
    ldacorpus = [id2word.doc2bow(doc) for doc in texts_nounsadj] 
    return id2word, ldacorpus # represent each speech by (token_id, token_count) tuples

In [114]:
news_clean['text'] = news_clean['text'].apply(lambda x: clean_texts(x))
news_clean['title'] = news_clean['title'].apply(lambda x: clean_texts(x))
news_clean['meta_description'] = news_clean['meta_description'].apply(lambda x: clean_texts(x))

In [115]:
stemmer = nltk.stem.snowball.GermanStemmer()

id2word_m1, ldacorpus_m1 = build_dictionary_and_vectorize_texts(news_clean['text'].tolist())
id2word_m2, ldacorpus_m2 = build_dictionary_and_vectorize_texts_wo_stopwords(news_clean['text'].tolist(), 'german')
id2word_m4, ldacorpus_m4 = build_dictionary_and_vectorize_texts_wo_random_wo_common_words(news_clean['text'].tolist(), 
                                                                                          5, 0.5)
id2word_m5, ldacorpus_m5 = build_dictionary_and_vectorize_texts_w_stemmer(news_clean['text'].tolist(), 5, 0.5, stemmer)
id2word_m6, ldacorpus_m6 = build_dictionary_and_vectorize_texts_w_ngrams_features(news_clean['text'].tolist(), 
                                                                                  'german', 2, 5, 0.5)

In [130]:
id2word_m7, ldacorpus_m7 = build_dictionary_and_vectorize_texts_w_ngrams_features_only_noun_adj(
                                                                                news_clean['text'].tolist(), 
                                                                                  'german', 2, 5, 0.5)

### 2. Train LDA topic model

#### 2.1. LDA topic modeling + removing stopwords

In [117]:
lda_m2 = models.LdaModel(ldacorpus_m2, id2word=id2word_m2, num_topics=n_topics)

In [118]:
lda_m2.print_topics()

[(0,
  '0.006*"dass" + 0.003*"sei" + 0.003*"sagte" + 0.003*"mehr" + 0.003*"wurde" + 0.002*"immer" + 0.002*"prozent" + 0.002*"menschen" + 0.002*"deutschland" + 0.002*"schon"'),
 (1,
  '0.006*"dass" + 0.003*"wurde" + 0.003*"sagte" + 0.003*"deutschland" + 0.003*"sei" + 0.002*"mehr" + 0.002*"schon" + 0.002*"menschen" + 0.001*"usa" + 0.001*"seit"'),
 (2,
  '0.007*"dass" + 0.003*"mehr" + 0.003*"wurde" + 0.003*"menschen" + 0.002*"sei" + 0.002*"euro" + 0.002*"schon" + 0.002*"zwei" + 0.002*"jahren" + 0.002*"sagte"'),
 (3,
  '0.004*"dass" + 0.004*"sagte" + 0.003*"sei" + 0.003*"mehr" + 0.002*"wurde" + 0.002*"schon" + 0.002*"seit" + 0.002*"jahren" + 0.002*"menschen" + 0.002*"zwei"'),
 (4,
  '0.007*"dass" + 0.004*"mehr" + 0.004*"sagte" + 0.003*"wurde" + 0.003*"sei" + 0.003*"deutschland" + 0.002*"immer" + 0.002*"zwei" + 0.002*"sagt" + 0.002*"menschen"'),
 (5,
  '0.006*"dass" + 0.004*"mehr" + 0.003*"wurde" + 0.002*"schon" + 0.002*"beim" + 0.002*"sagte" + 0.002*"sei" + 0.002*"seit" + 0.002*"menschen" 

#### 2.2. Train TF-IDF model

In [119]:
# normalize scores by most common words instead of just simply removing standard stopwords
ldacorpus_m3 = ldacorpus_m1
id2word_m3 = id2word_m1
tfidfcorpus_m3 = models.TfidfModel(ldacorpus_m3)
lda_m3 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m3[ldacorpus_m3],id2word=id2word_m3,num_topics=n_topics)

In [120]:
lda_m3.print_topics()

[(0,
  '0.000*"trump" + 0.000*"er" + 0.000*"sie" + 0.000*"oder" + 0.000*"polizei" + 0.000*"sagte" + 0.000*"zschäpe" + 0.000*"euro" + 0.000*"afghanistan" + 0.000*"diesen"'),
 (1,
  '0.000*"er" + 0.000*"prozent" + 0.000*"millionen" + 0.000*"deutschland" + 0.000*"fc" + 0.000*"air" + 0.000*"sind" + 0.000*"so" + 0.000*"sie" + 0.000*"gegen"'),
 (2,
  '0.000*"ich" + 0.000*"millionen" + 0.000*"iphone" + 0.000*"er" + 0.000*"euro" + 0.000*"menschen" + 0.000*"sie" + 0.000*"dollar" + 0.000*"milliarden" + 0.000*"lewis"'),
 (3,
  '0.000*"türkei" + 0.000*"ich" + 0.000*"sonnenfinsternis" + 0.000*"er" + 0.000*"rooney" + 0.000*"iphone" + 0.000*"euro" + 0.000*"sie" + 0.000*"habe" + 0.000*"usa"'),
 (4,
  '0.000*"prozent" + 0.000*"trump" + 0.000*"sie" + 0.000*"er" + 0.000*"afd" + 0.000*"serebrennikov" + 0.000*"spd" + 0.000*"wie" + 0.000*"online" + 0.000*"sei"'),
 (5,
  '0.000*"schulz" + 0.000*"polizei" + 0.000*"er" + 0.000*"merkel" + 0.000*"euro" + 0.000*"sagte" + 0.000*"wir" + 0.000*"dass" + 0.000*"deutsc

#### 2.3. TF-IDF model + remove random & common words

In [121]:
# remove words that are not more than 5 times or more than 50% in all documents
tfidfcorpus_m4 = models.TfidfModel(ldacorpus_m4)
lda_m4 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m4[ldacorpus_m4],id2word=id2word_m4,num_topics=n_topics)

In [122]:
lda_m4.print_topics()

[(0,
  '0.001*"frauen" + 0.001*"euro" + 0.001*"ich" + 0.001*"berlin" + 0.001*"wir" + 0.001*"oder" + 0.001*"man" + 0.001*"sagte" + 0.001*"the" + 0.001*"polizei"'),
 (1,
  '0.002*"trump" + 0.001*"wir" + 0.001*"gegen" + 0.001*"fc" + 0.001*"bis" + 0.001*"sagte" + 0.001*"regierung" + 0.001*"bayern" + 0.001*"millionen" + 0.001*"deutschland"'),
 (2,
  '0.001*"gegen" + 0.001*"deutschland" + 0.001*"millionen" + 0.001*"euro" + 0.001*"prozent" + 0.001*"zwei" + 0.001*"türkei" + 0.001*"polizei" + 0.001*"mehr" + 0.001*"barcelona"'),
 (3,
  '0.001*"prozent" + 0.001*"film" + 0.001*"sei" + 0.001*"frauen" + 0.001*"ich" + 0.001*"frau" + 0.001*"menschen" + 0.001*"oder" + 0.001*"wir" + 0.001*"türkei"'),
 (4,
  '0.002*"ich" + 0.001*"man" + 0.001*"kinder" + 0.001*"prozent" + 0.001*"oder" + 0.001*"menschen" + 0.001*"euro" + 0.001*"seine" + 0.001*"sagte" + 0.001*"wenn"'),
 (5,
  '0.001*"prozent" + 0.001*"euro" + 0.001*"the" + 0.001*"spd" + 0.001*"wir" + 0.001*"trump" + 0.001*"habe" + 0.001*"polizei" + 0.001*"i

#### 2.4. TF-IDF model + remove random & common words + similar lexical root

In [123]:
tfidfcorpus_m5 = models.TfidfModel(ldacorpus_m5)
lda_m5 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m5[ldacorpus_m5],id2word=id2word_m5,num_topics=n_topics)

In [124]:
lda_m5.print_topics()

[(0,
  '0.002*"prozent" + 0.002*"iphon" + 0.001*"ich" + 0.001*"deutsch" + 0.001*"air" + 0.001*"soll" + 0.001*"eu" + 0.001*"dann" + 0.001*"geg" + 0.001*"deutschland"'),
 (1,
  '0.002*"frau" + 0.001*"ich" + 0.001*"indi" + 0.001*"podcast" + 0.001*"konn" + 0.001*"turkei" + 0.001*"oder" + 0.001*"spiel" + 0.001*"mann" + 0.001*"bis"'),
 (2,
  '0.002*"ich" + 0.002*"wir" + 0.002*"trump" + 0.001*"berlin" + 0.001*"million" + 0.001*"spanisch" + 0.001*"barcelona" + 0.001*"nordkorea" + 0.001*"mich" + 0.001*"fc"'),
 (3,
  '0.002*"kind" + 0.001*"muslim" + 0.001*"mensch" + 0.001*"prozent" + 0.001*"geg" + 0.001*"land" + 0.001*"afd" + 0.001*"man" + 0.001*"deutschland" + 0.001*"wenn"'),
 (4,
  '0.001*"mensch" + 0.001*"euro" + 0.001*"prozent" + 0.001*"zschap" + 0.001*"deutsch" + 0.001*"googl" + 0.001*"schulz" + 0.001*"usa" + 0.001*"soll" + 0.001*"mann"'),
 (5,
  '0.002*"turkei" + 0.002*"million" + 0.001*"euro" + 0.001*"deutsch" + 0.001*"fc" + 0.001*"deutschland" + 0.001*"zahl" + 0.001*"geg" + 0.001*"turkis

#### 2.5. TF-IDF model + remove stopwords + ignore common and random words + ngrams as features

In [125]:
tfidfcorpus_m6 = models.TfidfModel(ldacorpus_m6)
lda_m6 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m6[ldacorpus_m6],id2word=id2word_m6,num_topics=n_topics)

In [126]:
lda_m6.print_topics()

[(0,
  '0.001*"usa" + 0.001*"singapur" + 0.001*"trump" + 0.001*"sagte" + 0.001*"sei" + 0.001*"fc" + 0.001*"worden" + 0.001*"habe" + 0.000*"millionen" + 0.000*"selbst"'),
 (1,
  '0.001*"türkei" + 0.001*"der_türkei" + 0.001*"prozent" + 0.001*"soll" + 0.001*"wurden" + 0.001*"wegen" + 0.001*"menschen" + 0.001*"mit_dem" + 0.001*"deutschland" + 0.001*"gruppe"'),
 (2,
  '0.001*"euro" + 0.001*"ich" + 0.001*"millionen" + 0.001*"millionen_euro" + 0.001*"menschen" + 0.001*"wir" + 0.001*"prozent" + 0.001*"milliarden_euro" + 0.001*"milliarden" + 0.001*"gegen"'),
 (3,
  '0.001*"ich" + 0.001*"wir" + 0.001*"habe" + 0.001*"millionen" + 0.001*"polizei" + 0.001*"k" + 0.001*"sagte" + 0.001*"uns" + 0.001*"fc" + 0.001*"laut"'),
 (4,
  '0.001*"ich" + 0.001*"deutschland" + 0.001*"oder" + 0.001*"macron" + 0.001*"wir" + 0.001*"gegen" + 0.001*"nur" + 0.001*"zum" + 0.001*"euro" + 0.000*"männer"'),
 (5,
  '0.001*"prozent" + 0.001*"prozent_der" + 0.001*"studie" + 0.001*"open" + 0.001*"oder" + 0.001*"frauen" + 0.001

#### 2.6. TF-IDF model + remove stopwords + ignore common and random words + ngrams as features + only noums and adj

In [131]:
tfidfcorpus_m7 = models.TfidfModel(ldacorpus_m7)
lda_m7 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m7[ldacorpus_m7],id2word=id2word_m7,num_topics=n_topics)

In [132]:
lda_m7.print_topics()

[(0,
  '0.001*"polizei" + 0.001*"trump" + 0.001*"online" + 0.001*"man" + 0.001*"ich" + 0.001*"gegen" + 0.001*"b" + 0.001*"für_die" + 0.001*"sagte" + 0.001*"ihrer"'),
 (1,
  '0.001*"polizei" + 0.001*"ich" + 0.001*"spahn" + 0.001*"deutschland" + 0.001*"man" + 0.001*"oder" + 0.001*"menschen" + 0.001*"ja" + 0.001*"sind" + 0.001*"zum"'),
 (2,
  '0.001*"seine" + 0.001*"ich" + 0.001*"open" + 0.001*"habe" + 0.001*"sei" + 0.001*"trump" + 0.001*"sein" + 0.001*"nur" + 0.001*"sind" + 0.001*"dann"'),
 (3,
  '0.001*"prozent" + 0.001*"türkei" + 0.001*"ich" + 0.001*"euro" + 0.001*"milliarden_euro" + 0.001*"peta" + 0.001*"soll" + 0.001*"milliarden" + 0.001*"wir" + 0.001*"deutschland"'),
 (4,
  '0.001*"euro" + 0.001*"sagt" + 0.001*"ich" + 0.001*"iphone" + 0.001*"man" + 0.001*"menschen" + 0.001*"habe" + 0.001*"gegen" + 0.001*"oder" + 0.001*"dort"'),
 (5,
  '0.001*"fc" + 0.001*"schulz" + 0.001*"bayern" + 0.001*"millionen" + 0.001*"ich" + 0.001*"wir" + 0.001*"dollar" + 0.001*"sagt" + 0.001*"millionen_dolla