In [33]:
import re, string
import pandas as pd
from bs4 import BeautifulSoup
import requests
import GetOldTweets3 as got 
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
from ipywidgets import widgets
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# NOTE: a `global` statement at module level has no effect in Python -- it
# neither creates nor initialises these names. They only come into existence
# once the functions below assign them (DF and KEYWORDS in get_tweets;
# ANALYSIS, TEXT_SOURCE and TOPICS in plot_tweets).
# NOTE(review): WORDS is not assigned anywhere in the visible code; plot_tweets
# stores its word list under TOP_WORDS instead -- confirm WORDS is still needed.
global KEYWORDS
global DF
global ANALYSIS
global TEXT_SOURCE
global WORDS
global TOPICS

In [34]:
def get_text_from_url(url, timeout=10):
    """Download a web page and return the text of all its ``<p>`` elements.

    Parameters
    ----------
    url : str or None
        Address of the page to fetch. A falsy value (``None``, ``""``) means
        "no link" and yields an empty string without any network access.
    timeout : float, optional
        Seconds passed to ``requests.get`` as ``timeout`` so that a server
        that never answers cannot block the notebook forever.

    Returns
    -------
    str
        The paragraph texts joined by single spaces, with non-breaking spaces
        (``\\xa0``) replaced by regular spaces. ``""`` if ``url`` is falsy or
        if fetching/parsing failed.
    """
    if not url:
        return ""
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = [p.get_text().replace(u'\xa0', u' ') for p in soup.find_all('p')]
    except Exception:
        # Deliberately best effort: a broken link must not abort the whole
        # download. `Exception` (instead of a bare `except`) still lets
        # KeyboardInterrupt / SystemExit through, so the user can stop the cell.
        return ""
    return " ".join(paragraphs)

def get_top_words(model, feature_names, n_top_words):
    """Describe every topic of a fitted model by its highest-weighted words.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each entry of ``components_`` is treated as one topic's weight vector.
    feature_names : sequence of str
        Word belonging to each position of a weight vector.
    n_top_words : int
        How many words to keep per topic.

    Returns
    -------
    dict
        Maps the topic's position in ``components_`` to a space-separated
        string of its words, ordered from highest to lowest weight.
    """
    def describe(weights):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        return " ".join([feature_names[position] for position in ranked])

    return {number: describe(weights)
            for number, weights in enumerate(model.components_)}

def get_topics(texts, 
               n_components=20, 
               n_top_words = 20, 
               n_features=int(1e12),
               keywords=None,
               topic_model='NMF'
              ):
    """Fit a topic model on ``texts`` and assign every text to one topic.

    Parameters
    ----------
    texts : iterable of str
        Documents to analyse.
    n_components : int
        Number of topics to extract.
    n_top_words : int
        Number of words used to describe each topic.
    n_features : int
        Passed to ``CountVectorizer`` as ``max_features``.
    keywords : list of str, optional
        Extra words to ignore in addition to the German stop words.
    topic_model : {'NMF', 'LDA'}
        Which model to fit.

    Returns
    -------
    (assignments, topics)
        ``topics`` is a list of word strings, ordered from the topic with the
        highest total loading to the lowest. ``assignments`` holds, per text,
        the index into ``topics`` of its strongest topic.

    Raises
    ------
    ValueError
        If ``topic_model`` is neither ``'LDA'`` nor ``'NMF'``.
    """
    if topic_model == 'LDA':
        model = LatentDirichletAllocation(n_components=n_components, max_iter=200,
                                learning_method='online',
                                random_state=0)
    elif topic_model == 'NMF':
        model = NMF(n_components=n_components, random_state=0)
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError(f"Unknown topic_model {topic_model!r}; expected 'LDA' or 'NMF'.")

    # CountVectorizer lower-cases tokens by default, so stop words only match
    # in lower case; empty fragments are dropped.
    extra_stop_words = [str(kw).lower() for kw in (keywords or []) if kw]
    vect = CountVectorizer(max_df=0.8, 
                           min_df=1,
                           max_features=n_features, 
                           stop_words=get_stop_words('de') + extra_stop_words)
    tf = vect.fit_transform(texts)
    model.fit(tf)
    topic_assignments = model.transform(tf)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(vect, 'get_feature_names_out'):
        tf_feature_names = vect.get_feature_names_out().tolist()
    else:
        tf_feature_names = vect.get_feature_names()
    topics_dict = get_top_words(model, tf_feature_names, n_top_words)

    # Sort topics by occurrence, strongest first. The former slice `[:-1:]`
    # did not reverse the order but silently dropped the strongest topic.
    topic_order = topic_assignments.sum(axis=0).argsort()[::-1]
    topic_assignments = topic_assignments[:, topic_order]
    topics = [topics_dict[topic_idx] for topic_idx in topic_order]
    return topic_assignments.argmax(axis=1), topics

def get_top_word_counts(texts, n_top_words, keywords):
    """Count the most frequent words in ``texts``.

    Parameters
    ----------
    texts : iterable of str
        Documents to count words in.
    n_top_words : int
        Passed to ``CountVectorizer`` as ``max_features``, i.e. the size of
        the vocabulary that is kept.
    keywords : list of str
        Extra words to ignore in addition to the German stop words.

    Returns
    -------
    (wordcounts, words)
        ``wordcounts`` is the document-term matrix returned by
        ``fit_transform``; ``words`` is the list of words naming its columns.
    """
    # CountVectorizer lower-cases tokens by default, so stop words only match
    # in lower case; empty fragments are dropped.
    extra_stop_words = [str(kw).lower() for kw in keywords if kw]
    vect = CountVectorizer(max_df=1., 
                           min_df=1,
                           max_features=n_top_words, 
                           stop_words=get_stop_words('de') + extra_stop_words)
    wordcounts = vect.fit_transform(texts)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations. `.tolist()` keeps the
    # return type a plain list as before.
    if hasattr(vect, 'get_feature_names_out'):
        words = vect.get_feature_names_out().tolist()
    else:
        words = vect.get_feature_names()
    return wordcounts, words

# Twitter Analyse mit Topic Models (oder ohne)

## Tweets Runterladen

Lade Tweets aus dem deutschsprachigen Raum für den angegebenen Zeitraum, die die angegebenen Keywords enthalten.

Wenn ``maxTweets`` auf 0 gesetzt ist, wird versucht, alle Tweets zu holen. Das kann dauern, und manchmal wird man danach gedrosselt.

Klicke zum Ausführen auf den Button ``Run Interact``.

In [53]:
@widgets.interact_manual(maxTweets=(0, 100))
def get_tweets(keywords='covid klopapier', 
               start=widgets.DatePicker(value=pd.to_datetime('2020-04-10')), 
               stop=widgets.DatePicker(value=pd.to_datetime('2020-04-16')), 
               maxTweets=10):
    """Fetch tweets matching ``keywords`` and store them in the global ``DF``.

    The search is restricted with ``setLang('de')``, ``setNear('Berlin,
    Germany')`` and ``setWithin('1000km')``. For every tweet the text of the
    linked page is downloaded into an additional ``url_text`` entry.

    Parameters
    ----------
    keywords : str
        Search query; also split into single words and stored in the global
        ``KEYWORDS``.
    start, stop : date-like
        Search period; formatted with ``strftime('%Y-%m-%d')``.
    maxTweets : int
        Upper limit handed to ``setMaxTweets``. According to the notebook
        text, 0 means "try to fetch all".

    Side effects
    ------------
    Sets the globals ``DF`` (tweets indexed by ``date``, or an empty frame if
    nothing was found) and ``KEYWORDS``.
    """
    global DF
    global KEYWORDS

    print(f'Fetching tweets for keywords: {keywords}')
    tweetCriteria = (got.manager.TweetCriteria()
                     .setQuerySearch(keywords)
                     .setSince(start.strftime('%Y-%m-%d'))
                     .setUntil(stop.strftime('%Y-%m-%d'))
                     .setMaxTweets(maxTweets)
                     .setLang('de')
                     .setNear('Berlin, Germany')
                     .setWithin('1000km'))
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)

    tweet_dicts = []
    for tweet in tweets:
        # Copy the attributes instead of writing into the tweet object itself.
        tweet_dict = dict(vars(tweet))
        # NOTE(review): tweet.urls is treated as one single URL -- confirm it
        # cannot hold several links at once.
        tweet_dict['url_text'] = get_text_from_url(tweet.urls)
        tweet_dicts.append(tweet_dict)

    print(f'Found {len(tweets)} tweets for keywords: {keywords}')
    if tweet_dicts:
        DF = pd.DataFrame(tweet_dicts).set_index('date')
    else:
        # Without rows there is no 'date' column and set_index would raise.
        DF = pd.DataFrame()

    # Escape the punctuation so that characters such as ']' or '\' are taken
    # literally inside the character class, and drop the empty fragments that
    # leading/trailing separators (e.g. '#covid') produce.
    separators = "[" + re.escape(string.punctuation) + " \t\n]+"
    KEYWORDS = [word for word in re.split(separators, keywords) if word]

interactive(children=(Text(value='covid klopapier', description='keywords'), DatePicker(value=Timestamp('2020-…

## Tweets Analysieren

- ``text_source``: Wähle, ob Text aus tweets oder den in tweets verlinkten Artikeln analysiert werden soll. 
- ``analysis``: Wähle, ob die Häufigkeit einzelner Worte oder Topics analysiert werden.
- ``n_top_words``: wie viele Worte berücksichtigt werden oder, bei Topic Analysis, wie viele Wörter angezeigt werden
- ``topic_model``: Art des Topic Models (Latent Dirichlet Allocation oder Nonnegative Matrix Factorization) - wird ignoriert, wenn ``analysis`` auf ``wordcounts`` gesetzt ist
- ``num_topics``: Wie viele Topics extrahiert werden sollen.

Klicke zum Ausführen auf den Button ``Run Interact``.

In [50]:
@widgets.interact_manual(num_topics=(2, 20),
                         text_source=['tweets','urls in tweets'],
                         analysis=['topic model', 'wordcounts'],
                         n_top_words = (5, 20),
                         topic_model=['LDA','NMF']
                        )
def plot_tweets(
         text_source='tweets',
         analysis='topic model',
         n_top_words=5,
         topic_model='NMF',
         num_topics=10    
        ):
    """Plot daily topic or word frequencies of the tweets stored in ``DF``.

    Parameters
    ----------
    text_source : {'tweets', 'urls in tweets'}
        Analyse the tweet text (column ``text``) or the text of the linked
        pages (column ``url_text``).
    analysis : {'topic model', 'wordcounts'}
        Kind of analysis to run.
    n_top_words : int
        Number of words counted ('wordcounts') or shown per topic
        ('topic model').
    topic_model : {'NMF', 'LDA'}
        Model used for 'topic model'; ignored for 'wordcounts'.
    num_topics : int
        Number of topics to extract.

    Side effects
    ------------
    Sets the globals ``ANALYSIS`` and ``TEXT_SOURCE``; additionally ``TOPICS``
    and the column ``DF['topic']`` ('topic model') or ``TOP_WORDS``
    ('wordcounts').

    Raises
    ------
    ValueError
        If ``text_source`` is not one of the two supported values.
    """
    global TOPICS
    global ANALYSIS
    global TEXT_SOURCE
    global TOP_WORDS
    ANALYSIS = analysis
    TEXT_SOURCE = text_source

    # DF only exists after get_tweets has run; look it up defensively so that
    # a fresh kernel gets the hint below instead of a NameError.
    tweets_df = globals().get('DF')
    if tweets_df is None or len(tweets_df) == 0:
        print("Get some tweets first.")
        return
    excluded = globals().get('KEYWORDS', [])
    print(f"Analysing {len(tweets_df)} {text_source} with {analysis}, exluding keywords: {excluded}")

    text_columns = {'tweets': 'text', 'urls in tweets': 'url_text'}
    if text_source not in text_columns:
        raise ValueError(f"Unknown text_source {text_source!r}; expected one of {list(text_columns)}.")
    text_col = text_columns[text_source]

    if analysis == 'topic model':
        # tweets_df is the same object as the global DF, so the new column is
        # visible to later cells.
        tweets_df['topic'], TOPICS = get_topics(tweets_df[text_col], 
                                                n_components=num_topics, 
                                                n_top_words=n_top_words,
                                                keywords=excluded,
                                                topic_model=topic_model
                                               )
        # get_dummies only creates columns for topics that actually occur.
        # Reindex to all topics so that line i always belongs to TOPICS[i] and
        # the legend labels below line up.
        topic_counts = (pd.get_dummies(tweets_df['topic'])
                          .astype(int)
                          .reindex(columns=range(len(TOPICS)), fill_value=0)
                          .resample('D').sum())
        topic_counts.plot(marker='o')
        plt.legend([f'Topic {idx}: {text[:10]} ...' for idx, text in enumerate(TOPICS)])
        print("\n".join([f'Topic {idx}: {text}' for idx, text in enumerate(TOPICS)]))
    elif analysis == 'wordcounts':
        word_counts, words = get_top_word_counts(tweets_df[text_col],  
                                                 n_top_words=n_top_words,
                                                 keywords=excluded)
        TOP_WORDS = words
        pd.DataFrame(word_counts.toarray(), 
                     index=tweets_df.index, 
                     columns=words).resample('D').sum().plot(marker='o')
            

interactive(children=(Dropdown(description='text_source', options=('tweets', 'urls in tweets'), value='tweets'…

## Tweets Anschauen

In [52]:
# `IPython.html.widgets` is only a deprecated shim for ipywidgets; import
# `interactive` from ipywidgets (already a dependency of this notebook) directly.
from ipywidgets import interactive

# Work on a copy so that browsing never touches the frame used for plotting.
df = DF.copy(deep=True)

# Column that holds the text which was analysed in plot_tweets.
if TEXT_SOURCE == 'tweets':
    text_col = 'text'
elif TEXT_SOURCE == 'urls in tweets':
    text_col = 'url_text'

# Entries offered in the selection list: either the counted words or one
# "<index>: <first 20 characters> ..." label per topic.
if ANALYSIS == 'wordcounts':
    items = ['All'] + sorted(TOP_WORDS)
elif ANALYSIS == 'topic model':
    items = ['All'] + [f'{idx}: {text[:20]} ...' for idx, text in enumerate(TOPICS)]
    
def make_clickable(val):
    """Wrap ``val`` in an HTML anchor that links to ``val`` itself.

    The anchor carries ``target="_blank"`` so the link opens in a new window.
    """
    anchor_template = '<a target="_blank" href="{}">{}</a>'
    link = label = val
    return anchor_template.format(link, label)
    
def view(contains=''):
    """Display the tweets that match the entry picked in the selection list.

    ``'All'`` shows every row of ``df``. Otherwise the module-level
    ``ANALYSIS`` decides how ``contains`` is interpreted:

    * ``'wordcounts'``: rows whose ``text_col`` matches ``contains``
      (compared in lower case via ``str.contains``).
    * ``'topic model'``: ``contains`` starts with ``"<topic index>:"`` and the
      rows with that value in the ``topic`` column are shown.

    Any other combination displays nothing. The ``permalink`` column is
    rendered as a clickable link.
    """
    tweet_cols = ['username', 'to', 'text', 'permalink']

    if contains == 'All':
        print(f'Showing all {len(df)} tweets')
        selection = df.loc[:, tweet_cols]
    elif ANALYSIS == 'wordcounts':
        matches = df[text_col].str.lower().str.contains(contains.lower())
        print(f'Found {matches.sum()} tweets')
        selection = df.loc[matches, tweet_cols]
    elif ANALYSIS == 'topic model':
        # The label looks like "<index>: <words> ..."; the part before the
        # first colon is the topic number.
        topic_number = int(contains[:contains.find(':')])
        matches = df['topic'] == topic_number
        print(f'Found {matches.sum()} tweets')
        selection = df.loc[matches, ['username', 'to', 'text', 'topic', 'permalink']]
    else:
        return

    display(selection.style.format({'permalink': make_clickable}))

# Selection list offering the entries collected in `items`.
w = widgets.Select(options=items)
# Bind the list to `view`'s `contains` argument. This is the last expression of
# the cell, so the widget it returns is what the notebook displays.
interactive(view, contains=w)

interactive(children=(Select(description='contains', options=('All', 'corona', 'coronavirus', 'coronavirusde',…