In [58]:
import re
import pandas as pd
from bs4 import BeautifulSoup
import requests
import GetOldTweets3 as got 
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def get_text_from_url(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Parameters
    ----------
    url : str or None
        Address to fetch. Falsy values (``None``, ``""``) return an empty
        string immediately, without any network access.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10), so a
        single unresponsive host cannot block the caller indefinitely.

    Returns
    -------
    str
        Paragraph texts joined by single spaces, with non-breaking spaces
        (``\\xa0``) replaced by ordinary spaces. ``""`` is returned when the
        page could not be fetched or parsed.
    """
    if not url:
        return ""
    try:
        html = requests.get(url, timeout=timeout).text
        soup = BeautifulSoup(html, 'html.parser')
        return " ".join(p.get_text().replace(u'\xa0', u' ') for p in soup.find_all('p'))
    except Exception:
        # Scraping is best-effort: any download/parsing failure yields "".
        # `Exception` (rather than a bare `except:`) keeps KeyboardInterrupt
        # and SystemExit working so a long scrape can still be aborted.
        return ""

def get_urls(text):
    """Return every http/https URL found in ``text``, in order of appearance.

    The pattern accepts a permissive set of characters after ``http://`` or
    ``https://`` (letters, digits, most ASCII punctuation, ``%XX`` escapes)
    and stops at the first character outside that set, e.g. whitespace.
    Punctuation such as ``,`` or ``.`` directly attached to a URL is therefore
    part of the match.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    list of str
        The matched URLs; empty when there are none.
    """
    # Raw string: `\(` and `\)` are regex escapes, not Python string escapes.
    return re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)

def get_tweets(keywords, start="2020-04-01", stop="2020-04-16", maxTweets=0):
    """Search tweets via GetOldTweets3 and attach the text of linked pages.

    The query is restricted to language ``'de'`` and to a ``'1000km'`` radius
    around ``'Berlin, Germany'``.

    Parameters
    ----------
    keywords : str
        Search query handed to ``setQuerySearch``.
    start, stop : str
        Values handed to ``setSince`` / ``setUntil`` (the defaults use the
        ``YYYY-MM-DD`` form).
    maxTweets : int
        Value handed to ``setMaxTweets``.
        NOTE(review): 0 presumably means "no limit" in GetOldTweets3 — confirm.

    Returns
    -------
    list of dict
        One dict per tweet: the tweet object's own attribute dict (not a
        copy) extended with a ``'url_text'`` entry produced by
        ``get_text_from_url(tweet.urls)``.
    """
    criteria = (
        got.manager.TweetCriteria()
        .setQuerySearch(keywords)
        .setSince(start)
        .setUntil(stop)
        .setMaxTweets(maxTweets)
        .setLang('de')
        .setNear('Berlin, Germany')
        .setWithin('1000km')
    )

    def as_record(tweet):
        # vars() hands back the tweet's own __dict__, so the extra key is
        # stored on the tweet object as well.
        record = vars(tweet)
        record['url_text'] = get_text_from_url(tweet.urls)
        return record

    return [as_record(tweet) for tweet in got.manager.TweetManager.getTweets(criteria)]

def get_top_words(model, feature_names, n_top_words):
    """Describe each topic of ``model`` by its highest-weighted words.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of per-topic weight
        arrays that support ``argsort()``.
    feature_names : sequence of str
        Word for each weight position.
    n_top_words : int
        Number of words to keep per topic.

    Returns
    -------
    dict
        Maps the topic index to a space-separated string of its words,
        heaviest first.
    """
    summaries = {}
    for idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it before taking the leading n.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        summaries[idx] = " ".join(feature_names[pos] for pos in strongest_first)
    return summaries

def get_topics(texts, n_components=20, n_top_words = 20, n_features=int(1e12)):
    """Fit an LDA topic model on ``texts`` and assign each text a topic.

    The texts are turned into word counts (German stop words removed, terms
    kept only if they occur in at least 5 documents and in at most half of
    them), an online LDA model with a fixed random state is fitted, and the
    topics are ordered from most to least prevalent.

    Parameters
    ----------
    texts : iterable of str
        Documents to model.
    n_components : int
        Number of LDA topics.
    n_top_words : int
        Number of words used to describe each topic.
    n_features : int
        ``max_features`` passed to the ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(assignments, topics)``. ``topics`` is a list of word-summary
        strings, one per topic, most prevalent first. ``assignments`` holds,
        for each document, the position in ``topics`` of its strongest topic.

    Notes
    -----
    NOTE(review): with ``min_df=5`` a very small corpus may leave no
    vocabulary at all — confirm how the vectorizer reacts in that case.
    """
    vect = CountVectorizer(max_df=0.5,
                           min_df=5,
                           max_features=n_features,
                           stop_words=get_stop_words('de'))
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=200,
                                    learning_method='online',
                                    random_state=0)
    tf = vect.fit_transform(texts)
    lda.fit(tf)
    topic_assignments = lda.transform(tf)

    # `get_feature_names` was removed from recent scikit-learn releases in
    # favour of `get_feature_names_out`; fall back for old installations.
    if hasattr(vect, 'get_feature_names_out'):
        tf_feature_names = vect.get_feature_names_out()
    else:
        tf_feature_names = vect.get_feature_names()
    topics_dict = get_top_words(lda, tf_feature_names, n_top_words)

    # Sort topics by occurrence, most prevalent first. The slice must be
    # `[::-1]` (reverse); `[:-1]` would instead drop the most prevalent topic.
    topic_order = topic_assignments.sum(axis=0).argsort()[::-1]
    topic_assignments = topic_assignments[:, topic_order]
    topics = [topics_dict[topic_idx] for topic_idx in topic_order]
    return topic_assignments.argmax(axis=1), topics

In [59]:
from ipywidgets import widgets
import warnings
# NOTE(review): called without a category or module filter, this hides every
# warning raised from here on, including deprecation notices from the
# libraries used in this notebook. Consider narrowing it to the specific
# warnings that clutter the widget output.
warnings.filterwarnings("ignore")

@widgets.interact_manual(num_topics=(2, 20),
                         text_source=['tweets','urls in tweets'],
                         max_tweets=(0, 10000)
                        )
def plot_tweets(keywords="covid deutschland",
                start=widgets.DatePicker(value=pd.to_datetime('2020-01-01')),
                stop=widgets.DatePicker(value=pd.to_datetime('2020-04-16')),
                num_topics=10,
                text_source='tweets',
                max_tweets=0
               ):
    """Fetch tweets, model their topics and plot daily tweet counts per topic.

    Registered with ``widgets.interact_manual``, so the parameters are
    rendered as controls and the body runs when the user triggers it.

    Parameters
    ----------
    keywords : str
        Search query for the tweets.
    start, stop : date-like
        Values of the two date pickers; formatted as ``YYYY-MM-DD`` for the
        search. Nothing is fetched if either picker is empty.
    num_topics : int
        Number of LDA topics.
    text_source : {'tweets', 'urls in tweets'}
        Model the tweet text itself or the text scraped from linked pages.
    max_tweets : int
        Upper bound forwarded to ``get_tweets``.
        NOTE(review): 0 is forwarded unchanged; presumably GetOldTweets3
        treats it as "no limit" — confirm before fetching large date ranges.

    Returns
    -------
    None
        Output is the plot, the displayed DataFrame and the printed topics.
    """
    # DataFrame column holding the text for each selectable source.
    source_columns = {'tweets': 'text', 'urls in tweets': 'url_text'}

    # A cleared DatePicker yields None, which has no strftime().
    if start is None or stop is None:
        print("Please select both a start and a stop date")
        return

    print(f"Fetching tweets for keywords {keywords}")
    # Honour the max_tweets control instead of a hard-coded limit.
    tweets = get_tweets(keywords, start.strftime('%Y-%m-%d'), stop.strftime('%Y-%m-%d'), maxTweets=max_tweets)
    if not tweets:
        print("No tweets found")
        return

    print(f'Found {len(tweets)} tweets')
    df = pd.DataFrame(tweets).set_index('date')
    df['topics'], text_topics = get_topics(df[source_columns[text_source]],
                                           n_components=num_topics,
                                           n_top_words=5)

    # Give every topic its own column (zeros when no tweet was assigned to
    # it) so that line i in the plot really is topic i in the legend and in
    # the printed list below.
    topic_ids = range(len(text_topics))
    daily_counts = (pd.get_dummies(df['topics'])
                      .reindex(columns=topic_ids, fill_value=0)
                      .resample('D')
                      .sum())
    daily_counts.plot(marker='o')
    plt.legend(topic_ids)
    display(df)
    print("\n".join([f'Topic {idx}: {text}' for idx, text in enumerate(text_topics)]))

interactive(children=(Text(value='covid deutschland', description='keywords'), DatePicker(value=Timestamp('202…