## Imports

In [9]:
from pymongo import MongoClient
from bson.json_util import loads, dumps
import json
import nltk
from nltk.corpus import stopwords
import collections

## Füllwörter

In [10]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A14AEC9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Datenbankverbindung herstellen
Es wurde ein Docker-Container für die MongoDB gestartet mit Port 27017. Die Daten werden mit Python-Skripts in die Datenbank geladen, indem zuerst ein Skript die Daten von der API anfragt und mittels Apache Kafka zwischenspeichert. Von dort aus werden die Daten nacheinander in die Datenbank eingefügt sodass letztlich in diesem Notebook darauf zugegriffen werden kann.

In [11]:
client = MongoClient('mongodb://localhost:27017')
db = client['twitter']
collection = db['tweets']

## Daten laden

Daten direkt aus Datenbank laden

In [12]:
data = json.loads(dumps(collection.find()))

Daten aus Test-Datei mit 5.000 Einträgen laden

In [4]:
with open("5000.json", "r", encoding="utf-8") as f:
    data = json.loads(f.read())

In [None]:
data

## Rausfiltern der Tweet-Inhalte
{
    ('word', 'year'): #count
}

In [5]:
text_strings = []
for d in data:
    if "text" in d.keys():
        text_strings += [d["text"]]

words_dict = {}
for d in data:
    if not "text" in d.keys() or not "user" in d.keys() or not "created_at" in d["user"].keys() or d["user"]["created_at"] == None:
        continue
    text = d["text"].lower()
    created = d["user"]["created_at"][-4:]
    
    for word in text.split(' '):
        pair = (word, created)
        if pair in words_dict.keys():
            words_dict[pair] += 1
        else:
            words_dict[pair] = 1
words_dict

{('rt', '2016'): 248,
 ('@jewishaction:', '2016'): 1,
 ('major', '2016'): 2,
 ('news:', '2016'): 2,
 ('joe', '2016'): 21,
 ('biden', '2016'): 79,
 ('is', '2016'): 92,
 ('projected', '2016'): 1,
 ('to', '2016'): 121,
 ('win', '2016'): 14,
 ('wisconsin.\n\nthis', '2016'): 1,
 ('a', '2016'): 89,
 ('flip', '2016'): 1,
 ('from', '2016'): 11,
 ('2016', '2016'): 7,
 ('—', '2016'): 7,
 ('blow', '2016'): 1,
 ('trumpism.\n\nwe', '2016'): 1,
 ('are', '2016'): 35,
 ('showin…', '2016'): 1,
 ('rt', '2019'): 281,
 ('@tirangabhaiya:', '2019'): 1,
 ('joe', '2019'): 25,
 ('biden', '2019'): 93,
 ('just', '2019'): 19,
 ('won', '2019'): 14,
 ('wisconsin.', '2019'): 4,
 ('\nleading', '2019'): 1,
 ('at', '2019'): 23,
 ('248.\n#uselection2020', '2019'): 1,
 ('#biden', '2019'): 5,
 ('https://t.co/3htra8ttys', '2019'): 1,
 ('@joebiden:', '2019'): 1,
 ('to', '2019'): 115,
 ('make', '2019'): 3,
 ('sure', '2019'): 2,
 ('every', '2019'): 6,
 ('vote', '2019'): 22,
 ('is', '2019'): 96,
 ('counted,', '2019'): 1,
 ('we

## Wort-Jahr-Anzahl

In [6]:
ignore_words = stop_words + ['rt', '', '-', '&amp;', 'like', 'get', 'one', 'via', 'new', 'i\'m', 'rt']
words = [(k,v) for k, v in sorted(words_dict.items(), key=lambda item: -item[1]) if k[0] not in ignore_words]
words[:10]

[(('trump', '2020'), 220),
 (('biden', '2020'), 157),
 (('trump', '2009'), 153),
 (('trump', '2019'), 124),
 (('trump', '2012'), 123),
 (('trump', '2018'), 115),
 (('trump', '2011'), 110),
 (('biden', '2012'), 109),
 (('trump', '2017'), 105),
 (('trump', '2013'), 104)]

## Top-10 Wörter pro Jahr der Account-Erstellung

In [7]:
result = {}

for pair in words_dict.keys():
    word, year = pair
    amount = (word, words_dict[pair])
    
    if year in result.keys():              
        result[year] = result[year] + [(word, words_dict[pair])]
            
    else:
        result[year] = [(word, words_dict[pair])]

for year in sorted(result.keys()):
    print(year)
    value = result[year]
    r = [(k,v) for k, v in sorted(value, key=lambda item: -item[1]) if k not in ignore_words]
    for x in r[:10]:
        print("    ", x[0],":", x[1])

2006
     @amandamull: : 1
     need : 1
     say : 1
     loud: : 1
     fulton : 1
     county, : 1
     atlanta : 1
     is, : 1
     approximately : 1
     biden/trump : 1
2007
     biden : 4
     trump : 3
     state : 2
     es : 2
     dnc : 1
     briefing : 1
     showing : 1
     much : 1
     270! : 1
     (first : 1
2008
     trump : 25
     biden : 16
     election : 11
     vote : 8
     votes : 7
     stop : 6
     joe : 6
     counting : 5
     people : 4
     going : 4
2009
     trump : 153
     biden : 101
     election : 79
     de : 42
     votes : 34
     joe : 32
     vote : 29
     ballots : 26
     campaign : 25
     big : 24
2010
     trump : 96
     biden : 70
     de : 33
     election : 31
     que : 28
     joe : 28
     votes : 25
     en : 19
     ballots : 18
     vote : 18
2011
     trump : 110
     biden : 72
     election : 60
     votes : 33
     vote : 31
     win : 25
     joe : 24
     de : 23
     it’s : 19
     count : 18
2012
     trump : 123
 

## Durchschnittliches Alter der Accounts

In [8]:
years = [int(year) for d in data if "user" in d.keys() for year in [d["user"]["created_at"][-4:]]]
sum(years) / len(years)

2014.606691146471