## Imports

In [1]:
from pymongo import MongoClient
from bson.json_util import loads, dumps
import json
import nltk
from nltk.corpus import stopwords
import collections

## Stopwords

In [2]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A14AEC9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Connecting to database

In [3]:
client = MongoClient('mongodb://localhost:27017')
db = client['twitter']
collection = db['tweets']

## Get data

In [4]:
data = json.loads(dumps(collection.find()))

In [5]:
len(data)

76

In [6]:
data[0]

{'_id': {'$oid': '5fa3c493502013e4dd04544e'},
 'created_at': 'Thu Nov 05 09:23:16 +0000 2020',
 'id': 1324281116065484803,
 'id_str': '1324281116065484803',
 'text': 'RT @deborabpaim: FRAUDE DESCARADA!!!',
 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 'truncated': False,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 1123298202,
  'id_str': '1123298202',
  'name': 'RKSS #USTRA #KOZEL #CRUZ #BOLSONARO #VOTETRUMP2020',
  'screen_name': 'rkss2',
  'location': None,
  'url': None,
  'description': 'Conservative/Conservador Christian/Cristao Every free people may have the right to bear firearms/Toda pessoa livre dever ter o direito de portar armas de fogo',
  'translator_type': 'none',
  'protected': False,
  'verified': False,
  'followers_count': 2261,
  'friends_count': 4488,
  'listed_count': 62,
  'f

In [7]:
data[3]["text"]

'@Alexis_Cossette Mais du coup Biden sera-t-il quand même élu ???'

## Rausfiltern der Tweet-Inhalte
{
    ('word', 'year'): #count
}

In [8]:
text_strings = []
for d in data:
    if "text" in d.keys():
        text_strings += [d["text"]]

words_dict = {}
for d in data:
    if not "text" in d.keys() or not "user" in d.keys() or not "created_at" in d["user"].keys() or d["user"]["created_at"] == None:
        continue
    text = d["text"].lower()
    created = d["user"]["created_at"][-4:]
    
    for word in text.split(' '):
        pair = (word, created)
        if pair in words_dict.keys():
            words_dict[pair] += 1
        else:
            words_dict[pair] = 1
words_dict

{('rt', '2013'): 6,
 ('@deborabpaim:', '2013'): 1,
 ('fraude', '2013'): 1,
 ('descarada!!!', '2013'): 1,
 ('rt', '2018'): 4,
 ('@realjameswoods:', '2018'): 1,
 ('you’ll', '2018'): 1,
 ('never', '2018'): 1,
 ('hear', '2018'): 1,
 ('about', '2018'): 1,
 ('it', '2018'): 1,
 ('again', '2018'): 1,
 ('if', '2018'): 1,
 ('biden', '2018'): 1,
 ('manages', '2018'): 1,
 ('to', '2018'): 2,
 ('steal', '2018'): 1,
 ('this.', '2018'): 1,
 ('congratulations,', '2018'): 1,
 ('the', '2018'): 4,
 ('pandemic', '2018'): 1,
 ('is', '2018'): 1,
 ('over!', '2018'): 1,
 ('https://t.co/5n…', '2018'): 1,
 ('rt', '2020'): 6,
 ('@simonehorst6:', '2020'): 1,
 ('trump', '2020'): 5,
 ('vai', '2020'): 1,
 ('levar', '2020'): 1,
 ('a', '2020'): 2,
 ('geórgia!\n🤞', '2020'): 1,
 ('@alexis_cossette', '2011'): 1,
 ('mais', '2011'): 1,
 ('du', '2011'): 1,
 ('coup', '2011'): 1,
 ('biden', '2011'): 1,
 ('sera-t-il', '2011'): 1,
 ('quand', '2011'): 1,
 ('même', '2011'): 1,
 ('élu', '2011'): 1,
 ('???', '2011'): 1,
 ('rt', '201

## Wort-Jahr-Anzahl

In [12]:
#ignore_words = stop_words + ['rt', '', '-', '&amp;', 'like', 'get', 'one', 'via', 'new', 'i\'m']
words = [(k,v) for k, v in sorted(words_dict.items(), key=lambda item: -item[1])]
words

[(('the', '2017'), 10),
 (('the', '2020'), 8),
 (('rt', '2017'), 7),
 (('rt', '2013'), 6),
 (('rt', '2020'), 6),
 (('rt', '2012'), 6),
 (('the', '2009'), 6),
 (('trump', '2020'), 5),
 (('trump', '2012'), 5),
 (('in', '2017'), 5),
 (('in', '2009'), 5),
 (('rt', '2011'), 5),
 (('rt', '2018'), 4),
 (('the', '2018'), 4),
 (('rt', '2009'), 4),
 (('you', '2009'), 4),
 (('are', '2011'), 4),
 (('rt', '2014'), 4),
 (('election', '2013'), 4),
 (('to', '2020'), 4),
 (('of', '2020'), 4),
 (('the', '2011'), 4),
 (('in', '2012'), 3),
 (('on', '2012'), 3),
 (('rt', '2010'), 3),
 (('biden', '2017'), 3),
 (('biden', '2012'), 3),
 (('trump', '2017'), 3),
 (('of', '2017'), 3),
 (('trump', '2009'), 3),
 (('trump', '2018'), 3),
 (('of', '2009'), 3),
 (('the', '2012'), 3),
 (('is', '2012'), 3),
 (('election', '2012'), 3),
 (('you', '2015'), 3),
 (('the', '2013'), 3),
 (('is', '2011'), 3),
 (('to', '2011'), 3),
 (('was', '2008'), 3),
 (('é', '2018'), 3),
 (('rt', '2019'), 3),
 (('of', '2018'), 3),
 (('to', '

## Top-10 Wörter pro Jahr der Account-Erstellung

In [56]:
result = {}

for pair in words_dict.keys():
    word, year = pair
    amount = (word, words_dict[pair])
    
    if year in result.keys():              
        result[year] = result[year] + [(word, words_dict[pair])]
            
    else:
        result[year] = [(word, words_dict[pair])]
        
result

{'2013': [('rt', 6),
  ('@deborabpaim:', 1),
  ('fraude', 1),
  ('descarada!!!', 1),
  ('@abc:', 1),
  ('latest:', 1),
  ('election', 4),
  ('officials', 1),
  ('in', 1),
  ("arizona's", 1),
  ('maricopa', 1),
  ('county', 1),
  ('say', 1),
  ('that', 1),
  ('they', 2),
  ('have', 1),
  ('an', 1),
  ('estimated', 1),
  ('275,000', 1),
  ('more', 1),
  ('ballots', 1),
  ('to', 1),
  ('process', 1),
  ('and', 1),
  ('tabula…', 1),
  ('@zaaain16:', 1),
  ('the', 3),
  ('next', 1),
  ('president', 1),
  ('of', 1),
  ('united', 1),
  ('states,', 1),
  ('mr', 1),
  ('joe', 2),
  ('biden.\n#joebidenkamalaharris2020', 1),
  ('', 1),
  ('#elections2020', 1),
  ('https://t.co/pkx2ophaol', 1),
  ('@leonline2000:', 1),
  ('tan', 1),
  ('risible', 1),
  ('como', 1),
  ('patético', 1),
  ('quienes', 1),
  ('dicen', 1),
  ('trump', 1),
  ('es', 1),
  ('un', 2),
  ('villano', 1),
  ('nivel', 1),
  ('kim', 1),
  ('jong', 1),
  ('al', 1),
  ('impugnar', 1),
  ('resultados', 1),
  ('vía', 1),
  ('#scotus