## Imports

In [18]:
from pymongo import MongoClient
from bson.json_util import loads, dumps
import json
import nltk
from nltk.corpus import stopwords
import collections

## Stopwords

In [30]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A14AEC9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Connecting to database

In [2]:
client = MongoClient('mongodb://localhost:27017')
db = client['twitter']
collection = db['tweets']

## Get data

In [13]:
data = json.loads(dumps(collection.find()))

In [14]:
len(data)

76

In [15]:
data[0]

{'_id': {'$oid': '5fa3c493502013e4dd04544e'},
 'created_at': 'Thu Nov 05 09:23:16 +0000 2020',
 'id': 1324281116065484803,
 'id_str': '1324281116065484803',
 'text': 'RT @deborabpaim: FRAUDE DESCARADA!!!',
 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 'truncated': False,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 1123298202,
  'id_str': '1123298202',
  'name': 'RKSS #USTRA #KOZEL #CRUZ #BOLSONARO #VOTETRUMP2020',
  'screen_name': 'rkss2',
  'location': None,
  'url': None,
  'description': 'Conservative/Conservador Christian/Cristao Every free people may have the right to bear firearms/Toda pessoa livre dever ter o direito de portar armas de fogo',
  'translator_type': 'none',
  'protected': False,
  'verified': False,
  'followers_count': 2261,
  'friends_count': 4488,
  'listed_count': 62,
  'f

In [27]:
data[3]["text"]

'@Alexis_Cossette Mais du coup Biden sera-t-il quand même élu ???'

## Rausfiltern der Tweet-Inhalte

In [41]:
text_strings = []
for d in data:
    if "text" in d.keys():
        text_strings += [d["text"]]

words_dict = {}
for d in data:
    if not "text" in d.keys() or not "user" in d.keys() or not "created_at" in d["user"].keys() or d["user"]["created_at"] == None:
        continue
    text = d["text"].lower()
    created = d["user"]["created_at"][-4:]
    
    for word in text.split(' '):
        pair = (word, created)
        if pair in words_dict.keys():
            words_dict[pair] += 1
        else:
            words_dict[pair] = 1
words_dict

{('rt', '2013'): 6,
 ('@deborabpaim:', '2013'): 1,
 ('fraude', '2013'): 1,
 ('descarada!!!', '2013'): 1,
 ('rt', '2018'): 4,
 ('@realjameswoods:', '2018'): 1,
 ('you’ll', '2018'): 1,
 ('never', '2018'): 1,
 ('hear', '2018'): 1,
 ('about', '2018'): 1,
 ('it', '2018'): 1,
 ('again', '2018'): 1,
 ('if', '2018'): 1,
 ('biden', '2018'): 1,
 ('manages', '2018'): 1,
 ('to', '2018'): 2,
 ('steal', '2018'): 1,
 ('this.', '2018'): 1,
 ('congratulations,', '2018'): 1,
 ('the', '2018'): 4,
 ('pandemic', '2018'): 1,
 ('is', '2018'): 1,
 ('over!', '2018'): 1,
 ('https://t.co/5n…', '2018'): 1,
 ('rt', '2020'): 6,
 ('@simonehorst6:', '2020'): 1,
 ('trump', '2020'): 5,
 ('vai', '2020'): 1,
 ('levar', '2020'): 1,
 ('a', '2020'): 2,
 ('geórgia!\n🤞', '2020'): 1,
 ('@alexis_cossette', '2011'): 1,
 ('mais', '2011'): 1,
 ('du', '2011'): 1,
 ('coup', '2011'): 1,
 ('biden', '2011'): 1,
 ('sera-t-il', '2011'): 1,
 ('quand', '2011'): 1,
 ('même', '2011'): 1,
 ('élu', '2011'): 1,
 ('???', '2011'): 1,
 ('rt', '201

## Top 10 verwendete Wörter (ohne Füllwörter)

In [42]:
ignore_words = stop_words + ['rt', '', '-', '&amp;', 'like', 'get', 'one', 'via', 'new', 'i\'m']
words = [(k,v) for k, v in sorted(words_dict.items(), key=lambda item: -item[1]) if not k in ignore_words]
words[:10]

[(('the', '2017'), 10),
 (('the', '2020'), 8),
 (('rt', '2017'), 7),
 (('rt', '2013'), 6),
 (('rt', '2020'), 6),
 (('rt', '2012'), 6),
 (('the', '2009'), 6),
 (('trump', '2020'), 5),
 (('trump', '2012'), 5),
 (('in', '2017'), 5)]

## Analyse der Location der Twitter-User

In [37]:
location_strings = []
for d in data:
    if "user" in d.keys() and "location" in d["user"].keys() and d["user"]["location"] != None:
        location_strings += [d["user"]["location"]]
location_strings

['Miamisburg, OH',
 'Bushkill Twp, PA',
 'Hertfordshire',
 'In your head Zombie',
 'Los Angeles, CA',
 'Florida, USA',
 'Thomaston, GA',
 'Tempe, AZ',
 'Madrid, Comunidad de Madrid',
 'Dublin , Ireland ',
 'Los Angeles / Vegas',
 'Lagos, Nigeria',
 'North Fort Myers, FL',
 'Huddersfield',
 'Tasmania, Australia',
 'USA',
 'Middlesbrough, England',
 'blue planet in space',
 'Nairobi',
 'Back Porch Ashram',
 'Goiânia, Brasil',
 ' Le Havre',
 'Birmingham, England',
 'VENEZUELA',
 'North West England',
 'Orlando,Fl',
 'he/him/cis/white',
 'Florida, USA',
 'United Kingdom',
 'Glasgow, Scotland',
 '𝙶𝚑𝚎𝚝𝚝𝚘🏴\u200d☠ 𝙺𝙴🇰🇪',
 '01010111 01111001 01100100',
 'California, USA',
 'I love Koi fish',
 'Nairobi - Kenya',
 '🌐',
 'Weltweit',
 'McKinney, TX',
 'California',
 'Nigeria',
 'United Kingdom',
 'Twitter, USA',
 'Bonnie Argyll/Bonnie Scotland',
 'The internet',
 'Baton Rouge']

## Zeitpunkt der Account-Erstellung

In [39]:
created = []
for d in data:
    if "user" in d.keys() and "created_at" in d["user"].keys() and d["user"]["created_at"] != None:
        created += [d["user"]["created_at"][-4:]]
created

['2013',
 '2018',
 '2020',
 '2011',
 '2012',
 '2007',
 '2008',
 '2015',
 '2010',
 '2020',
 '2017',
 '2012',
 '2017',
 '2009',
 '2010',
 '2018',
 '2011',
 '2009',
 '2010',
 '2012',
 '2014',
 '2009',
 '2013',
 '2017',
 '2009',
 '2017',
 '2016',
 '2020',
 '2012',
 '2012',
 '2015',
 '2015',
 '2013',
 '2011',
 '2020',
 '2008',
 '2014',
 '2018',
 '2014',
 '2020',
 '2012',
 '2010',
 '2020',
 '2011',
 '2011',
 '2013',
 '2017',
 '2011',
 '2019',
 '2009',
 '2012',
 '2014',
 '2017',
 '2018',
 '2011',
 '2019',
 '2018',
 '2013',
 '2016',
 '2014',
 '2017',
 '2009',
 '2013',
 '2020',
 '2017',
 '2015',
 '2009',
 '2019',
 '2019',
 '2013',
 '2017',
 '2012',
 '2020',
 '2016',
 '2011']

## Tupel für Wort-Jahr-Anzahl

In [None]:
orangensaft = [(w, y, c) for w, c in words_dict for ]