In [4]:
import datetime
import json
import os

import pandas as pd
import tweepy
from quantiphy import Quantity

# Script for crawling tweets

In [2]:
# You can obtain these credentials by creating an account on apps.twitter.com.
# Each value is read from an environment variable when one is set, so real
# secrets never have to be typed into (and saved with) the notebook; the
# placeholder text is only the fallback.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "Insert your consumer key")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "Insert your consumer secret key")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "Insert your access token")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "Insert your access token secret")

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# When a rate limit is hit, sleep until it resets instead of failing, and
# announce every sleep (the "Rate limit reached. Sleeping for: ..." lines).
api.wait_on_rate_limit = True
api.wait_on_rate_limit_notify = True

In [3]:
# Load the corpus spreadsheet; the first column becomes the index and the
# "ID" column is passed through str so tweet ids are kept as text.
maga = pd.read_excel(
    "the MAGA corpus.xlsx",
    index_col=0,
    converters={"ID": str},
)

In [4]:
# Show the first tweet id as a string (sanity check that ids are intact).
str(maga["ID"].iloc[0])

'996212720046243840'

In [5]:
# Plain Python list of every tweet id in the corpus, in row order.
tweets_ids = maga["ID"].tolist()

In [6]:
# Accumulators filled by the crawl loop: raw tweet JSON payloads, and one
# record per tweet id that could not be fetched.
tweets, not_crawled = [], []

In [8]:
# We use the tweets' ids in order to get the full json of each tweet including its location

for tweet_id in tweets_ids:
    try:
        tweet = api.get_status(tweet_id, tweet_mode='extended')
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt can still stop this long-running crawl.
        # The message is taken from the exception raised for *this* id;
        # `tweet` may be unbound here (first iteration) or still refer to
        # the previously fetched tweet, so it must not be used.
        response = getattr(err, "response", None)
        not_crawled.append({
            "tweet_id": tweet_id,
            "message": response.text if response is not None else str(err),
        })
    else:
        tweets.append(tweet._json)

Rate limit reached. Sleeping for: 514
Rate limit reached. Sleeping for: 535
Rate limit reached. Sleeping for: 532
Rate limit reached. Sleeping for: 534
Rate limit reached. Sleeping for: 534
Rate limit reached. Sleeping for: 529
Rate limit reached. Sleeping for: 381


In [9]:
# Number of tweet ids for which the crawl loop recorded a failure
# (presumably tweets that are no longer available).
len(not_crawled)

1136

In [75]:
# Persist every crawled tweet payload under a single "tweets" key.
data = dict(tweets=tweets)
with open('tweets_maga.json', mode='w') as f:
    # Serialise straight into the open file handle.
    json.dump(data, f)

# Locating the tweets

Here is a set of steps to get the locations of the tweets in the MaGa Corpus.

In [28]:
# Identifying the geolocated tweets: print the coordinates of every tweet
# that carries them and tally the ones that do not.
count_geo = 0
for tweet in tweets:
    if tweet["coordinates"] is not None:
        print(tweet["coordinates"])
    else:
        # count_geo is the number of tweets WITHOUT coordinates
        count_geo += 1

{'type': 'Point', 'coordinates': [-79.29448005, 43.69369369]}
{'type': 'Point', 'coordinates': [-86.6957927, 36.115582]}
{'type': 'Point', 'coordinates': [-86.8574, 35.9291]}


In [45]:
# Getting the valid locations: for tweets without coordinates, fall back to
# the free-text location of the author's profile.
locations = set()
count_geo = 0
for tweet in tweets:
    # Truthiness skips both "" and None, so a missing profile location can
    # never enter the set and break the later `location.strip()` call.
    if tweet["coordinates"] is None and tweet["user"]["location"]:
        # count_geo: tweets with a usable profile location (duplicates
        # counted); `locations` keeps only the distinct strings.
        count_geo += 1
        locations.add(tweet["user"]["location"])

In [46]:
# Number of distinct non-empty profile location strings collected above
len(locations)

1984

## The following code obtains coordinates, or a country, from a location string

In [65]:
# coding=utf-8
from __future__ import unicode_literals
import unicodedata
import unidecode
from geopy.geocoders import Nominatim
from geopy.distance import vincenty, great_circle


class GeoUtilities():
    """Helpers for turning free-text locations into coordinates or countries.

    Two strategies are offered: online lookups through a Nominatim geocoder
    (``address_from_coord``, ``coord_from_address``, ``country_from_text``)
    and an offline token match against a local cities database
    (``country_from_text_alt`` / ``f``).
    """

    def __init__(self, user_agent="maga-corpus-geo-utilities"):
        """Create the Nominatim geocoder used by the online lookups.

        Args:
            user_agent: Identifier sent to the Nominatim service. The
                default keeps ``GeoUtilities()`` working without arguments.
        """
        # Nominatim expects callers to identify themselves; constructing it
        # without a user agent is deprecated/rejected by geopy.
        self.geolocator = Nominatim(user_agent=user_agent)

    def address_from_coord(self, lat, lon):
        coord = str(lat) + "," + str(lon)
        return self.geolocator.reverse(coord).address

    def coords_distance(self, coords1, coords2, method="vincenty"):
        """Distance in kilometers between two coordinate pairs.

        Args:
            coords1, coords2: The two points, passed unchanged to the
                selected geopy distance class.
            method: ``"vincenty"`` (default) or ``"great_circle"``.

        Raises:
            ValueError: if ``method`` is neither of the two names.
        """
        if method not in ("vincenty", "great_circle"):
            raise ValueError("Distance method not recognized")
        # Resolve the measure lazily so only the requested one is touched.
        measure = vincenty if method == "vincenty" else great_circle
        return measure(coords1, coords2).kilometers

    def coord_from_address(self, address):
        location = self.geolocator.geocode(address)
        if not location:
            return "Not Found"
        return tuple([location.latitude, location.longitude])

    def country_from_text(self, text):
        location = self.geolocator.geocode(text)
        if not location:
            return "Not Found"
        return location.address.split(",")[-1].strip()

    def country_from_text_alt(self, text):
        lista = []
        countries = self.get_country_db()
        country = self.f(countries, text, lista)
        return country

    def f(self, db, s, list):
        s = self.process_string(s, list)
        for country, cities in db.items():
            if country in s:
                #print country
                return country
            if self.find(s, cities):
                #print country
                return country
        #print "Not Found"
        return "Not Recognizable"

    @staticmethod
    def find(query_list, cities):
        for query in query_list:
            if query in cities:
                return True
        return False

    def process_string(self, s, list):
        try:
            s = str(s)
        except UnicodeDecodeError as e:
            raise TypeError("Please, append 'u' before the query or convert the str to unicode.")
        s = unicodedata.normalize('NFKC', s)
        s = unidecode.unidecode(s)
        s = s.lower()
        new = ""
        for c in s:
            if c.isalpha() == False:
                new += " "
            else:
                new += c
        query_list = new.split(" ")
        while "" in query_list:
            query_list.remove("")
        list += query_list
        return query_list

    @staticmethod
    def get_country_db():
        """Build the country -> place-name lookup used by ``f``.

        Reads "citiesDB.txt" (UTF-8) from the current working directory,
        skips its first line, and splits every remaining line on ",".
        Field 4 (lower-cased) is used as the country key; fields 6 and 7
        (lower-cased, when non-empty) are added to that country's set.
        The split is a plain ``str.split``, so keys keep any literal double
        quotes present in the file (e.g. '"united states"').

        A long, hand-curated list of corrections is then applied: names are
        added to, or removed from, individual countries to reduce wrong
        matches. The statements are order-dependent and must not be
        reordered casually.

        Returns:
            dict mapping country name (str) to a set of lower-case names.

        Raises:
            KeyError: if a country key or a name targeted by ``del`` /
                ``.remove`` below is missing from the loaded file.

        NOTE(review): the file handle is closed with an explicit
        ``f.close()``, so it stays open if parsing raises; a ``with`` block
        would be safer.
        NOTE(review): a few added entries contain upper-case or accented
        characters ("USA", 'Montréal', 'België', 'Zürich', 'México',
        'españa'). ``process_string`` lower-cases and transliterates
        queries, so these probably never match - confirm and normalise.
        """
        f = open("citiesDB.txt", 'r', encoding="utf8")
        i = 0
        db = {}
        for line in f:
            # Skip the first line of the file (presumably a header row).
            if i == 0:
                i += 1
                continue
            line = str(line.strip())
            info = line.split(",")
            country = info[4].lower()
            if country not in db:
                db[country] = set()
            if info[6] != "":
                db[country].add(info[6].lower())
            if info[7] != "":
                db[country].add(info[7].lower())
            # Fields 3 and 5 are deliberately not indexed (kept disabled):
            # if info[3] != "":
            #    db[country].add(info[3].lower())
            # if info[5] != "":
            #    db[country].add(info[5].lower())
        f.close()
        
        # Two-letter codes merged into the United States entry (presumably
        # US state postal abbreviations, despite the variable name).
        us_cities = {'al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'de', 'fl', 'ga', 'hi', 'id', 'il', 'in', 'ia', 'ks',
                     'ky',
                     'la', 'me', 'md', 'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 'nh', 'nj', 'nm', 'ny', 'nc',
                     'nd',
                     'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy'}
        db['"united states"'] = db['"united states"'].union(us_cities)
        # Manual corrections: drop the "jersey" country entry entirely, then
        # add missing aliases and remove ambiguous place names per country.
        del db["jersey"]
        db['mexico'].add("xico")
        db['mexico'].add("queretaro")
        db['belgium'].add("belgique")
        db['belgium'].add("bruxelles")
        db['belgium'].add("gent")
        db['italy'].add("napoli")
        db['germany'].add("munchen")
        db['germany'].add("zegveld")
        db['canada'].add("montreal")
        db['france'].remove("montreal")
        db['"united kingdom"'].add("kingdom")
        db['"united kingdom"'].add("uk")
        db['"united kingdom"'].add("brixton")
        db['"united states"'].add("states")
        db['"united states"'].add('nyc')
        db['"united states"'].add("francisco")
        db['"united states"'].add("USA")
        db['"united states"'].add("angeles")
        db['"united states"'].add("diego")
        db['"united states"'].add("louis")
        db['"united states"'].add("antonio")
        db['"united states"'].add("marcos")
        db['"united states"'].add("mateo")
        db['"united states"'].add("jose")
        db['"united states"'].add("vegas")
        db['"united states"'].remove("manchester")
        db['"united states"'].remove("london")
        db['"united states"'].remove("italy")
        db['"united states"'].remove("delhi")
        db['"united states"'].remove("montreal")
        db['greece'].remove('rio')
        db['"south africa"'].remove("london")
        db['netherlands'].add("limburg")
        db['finland'].add("suomi")
        db['japan'].remove('usa')
        db['germany'].add("deutschland")
        db['"south korea"'].add("corea")
        db['venezuela'].remove("barcelona")
        db['bahrain'].remove("northern")
        db['fiji'].remove("western")
        db['brazil'].add("brasil")
        db['brazil'].add("paulo")
        db['argentina'].add("buenos")
        db['colombia'].add("bogot")
        db['colombia'].add("medellin")
        db['spain'].add('españa')
        db['spain'].add("chamberi")
        db['spain'].add("sevilla")
        db['"south africa"'].add("cape")
        db['"south africa"'].add("africa")
        db['argentina'].remove('victoria')
        db['canada'].remove('victoria')
        db['malaysia'].remove('victoria')
        db['"united kingdom"'].remove('victoria')
        db['panama'].remove('victoria')
        db['mexico'].remove('victoria')
        db['romania'].remove('victoria')
        db['seychelles'].remove('victoria')
        db['chile'].remove('victoria')
        db['"trinidad and tobago"'].remove('victoria')
        db['canada'].add('peel')
        db['"isle of man"'].remove('peel')
        db['"united states"'].remove('victoria')
        db['"united states"'].remove('bogota')
        db['spain'].remove("trujillo")
        db['venezuela'].remove("trujillo")
        db['sweden'].remove('lima')
        db['"united states"'].remove('lima')
        db['argentina'].remove('lima')
        db['mexico'].remove('mendoza')
        db['uruguay'].remove('florida')
        db['argentina'].remove('florida')
        db['"puerto rico"'].remove('florida')
        db['"united states"'].remove('palermo')
        db['italy'].remove('palermo')
        db['"united states"'].remove('ghent')
        db['"united states"'].remove('antwerp')
        db['"united states"'].remove('amsterdam')
        db['"united states"'].remove('breda')
        db['"united states"'].remove('manila')
        db['spain'].remove('breda')
        db['philippines'].remove("nueva")
        db['philippines'].remove("rosario")
        db['philippines'].remove("santiago")
        db['colombia'].remove("santiago")
        db['brazil'].remove("santiago")
        db['peru'].remove("santiago")
        db['mexico'].remove("santiago")
        db['"united states"'].remove("zurich")
        db['"united states"'].remove("geneva")
        db['spain'].remove('cartagena')
        db['colombia'].remove('miami')
        db['canada'].remove('miami')
        db['"united states"'].remove("belgrade")
        db['"united states"'].remove("athens")
        db['spain'].remove('león')
        db['nicaragua'].remove('león')
        db['colombia'].add('bogota')
        db['spain'].remove('leon')
        db['nicaragua'].remove('granada')
        db['panama'].remove('sacramento')
        db['"united states"'].remove("granada")
        db['"united states"'].add("sacramento")
        db['brazil'].remove("california")
        db['"trinidad and tobago"'].remove("california")
        db['brazil'].remove('sacramento')
        db['spain'].remove('palmeira')
        db['namibia'].remove('olympia')
        db["ireland"].remove("virginia")
        db["brazil"].remove("virginia")
        db["australia"].remove("virginia")
        db['"south africa"'].remove("virginia")
        db['canada'].remove('nantes')
        db['brazil'].remove('nantes')
        db['brazil'].remove('orleans')
        db['canada'].remove('orleans')
        db['panama'].remove('peru')
        db['"united states"'].remove('peru')
        db['belgium'].remove("belgrade")
        db['"united states"'].add("nueva")
        db['belgium'].remove('essen')
        db['belgium'].remove('waterloo')
        db['romania'].remove('alexandria')
        db['"united states"'].remove('alexandria')
        db['"united kingdom"'].remove('alexandria')
        db['australia'].remove('alexandria')
        db['canada'].remove('alexandria')
        db['romania'].remove('roma')
        db['"united states"'].remove('roma')
        db['lesotho'].remove('roma')
        db['australia'].remove('roma')
        db['italy'].add('roma')
        db['"united states"'].remove('vienna')
        db['canada'].remove('vienna')
        db['austria'].remove('haag')
        db['"united states"'].remove('cairo')
        db['"united states"'].remove('lyon')
        db['"hong kong"'].add('hong')
        db['"united states"'].remove('paris')
        db["canada"].remove("paris")
        db['canada'].remove("bellevue")
        db['switzerland'].remove('bellevue')
        db['australia'].remove('bellevue')
        db['"united states"'].remove('leon')
        db['"united states"'].remove('versailles')
        db['"united states"'].remove('waterford')
        db['"united states"'].remove('dundalk')
        db['"united states"'].remove('island')
        db['"united states"'].remove('dublin')
        db['"united states"'].remove('galway')
        db['portugal'].remove('lagos')
        db['spain'].remove('cuenca')
        db["australia"].remove('newcastle')
        db["canada"].remove('newcastle')
        db['"south africa"'].remove('newcastle')
        db['"united states"'].remove('newcastle')
        db['spain'].remove('loja')
        db['ireland'].remove('island')
        db['canada'].remove('waterford')
        db['australia'].remove('waterford')
        db["france"].remove("north")
        db['"united states"'].remove('north')
        db['france'].remove('leon')
        db['france'].remove('joinville')
        db['"south africa"'].remove('buffalo')
        db['brazil'].remove('carolina')
        db['"united states"'].remove('prague')
        db['chile'].remove('casablanca')
        db['brazil'].remove('colina')
        db['"south africa"'].remove('carolina')
        db['spain'].remove('almonte')
        db['"el salvador"'].remove('zaragoza')
        db['spain'].remove('guadalajara')
        db['venezuela'].remove('valencia')
        db['"united states"'].remove('valencia')
        db['"united states"'].remove('stockholm')
        db['russia'].remove('china')
        db['"united states"'].remove("china")
        db['"united states"'].remove("wales")
        db['"united kingdom"'].remove("boston")
        db['"united states"'].remove("bradford")
        db['"united states"'].remove("stuttgart")
        db['"united states"'].remove("birmingham")
        db['"united states"'].remove("ottawa")
        db['"united states"'].remove("west")
        db['"united states"'].remove("england")
        db['"united states"'].remove("melbourne")
        db['canada'].remove('houston')
        db['"united kingdom"'].remove('houston')
        db['"united states"'].remove("bristol")
        db['"united states"'].remove("marina")
        db['"united states"'].remove('brighton')
        db['canada'].remove('brighton')
        db['australia'].remove('brighton')
        db['"united states"'].remove('norwich')
        db['canada'].remove('norwich')
        db['canada'].remove('london')
        db['"united states"'].remove('sunderland')
        db['"united states"'].remove('hamburg')
        db['"united states"'].remove('nottingham')
        db['"united kingdom"'].remove('washington')
        db['"united states"'].remove('milan')
        db['"united states"'].remove('moscow')
        db['"united states"'].remove('northampton')
        db['"united states"'].remove('montevideo')
        db['"united kingdom"'].remove('rochester')
        db['australia'].remove('york')
        db['canada'].add('Montréal')
        db['belgium'].add('bruseel')
        db['belgium'].add('België')
        db['switzerland'].add('Zürich')
        db['"united states"'].remove('liverpool')
        db['canada'].remove('liverpool')
        db['australia'].remove('liverpool')
        db['peru'].add('peru')
        db['australia'].remove('melbourne')
        db['mexico'].remove('valladolid')
        db['mexico'].add('México')
        db['colombia'].remove('madrid')
        db['"united states"'].remove('madrid')
        db['"united states"'].remove('leicester')
        db['"united states"'].remove('wimbledon')
        db['"south africa"'].remove('phoenix')
        db['"puerto rico"'].remove('carolina')
        db['"south africa"'].remove('brooklyn')
        db['"united states"'].remove('swansea')
        db['"united states"'].remove('brisbane')
        db['australia'].remove('brooklyn')
        db['ecuador'].add('manabi')
        # Pseudo-locations that are intentionally left out (kept disabled):
        # db['internet'] = {"internet"}
        # db['global'] = {"global", "worldwide","world","everywhere","earth","planet"}
        # db['europe'] = {"europa","europe"}
        return db

In [69]:
#Getting the countries by location

# Create the helper here so the cell runs on a fresh kernel, and load the
# country database once: get_country_db() re-reads and re-patches
# citiesDB.txt on every call, and f() only reads from it.
geolocator = GeoUtilities()
country_db = geolocator.get_country_db()

countries_by_location = dict()

for location in locations:
    # Map each distinct profile location to a country name, or to
    # "Not Recognizable" when nothing in the database matches.
    countries_by_location[location] = geolocator.f(country_db, location.strip(), [])

In [71]:
# Distinct countries inferred from the authors' profile locations.
# The set also contains the placeholder 'Not Recognizable' for locations
# that could not be matched to any country.
set(countries_by_location.values())

{'"antigua and barbuda"',
 '"bosnia and herzegovina"',
 '"hong kong"',
 '"new zealand"',
 '"saudi arabia"',
 '"south africa"',
 '"united arab emirates"',
 '"united kingdom"',
 '"united states"',
 'Not Recognizable',
 'australia',
 'austria',
 'bangladesh',
 'belgium',
 'botswana',
 'bulgaria',
 'canada',
 'czechia',
 'denmark',
 'finland',
 'france',
 'germany',
 'greece',
 'hungary',
 'iceland',
 'india',
 'iran',
 'ireland',
 'israel',
 'italy',
 'japan',
 'libya',
 'malaysia',
 'mauritius',
 'mongolia',
 'netherlands',
 'nigeria',
 'norway',
 'pakistan',
 'peru',
 'philippines',
 'portugal',
 'russia',
 'slovenia',
 'spain',
 'sweden',
 'switzerland',
 'thailand',
 'venezuela',
 'vietnam'}

In [72]:
# Number of distinct labels inferred from the authors' profile locations.
# NOTE: this count includes the 'Not Recognizable' placeholder, so the
# number of actual countries is one less than the value shown.

len(set(countries_by_location.values()))

50