# Most common words and bigrams in the last minutes per location in Spain

**Author: Daniel Roncel Díaz**

**Date: 30/03/2021**

In [1]:
import codecs
import tweepy
import json
import time
import re # RegEx
import pandas as pd
import numpy as np
import os
from datetime import datetime
import emoji
from nltk.util import ngrams
from nltk.corpus import stopwords
from collections import Counter

## 1. Load cities name

**Get the current path to access easily to input and output folders:**

In [2]:
folder_path = os.getcwd()

**Load all the Spain cities and its coordinates:**

In [3]:
cities_file = folder_path+'\input\spain_cities.csv'
df_cities = pd.read_csv(cities_file)

# cities_dict[city] = (latitude, longitude)
cities_dict = dict()
for idx, row in df_cities.iterrows():
    cities_dict[row['town']] = (row['latitude'], row['longitude'])
    
# Show a few cities in the dictionary    
list(cities_dict.keys())[:5]

['Abla', 'Abrucena', 'Adra', 'Albánchez', 'Alboloduy']

In [4]:
def location_filter(location, cities_dict, valid_end):
    """
    Returns the city the location parameter is associated to.
    If the city cannot be identified, returns and empty string.
    
    Arguments:
        location:
            string associated with the city that it is requested to find
        cities_dict:
            dictionary whose keys are the valid possible cities
        valid_end:
            set of possible valid suffixes (ignoring punctuation marks)
    """
    if location in cities_dict:
        coord = cities_dict[location]
        return location
    else:
        # Split location by any character not included below
        # Ensure that the accents are preserved when working with Spanish texts
        location_split  = re.split(r'[^A-ZÁÀÉÈÍÌÏÓÒÚÜa-záàéèíìïóòúùü0-9]+', location)

        # Remove all empty strings
        location_split = [location_split[i] for i in range(len(location_split)) if location_split[i] != '']
        
        if len(location_split) <= 1:
            return ''
        elif location_split[0] in cities_dict:
            if location_split[-1] in valid_end or location_split[-1] in cities_dict:
                coord = cities_dict[location_split[0]]
                return location_split[0]
            else:
                return ''
    return ''

**Load all the valid ends of a user location:**

In [5]:
valid_end = set()

file_name = folder_path+"/input/valid_ends.txt"
f = codecs.open(file_name, 'r', encoding='utf8')

city = f.readline()
while city != 'END':
    valid_end.add(city[:-2])
    city = f.readline()
f.close()

valid_end

{'Andalucía',
 'Aragón',
 'Asturias',
 'Canarias',
 'Cantabria',
 'Castilla y León',
 'Castilla-La Mancha ',
 'Catalunya',
 'Cataluña',
 'Ceuta',
 'Comunidad Valenciana',
 'Comunidad de Madrid',
 'Comunitat Valenciana',
 'España',
 'Euskadi',
 'Extremadura',
 'Galicia',
 'Galiza',
 'Islas Baleares',
 'Islas Canarias',
 'La Rioja',
 'Melilla',
 'Navarra',
 'País Vasco',
 'Principado de Asturias',
 'Región de Murcia',
 'Spain'}

In [6]:
# Test
location = "Madrid, Spain"
location_filter(location, cities_dict, valid_end)

'Madrid'

## 2. Set up our crawler

In [7]:
class Listener(tweepy.streaming.StreamListener):
    """
    Twitter crawler for tweet streaming.
    """
    def __init__(self, cities_dict, valid_end, cities_filter_function, max_minutes, \
                                             json_tweets_file, max_tweets=None, api=None):
        """
        Arguments:
            cities_dict:
                dictionary whose keys are the valid cities
                from which we can get tweets.
            valid_end:
                set of possible valid suffixes (ignoring punctuation marks).
            cities_filter_function:
                returns the city the location parameter is associated to. 
                If the location cannot be processed or it is not valid, 
                returns an empty string.
            max_minutes:
                maximum number of minutes streaming tweets.
            max_tweets:
                maximum number of tweets to stream.
            json_tweets_file:
                file to store the streamed tweets.
            api:
                attribute of the parent class. It won't be used in this code.
        """
        super(tweepy.streaming.StreamListener, self).__init__()
        self.cities_dict = cities_dict
        self.valid_end = valid_end
        self.cities_filter_function = cities_filter_function
        self.num_tweets = 0 # Current read tweets
        self.max_tweets = max_tweets
        self.time_limit = time.time() + max_minutes*60
        self.json_tweets_file = json_tweets_file
        self.api = api
        
    def on_data(self, data):
        """
        Function called every time a tweet is streamed. If the user's location is valid
        and can be retrieved by self.cities_filter_function, stores the tweet with its city 
        in json format in self.json_tweets_file.
        
        """
        try:
            # Append the new tweet in the file, if location is retrived
            with open(self.json_tweets_file, 'a') as f:
                tweet_json = json.loads(data)
                location = tweet_json['user']['location']
                
                if location != None:
                    # Try to retrieve user's location
                    user_location = self.cities_filter_function(location, self.cities_dict, self.valid_end)
                    # If it could be retrieved
                    if user_location != '':
                        tweet_json['user_location'] = user_location
                        f.write(json.dumps(tweet_json) + "\n\n")
                        
                        # Print the retrieved location. Useful for testing cities_filter_function
                        print(location +" -> "+user_location)
                        
                        self.num_tweets += 1
                        # If we read enough tweets our streamed for enough time, stop the exection
                        if self.max_tweets != None and self.num_tweets >= self.max_tweets:
                            return False
                        if self.time_limit < time.time():
                            return False
                        return True
                return True
        # If an error occurs, stop the execution
        except Exception as e:
            print(type(e).__name__)
            return True
        
    def on_error(self, status):
        print('Error :', status)
        return False

In [8]:
# Fill your twitter developer keys and secrets
consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

In [9]:
# Set your keywords for the search
keywords = ["madrid"]
# New folder name: keyword1_keyword2_..._keywordN_DAY-MONTH-YEAR_HOUR.MINUTE.SECOND.jsonl
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y_%H.%M.%S")
output_folder = folder_path+'/output/'+'_'.join(keywords)+'_'+dt_string
os.mkdir(output_folder)
# File name: keyword1_keyword2_..._keywordN.jsonl
# It will contain the tweets in json format
json_tweets_file = output_folder+'/'+'_'.join(keywords)+'.jsonl'

# Execution time
max_minutes = 1
twitter_stream = tweepy.streaming.Stream(auth, Listener(cities_dict=cities_dict, valid_end=valid_end, cities_filter_function=location_filter, max_minutes=max_minutes, json_tweets_file = json_tweets_file))
# Filter for the given keywords in Spanish tweets
twitter_stream.filter(track=keywords, languages=['es']) 

print('_______ End _______')

Esparreguera -> Esparreguera
Aranjuez -> Aranjuez
Madrid - Valencia -> Madrid
Alcobendas.Madrid -> Alcobendas
Madrid -> Madrid
Madrid -> Madrid
Avilés -> Avilés
Madrid -> Madrid
Cádiz -> Cádiz
Madrid (Spain) -> Madrid
Madrid, Comunidad de Madrid -> Madrid
Écija, Sevilla -> Écija
Madrid -> Madrid
Madrid -> Madrid
Valencia -> Valencia
Barcelona -> Barcelona
Madrid -> Madrid
Valencia -> Valencia
Madrid -> Madrid
Madrid -> Madrid
Leganés -> Leganés
Madrid -> Madrid
Madrid -> Madrid
Algeciras / Granada -> Algeciras
Madrid -> Madrid
Madrid - Azeroth - Bilbao  -> Madrid
Sevilla. Natural de Sevilla -> Sevilla
Madrid -> Madrid
Madrid -> Madrid
Madrid -> Madrid
Madrid, Comunidad de Madrid -> Madrid
Utebo (Zaragoza) -> Utebo
Collado Villalba, Madrid -> Collado
Madrid -> Madrid
Zaragoza/ Madrid/ Barcelona / -> Zaragoza
Barcelona y Sitges -> Barcelona
Chiva, Valencia -> Chiva
Madrid -> Madrid
Madrid, Comunidad de Madrid -> Madrid
Avilés -> Avilés
Madrid, Comunidad de Madrid -> Madrid
Santa Cruz de 

## 3. Process the stored data

**Load the stored tweets in a pandas Dataframe:**

In [10]:
df = pd.read_json(json_tweets_file, lines=True)
df.head()

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,...,reply_count,retweet_count,retweeted,retweeted_status,source,text,timestamp_ms,truncated,user,user_location
0,,,2021-03-30 12:33:26,,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,,0,False,low,...,0,0,False,{'created_at': 'Tue Mar 30 12:09:44 +0000 2021...,"<a href=""https://mobile.twitter.com"" rel=""nofo...","RT @marianpy1: No tienen vergüenza, esto deber...",2021-03-30 12:33:26.167,False,"{'id': 636170404, 'id_str': '636170404', 'name...",Esparreguera
1,,,2021-03-30 12:33:27,,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,,0,False,low,...,0,0,False,{'created_at': 'Tue Mar 30 12:29:57 +0000 2021...,"<a href=""http://twitter.com/download/android"" ...",RT @pradobenjamin: Ayuso pone barra libre en M...,2021-03-30 12:33:27.427,False,"{'id': 375801242, 'id_str': '375801242', 'name...",Aranjuez
2,,,2021-03-30 12:33:29,,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,,0,False,low,...,0,0,False,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",DEL AEROPUERTO DE BARAJAS AL CENTRO DE MADRID ...,2021-03-30 12:33:29.931,False,"{'id': 541618181, 'id_str': '541618181', 'name...",Madrid
3,,,2021-03-30 12:33:32,"[15, 62]","{'hashtags': [{'text': 'SiSePuede', 'indices':...",,,0,False,low,...,0,0,False,,"<a href=""http://twitter.com/download/android"" ...",@PabloIglesias Gracias por tu esfuerzo.A por M...,2021-03-30 12:33:32.446,False,"{'id': 1876488498, 'id_str': '1876488498', 'na...",Alcobendas
4,,,2021-03-30 12:33:33,,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,,0,False,low,...,0,0,False,{'created_at': 'Tue Mar 30 12:32:29 +0000 2021...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @Rita_Maestre: Día histórico para Vallecas ...,2021-03-30 12:33:33.362,False,"{'id': 338560854, 'id_str': '338560854', 'name...",Madrid


### 3.1 Normalize the tweets

In [11]:
def normalize(text):
    """
    Remove all the emojis, accents, undesired characters
    in a string and convert it to lowercase.
    
    Arguments:
        text:
            string to normalize.
    """
    # Delete emojis
    text = re.sub(emoji.get_emoji_regexp(), r' ', text)
    # Delete accents. With this approach we can keep the 'ñ' char, common in Spanish
    text = text.lower()
    original_char = 'ÁÀÉÈÍÌÏÓÒÚÙÜáàéèíìïóòúùü'
    new_char = 'AAEEIIIOOUUUaaeeiioouuu'
    
    for original, new in zip(original_char, new_char):
        text = text.replace(original, new)
    
    # Delete all the chars not included below. Transform it to lowercase
    text = re.sub(r'[^A-ZÑa-zñ0-9_ ]+', ' ', text).lower()
    
    return text

**See some of the original tweets to compare them with the normalized ones:**

In [12]:
df['text'].head()

0    RT @marianpy1: No tienen vergüenza, esto deber...
1    RT @pradobenjamin: Ayuso pone barra libre en M...
2    DEL AEROPUERTO DE BARAJAS AL CENTRO DE MADRID ...
3    @PabloIglesias Gracias por tu esfuerzo.A por M...
4    RT @Rita_Maestre: Día histórico para Vallecas ...
Name: text, dtype: object

**Normalize the tweets:**

In [13]:
df['text_normalized'] = df['text'].apply(normalize)
df['text_normalized'].head()

0    rt  marianpy1  no tienen verg enza  esto deber...
1    rt  pradobenjamin  ayuso pone barra libre en m...
2    del aeropuerto de barajas al centro de madrid ...
3     pabloiglesias gracias por tu esfuerzo a por m...
4    rt  rita_maestre  dia historico para vallecas ...
Name: text_normalized, dtype: object

### 3.2 Compute the list of words of each tweet

In [14]:
def text_to_words(text, stopwords_set, min_size):
    """
    Return all the words in a string that aren't stopwords and
    that are at least as large as the specified size.
    
    Arguments:
        text:
            string whose words we are interested in.
        stopwords_set:
            set of stopwords.
        min_size:
            minimum size of a word to be returned.
    """
    words = text.split()
    words = [word for word in words if word != '' and len(word) >= MIN_SIZE and word not in stopwords_set]
    return words

In [15]:
# Stopwords are the words that we want to delete from all tweets
STOPWORDS = set(stopwords.words('spanish') + ["rt", "http", "https", "co", "com", "es"])
# Minimum size of a word to be stored
MIN_SIZE = 2

df['text_words'] = df['text_normalized'].apply(lambda text: text_to_words(text, STOPWORDS, MIN_SIZE))
df['text_words'].head()

0    [marianpy1, verg, enza, deberia, ser, delito, ...
1    [pradobenjamin, ayuso, pone, barra, libre, mad...
2    [aeropuerto, barajas, centro, madrid, j0hvw8ss...
3    [pabloiglesias, gracias, esfuerzo, madrid, sis...
4    [rita_maestre, dia, historico, vallecas, madri...
Name: text_words, dtype: object

### 3.2 Compute the list of bigrams of each tweet

In [16]:
df['text_bigrams'] = df['text_words'].apply(lambda text: list(ngrams(text,2)))
df['text_bigrams'].head()

0    [(marianpy1, verg), (verg, enza), (enza, deber...
1    [(pradobenjamin, ayuso), (ayuso, pone), (pone,...
2    [(aeropuerto, barajas), (barajas, centro), (ce...
3    [(pabloiglesias, gracias), (gracias, esfuerzo)...
4    [(rita_maestre, dia), (dia, historico), (histo...
Name: text_bigrams, dtype: object

### 3.3 Compute the top used words per location

In [17]:
df_wb = df[['text_words', 'text_bigrams', 'user_location']]
df_wb = df_wb.groupby('user_location').sum()
df_wb.head()

Unnamed: 0_level_0,text_words,text_bigrams
user_location,Unnamed: 1_level_1,Unnamed: 2_level_1
Alcobendas,"[pabloiglesias, gracias, esfuerzo, madrid, sis...","[(pabloiglesias, gracias), (gracias, esfuerzo)..."
Algeciras,"[noelia_n, magic, english, spain, means, españ...","[(noelia_n, magic), (magic, english), (english..."
Aranjuez,"[pradobenjamin, ayuso, pone, barra, libre, mad...","[(pradobenjamin, ayuso), (ayuso, pone), (pone,..."
Avilés,"[policia, detenidos, seis, jovenes, banda, lat...","[(policia, detenidos), (detenidos, seis), (sei..."
Barcelona,"[miirerme, ojala, tener, autoestima, madrileño...","[(miirerme, ojala), (ojala, tener), (tener, au..."


In [18]:
# Number of words to be shown per city
TOP = 5
# Compute the TOP words more used per city
df_wb['top_words'] = df_wb['text_words'].apply(lambda words: Counter(words).most_common(TOP))
df_wb['top_words'].head()

user_location
Alcobendas    [(pabloiglesias, 1), (gracias, 1), (esfuerzo, ...
Algeciras     [(noelia_n, 1), (magic, 1), (english, 1), (spa...
Aranjuez      [(pradobenjamin, 1), (ayuso, 1), (pone, 1), (b...
Avilés        [(policia, 1), (detenidos, 1), (seis, 1), (jov...
Barcelona     [(navarra, 2), (miirerme, 1), (ojala, 1), (ten...
Name: top_words, dtype: object

### 3.4 Compute the top used bigrams per location

In [19]:
TOP = 5
df_wb['text_bigrams'] = df_wb['text_bigrams'].apply(lambda bigrams: Counter(bigrams).most_common(TOP))
df_wb['text_bigrams'].head()

user_location
Alcobendas    [((pabloiglesias, gracias), 1), ((gracias, esf...
Algeciras     [((noelia_n, magic), 1), ((magic, english), 1)...
Aranjuez      [((pradobenjamin, ayuso), 1), ((ayuso, pone), ...
Avilés        [((policia, detenidos), 1), ((detenidos, seis)...
Barcelona     [((miirerme, ojala), 1), ((ojala, tener), 1), ...
Name: text_bigrams, dtype: object

### 3.5 Compute general stats

#### 3.5.1 Counter of tweets per city

In [20]:
users_per_location = df['user_location'].value_counts()
users_per_location.head()

Madrid       24
Avilés        2
Barcelona     2
Valencia      2
Sevilla       1
Name: user_location, dtype: int64

#### 3.5.2 Top used words without regarding the location

In [21]:
words_freq = dict()

for location, row in df_wb.iterrows():
    for word, ctr in row['top_words']:
        if word not in words_freq:
            words_freq[word] = ctr
        else:
            words_freq[word] += ctr

In [22]:
TOP = 10
words_freq = Counter(words_freq).most_common(TOP)
print(words_freq)

[('madrid', 15), ('dia', 6), ('30', 3), ('gobierno', 3), ('turistas', 3), ('va', 3), ('navarra', 2), ('cuesta', 2), ('ser', 2), ('paco', 2)]


#### 3.5.3 Top used bigrams without regarding the location

In [23]:
bigrams_freq = dict()

for location, row in df_wb.iterrows():
    for bigrams, ctr in row['text_bigrams']:
        if bigrams not in bigrams_freq:
            bigrams_freq[bigrams] = ctr
        else:
            bigrams_freq[bigrams] += ctr

In [24]:
TOP = 10
bigrams_freq = Counter(bigrams_freq).most_common(TOP)
print(bigrams_freq)

[(('rita_maestre', 'dia'), 2), (('dia', 'historico'), 2), (('historico', 'vallecas'), 2), (('vallecas', 'madrid'), 2), (('madrid', 'ganamos'), 2), (('va', 'dejar'), 2), (('dejar', 'pagar'), 2), (('pabloiglesias', 'gracias'), 1), (('gracias', 'esfuerzo'), 1), (('esfuerzo', 'madrid'), 1)]


### 3.6 Save the results in files

**Store the top words used per location:**

In [25]:
words_file = output_folder+'/words_location.txt'

with open(words_file, 'w') as f:
    for location, row in df_wb.iterrows():
        f.write(location+'\n')
        
        for word, ctr in row['top_words']:
            f.write(word+' '+str(ctr)+'\n')
        f.write('\n')

**Store the top bigrams used per location:**

In [26]:
bigrams_file = output_folder+'/bigrams_location.txt'

with open(bigrams_file, 'w') as f:
    for location, row in df_wb.iterrows():
        f.write(location +'\n')
        
        for bigram, ctr in row['text_bigrams']:
            f.write(bigram[0]+' '+bigram[1]+' '+str(ctr)+'\n')
        f.write('\n')

**Store the number of tweets per location:**

In [27]:
main_file = output_folder+'/counter_location.txt'

with open(main_file, 'w') as f:
    for location in users_per_location.index:
        f.write(location+' '+str(users_per_location[location])+'\n')

**Store the top used words without regarding the location:**

In [28]:
words_file = output_folder+'/words.txt'

with open(words_file, 'w') as f:
    for word, ctr in words_freq:
        f.write(word+' '+str(ctr)+'\n')

**Store the global top used bigrams without regarding the location:**

In [29]:
bigrams_file = output_folder+'/bigrams.txt'

with open(bigrams_file, 'w') as f:
    for bigram, ctr in bigrams_freq:
        f.write(bigram[0]+' '+bigram[1]+' '+str(ctr)+'\n')