<a href="https://colab.research.google.com/github/dieko95/blackouts-C4V/blob/diego-first-iter/twitter_pretagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training Dataset Creation - Tagging 

This notebook aims to create the dataset for Code for Venezuela's Blackout Project. 

This dataset is going to be consumed by an ML model that will aim to predict: 

- If a tweet is from Venezuela
- If so from which state(s) 
- Which public service the user is reporting on (e.g., #SinLuz for power outages)



## Libraries



In [None]:
import pandas as pd
import re

# For better visualization of text in Pandas DF
pd.set_option('display.max_colwidth', None)

## Helper functions

In [None]:
# Lower-case accented characters and their plain-ASCII replacements.
# The text is lower-cased before translation, so upper-case variants are covered too.
_ACCENT_TABLE = str.maketrans({
    "ú": "u", "ù": "u", "ü": "u",
    "ó": "o", "ò": "o",
    "í": "i", "ì": "i",
    "é": "e", "è": "e",
    "á": "a", "à": "a",
    "ñ": "gn",
})


def _clean_series(series):
    '''Return a cleaned copy of a string Series (steps are described in `cleaner`).'''
    return (
        series.str.lower()
        .str.translate(_ACCENT_TABLE)
        # `regex=True` is explicit: since pandas 2.0 the default is a literal match,
        # which would silently turn these two steps into no-ops.
        .str.replace(r"[.\-:,]", " ", regex=True)
        # NOTE(review): punctuation is stripped first, so a URL is already broken into
        # pieces here; this pattern drops everything from "http" to the end of the line.
        .str.replace(r"http.+", " ", regex=True)
    )


def cleaner(df,text_col, is_pandas_series = False):
    '''
    Helper function to do basic text cleaning operations.
    In order: convert text to lower case, replace Spanish accented characters
    (and "ñ" -> "gn"), replace the punctuation characters . - : , with a space,
    and replace everything from "http" to the end of the line with a space.
    -------------------------------------------------------------------------------------------
    PARAMS
        df: Dataframe or Pandas.Series object.
        text_col: String. Column to clean. Ignored when `is_pandas_series` is True.
        is_pandas_series: Boolean, Optional. If df is pandas.Series

    RETURNS
        If `is_pandas_series` is True: a new, cleaned Series.
        Otherwise: the same DataFrame object, with `text_col` overwritten in place.
    '''
    if is_pandas_series:
        return _clean_series(df)

    df[text_col] = _clean_series(df[text_col])
    return df

def text_matcher(text,list_to_compare,is_pd_series = False):
    '''
    Print every element of `list_to_compare` that occurs as a substring of `text`.
    -------------------------------------------------------------------------------------------
    PARAMS
        text: String to search in.
        list_to_compare: Iterable of strings to look for.
        is_pd_series: Boolean, Optional. When True, `text` is first run through
            `cleaner` (lower-casing, accent/punctuation/link removal) before matching.
            NOTE(review): despite the name, the value is wrapped in a one-element
            Series here, so a plain string is what this branch handles - confirm intent.

    RETURNS
        None. Matches are printed, one per line, in the order of `list_to_compare`.
    '''
    if is_pd_series:
        # Clean the argument itself and take the cleaned cell value, so that no
        # rendered header or index text can produce spurious matches.
        text = cleaner(pd.Series([text]), None, is_pandas_series=True).iloc[0]

    for candidate in list_to_compare:
        if candidate in text:
            print(candidate)

## Accessing Data

The untagged dataset comes from tweets scraped by Code for Venezuela's Angostura ETL. A subset of 11,000 tweets was queried from the ETL so that they could be tagged. The first 4,000 tweets have already been tagged. 

- 6578 Tweets without tags


In [None]:
# Read CSV from github 
# tagOriginalDf = pd.read_csv('https://raw.githubusercontent.com/dieko95/blackouts-C4V/diego-first-iter/tagging-set-original_for_jupyter_tagging.csv')
tagOriginalDf = pd.read_csv('tagging-set-original_for_jupyter_tagging.csv')

# Untagged rows are marked with the string '0'. Assign the result back instead of
# using `inplace=True` on an attribute access, which is not guaranteed to update the frame.
tagOriginalDf['label_country'] = tagOriginalDf['label_country'].fillna('0')
tagOriginalDf = cleaner(tagOriginalDf, 'label_country')

# Corresponding Section
# tags_df = tagOriginalDf.iloc[6001:8499,:].copy()
tags_df = tagOriginalDf.copy()

# Columns carried into both the tagged and the untagged working frames
TAG_COLUMNS = ['full_text','concat_text_user_description', 'label_country', 'label_state', 'label_type']

# Tagged Tweets
pre_tag_df = tags_df.loc[tags_df.label_country != '0', TAG_COLUMNS].copy()
# Collapse spelling variants of the same country into one label.
# These are regular expressions, so `regex=True` must be explicit on pandas >= 2.0.
pre_tag_df['label_country'] = pre_tag_df.label_country.str.replace(r'espa\w+', 'espagna', regex=True)
pre_tag_df['label_country'] = pre_tag_df.label_country.str.replace(r'arg\w+', 'argentina', regex=True)


# Tweets to tag
to_tag_df = tags_df.loc[tags_df.label_country == '0', TAG_COLUMNS].copy()

## Cleaning Text

This is a helper function to quickly clean text.

- Converts all text to lowercase. 
- Strips all Spanish accents

Pending:

- Strip dots and links (@ and # must remain) 

In [None]:


# Normalise the free-text columns that the tagging rules below search in.
# `cleaner` overwrites the given column on the frame it receives and returns that frame.
pre_tag_df = cleaner(pre_tag_df, 'full_text')

for _text_col in ('concat_text_user_description', 'full_text'):
    to_tag_df = cleaner(to_tag_df, _text_col)


# 3546

## Sections to Tag 

- Tag label_type (service reported)
  - Extracting pound signs (\#)

- Tag Country
  - Matches any state? 
  - has keyword 'edo' or 'estado' in it?
  - Follows any of the common accounts?
- Tag State
  - Match with list of venezuela states
  - We can use a list of venezuelan cities as well 

### Classifying Label Type

#### Hashtags

* \#SinLuz






In [None]:

# sinluz         670
# ahora           68
# sinagua         66
# apagon          65
# singasolina     53

# One entry per hashtag occurrence (without the leading '#').
# Extract per row instead of parsing `to_string()` output, whose rendering
# (and possible truncation of long tweets) depends on pandas display options.
hashtags = (
    to_tag_df['concat_text_user_description']
    .str.findall(r'#(\w+)')   # list of tags per tweet (NaN for missing text)
    .explode()                # one row per tag; tweets without tags become NaN
    .dropna()
    .reset_index(drop=True)
)

# hashtags.value_counts()[hashtags.value_counts() > 10]

# hashtags[hashtags == 'sinl']




### Tagging Country

  - Matches any state? 
  - has keyword 'edo' or 'estado' in it?
  - Follows any of the common accounts?

*Notes*
  - For this section I will use the tweet's original text. If I include the user description it can add noise because a user can be reporting about a power outage of another state (e.g., I'm from caracas and reporting a power outage in Zulia)
  
 - Menciones a cuentas: 

Generales

- @NicolasMaduro
- @DanteRivasQ
- @ReporteYa
- @Gob_Vargas
- @ReporteYa
- @FBritoMaestre
- @efectococuyo


Electricidad

- @CORPOELECinfo
- @CORPOELECgua_
- @corpoelecmerida
- @MPPAAguas
- @CorpoelecCar
- @ClimaMargarita

Agua 
- @hidrovenca
- @hidrocapitalca
- @MPPAAguas


  

#### Tagging non-venezuelan countries

<br>

- Before this subsection: 6,578 non-tagged Tweets
- After this subsection: 

###### New labeled countries  

<br>

These are the results after grouping non-venezuelan accounts and tagging accordingly

<br>

|     label_country    	| frequency 	|
|:--------------------:	|:---------:	|
|           0          	|    6112   	|
|       argentina      	|    410    	|
|        mexico        	|     25    	|
|       not sure       	|     24    	|
| republica domenicana 	|     7     	|

<br>

###### Notes:

- Most common accounts in non-venezuelan tweets

<br>

|     Account    	| total frequency 	|
|:--------------:	|:---------------:	|
|  oficialedesur 	|       340       	|
| edenorclientes 	|       180       	|
|    se_corto    	|        15       	|
| cortes_en_bsas 	|        13       	|
|    alferdez    	|        9        	|
|  todonoticias  	|        6        	|
|  mauriciomacri 	|        6        	|
|    enre_arg    	|        6        	|

In [None]:

# list containing tagged countries except venezuela (most frequent first)
non_venezuela_tags = (
    pre_tag_df.loc[pre_tag_df.label_country != 'venezuela', 'label_country']
    .value_counts()
    .index
    .tolist()
)


# Each element of non venezuelan tags (e.g. Argentina, Republica Domenicana, etc...)
for element in non_venezuela_tags:

    # Accounts mentioned (@ followed by at least 3 word characters) in the
    # already-tagged tweets of this country.
    mentions_per_tweet = (
        pre_tag_df.loc[pre_tag_df.label_country == element, 'full_text']
        .dropna()
        .str.findall(r'@(\w{3,})')
    )
    # De-duplicate while keeping first-seen order so the pattern stays small and stable
    unique_users = list(dict.fromkeys(
        user
        for users in mentions_per_tweet if isinstance(users, list)
        for user in users
    ))

    # Alternation pattern, e.g. 'oficialedesur|edenorclientes'.
    # Kept under the name `common_users` because a later cell reads it.
    common_users = '|'.join(unique_users)

    # No mentions for this country: nothing to propagate
    if not common_users:
        continue

    # Untagged tweets containing any of those account names (anywhere in the text)
    # receive this country's label. Countries later in the list overwrite earlier ones.
    has_common_user = to_tag_df['full_text'].str.contains(common_users, regex=True, na=False)
    to_tag_df.loc[has_common_user, 'label_country'] = element



### SANITY CHECK SECTION 
# re.findall('@(\w{3,})', # Find all users (characters that begin with a @)
#                          pre_tag_df.loc[pre_tag_df.label_country == 'not sure','full_text'].to_string())


# to_tag_df.loc[to_tag_df.label_country == '0','full_text'].str.contains('argentina').sum()

# pre_tag_df.loc[pre_tag_df.label_country == 'not sure','full_text'].str.contains('edesur')

#### Tagging countries that aren't Venezuela

<br>

- Frequently when venezuelan users refer to other countries they are not reporting a specific public service problem in Venezuela. 


| index 	| Text and User Description                                                                                                                                                                                                                                       	|
|-------	|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------	|
| 4540  	| el pais #singasolina por culpa de un gobierno ineficiente  inepto y corrupto que prefiere <br> seguir subsidiando a las mafias para que estas sigan desangrando al pais <br> llevandose la gasolina a colombia  el pais esta #singasolina por falta de gobierno 	|

<br>

| index 	| Text and User Description                                                                                                                                                                                                                                                                                                                                                                                                                         	|
|-------	|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------	|
| 5417  	| #singasolina \n @earisteguieta  "no hay gasolina suficiente para nuestro consumo dejamos de producirla y <br> no hay dinero para comprar asi termina el año y comenzara el proximo pero para cuba si hay <br> y gratis nos toca decidir si seguimos aceptando esto o nos hacemos respetar" <br> "sala de informacion" "saladeinfo" "agencia de noticias y comunicaciones integradas  productora de contenidos informativos  de opinion y analisis 	|


<br> 

**Exceptions**

1.  When streets are named as foreign countries (e.g. Avenida Mexico) or places named as foreign countries (see below).

<br>

| index 	| Text and User Description                                                                                                                                         	|
|-------	|-------------------------------------------------------------------------------------------------------------------------------------------------------------------	|
| 4215  	| #22nov #guayana alta vista  los olivos  castillito  ***villa colombia***  villa asia  villa alianza  ***villa brasil*** #sinluz "leonervis" "leonervis" "periodista📰 locutora 	|


<br>

2. When users report a public service problem and, in the same tweet, also complain about something else that mentions a foreign country (see below).

| index 	| Text and User Description                                                                                                                                                                                                       	|
|-------	|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------	|
| 5481  	|  #sinluz valencia el parral desde las 6 00 p m   pero tenemos dialogo en barbados que maravilla!!! "mario carrera" "1elprimero" "la unica cosa necesaria para que triunfe el mal   es que los hombres de bien no hagamos nada " 	|

<br>

###### Notes:

<br>

This type of tweet is nearly impossible to tag. There's no information about municipality, state, nor country. 

<br> 

| index 	| Text and User Description                                                                                                    	|
|-------	|------------------------------------------------------------------------------------------------------------------------------ |
| 5610  	| #sinluz terrazas del club hipico   calle bolivia 😭 "dani medina" "dampita02" "me dicen dampita  y me gustan los aguacates 🥑"|


<br> 

Tweets with the Venezuelan flag do not necessarily refer to power outages. Using "venezuela" or 🇻🇪 as a keyword is probably too general and can capture more noise than signal.  

<br>


| index 	| Text and User Description                                                                   	|
|-------	|---------------------------------------------------------------------------------------------	|
| 6357  	| ahora en https://t.co/h1paJ2DKHa \ntambien podras jugar desde chile las carreras de 🇻🇪 y cobrar en pesos chilenos!   |




In [None]:
## Read countries csv
# `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the single-column
# frame explicitly yields the same pd.Series.
countries = pd.read_csv('countries.csv').squeeze('columns')
countries = cleaner(countries,
        text_col=None,
        is_pandas_series=True)

# countries = countries.to_list()
# countries[0] = f'\s{countries[0]}'
# countries[-1] = f'{countries[-1]}\s'

# Tweets that are still untagged when this cell starts. The selection is taken once,
# so a tweet matching several countries ends up with the last matching one.
untagged_text = to_tag_df.loc[to_tag_df.label_country == '0', 'full_text']

for country in countries.dropna():
    # The country name must be surrounded by whitespace; the name is escaped so it is
    # always matched literally.
    # NOTE(review): a name at the very start or end of a tweet is not matched by
    # this rule - confirm whether that is intended.
    country_pattern = rf'\s{re.escape(country)}\s'

    mentions_country = untagged_text.str.contains(country_pattern, regex=True, na=False)
    matched_index = untagged_text.index[mentions_country.to_numpy()]
    to_tag_df.loc[matched_index, 'label_country'] = country

# Manual overrides: tweets that mention a foreign country name but were reviewed
# by hand and belong to Venezuela (e.g. streets or places named after countries).
to_tag_df.loc[[6649,4215,7567,5529,7714,7967,10191,5481,5610,6533,7628], 'label_country'] = 'venezuela'


# 4540, 8824 # General Complaint They are tagged as Colombia, I'm gonna leave them like that. 

# To Others
# Cuba 
# Colombia
# rusia
# libia
# ecuador
# To Argentina
# Mauricio 
# brasil

## Sanity check 

# to_tag_df.label_country.value_counts()
### RESULTS
# # 0                       5992
# argentina                406
# mexico                    26
# uruguay                   25
# not sure                  24
# chile                     13
# venezuela                 11
# colombia                  10
# mauricio                   7
# republica domenicana       7
# paraguay                   6
# cuba                       6
# rusia                      6
# china                      5
# israel                     4
# suiza                      4
# peru                       3
# italia                     3
# canada                     2
# espagna                    2
# angola                     2
# paises bajos               1
# belice                     1
# honduras                   1
# reino unido                1
# japon                      1
# ecuador                    1
# libia                      1
# iran                       1
# nicaragua                  1
# bolivia                    1
# egipto                     1
# barbados                   1
# zimbabue                   1
# brasil                     1

In [None]:
# Read the spreadsheet with Venezuela's administrative distribution
geo_df = pd.read_excel('../cod.xlsx').fillna('NULL')


# Clean Columns
cols = ['parroquia', 'nombrepob', 'estado','nom_mun']

for col in cols:
  geo_df = cleaner(geo_df, col)

# # Unique pob name
#   # CHECK ENCODING ERROR WITH JESUS  carupano
# # geo_df.nombrepob.unique()


# State names used by this cell and by the state-tagging cell further down.
# Defined here so the notebook runs top to bottom on a fresh kernel.
# 'null' is the lower-cased 'NULL' placeholder written by fillna above, not a state.
state = [name for name in geo_df.estado.unique().tolist() if name != 'null']

# Which indices contain any state 
indices = to_tag_df.full_text.str.contains('|'.join(re.escape(name) for name in state),
                                              case = False,
                                              na = False) # 6754

def countryTagger(dataframe,text_column,target_column=None,states=None):
    '''
    Flag the rows whose text mentions a Venezuelan state or a known Venezuelan account.
    A token only counts when it is delimited by whitespace or by the start/end of the
    text. Matching is case-insensitive.
    -------------------------------------------------------------------------------------------
    PARAMS
        dataframe: Dataframe holding the tweets.
        text_column: String. Column to search in.
        target_column: Unused; kept so existing calls keep working.
        states: List of strings, Optional. State names to look for. When omitted they
            are read from the notebook-level `geo_df.estado` column.

    RETURNS
        Boolean pandas.Series aligned with `dataframe`; rows with missing text are False.
    '''
    if states is None:
        # 'null' is the lower-cased placeholder used for missing values in geo_df
        states = [name for name in geo_df.estado.unique().tolist() if name != 'null']

    # NOTE(review): "@NicolasMaduro" was spelled "@NicolsMaduro" before; assumed to be
    # a typo of the real handle - confirm.
    common_vzla_accounts = ["@NicolasMaduro","@DanteRivasQ",
                            "@ReporteYa","@Gob_Vargas",
                            "@FBritoMaestre",
                            "@CORPOELECinfo","@CORPOELECgua_",
                            "@corpoelecmerida","@MPPAAguas",
                            "@CorpoelecCar","@ClimaMargarita",
                            "@hidrovenca","@hidrocapitalca",
                            "@efectococuyo"
                           ]

    # Sorted + de-duplicated so the generated pattern is deterministic
    all_tokens = sorted(set(list(states) + common_vzla_accounts))

    # Every alternative gets the same delimiters: no non-space character directly
    # before or after the token.
    alternatives = '|'.join(re.escape(token) for token in all_tokens)
    vzla_locations = rf'(?<!\S)(?:{alternatives})(?!\S)'

    _indices = dataframe[text_column].str.contains(vzla_locations,
                                                   case = False,
                                                   regex = True,
                                                   na = False)

    return _indices

# Testing
######################################
# idx=countryTagger(to_tag_df, 'full_text')
# idx2 = to_tag_df.loc[idx,'full_text'].str.contains('edesur').tolist()

# to_tag_df.loc[idx,'full_text'][idx2]

# Which indices contain any municipality
# indices = to_tag_df.full_text.str.contains('|'.join(set(muni)), 
#                                               case = False)
###########################3

# Which indices contain "edo"
# indices = to_tag_df.full_text.str.contains('\sedo', case = False)

## View results
# to_tag_df[indices]

## Tag Venezuela 
# to_tag_df.loc[indices,'label_country'] = 'venezuela'



## Quality Control 





        # If there's more than one element in state then it's multiple


# Manual spot check: print every non-Venezuelan country label that appears as a
# substring of this sample tweet.
# NOTE(review): the tweet is passed raw (mixed case, accents) while the labels in
# `non_venezuela_tags` were lower-cased by `cleaner`; confirm whether
# `is_pd_series=True` (which cleans the text first) was intended here.
text_matcher('ACTUALIZADO: Usuarios sin suministro eléctrico: @OficialEdesur @EdenorClientes #SinLuz #SeCorto https://t.co/jSuw4Lz0ng',non_venezuela_tags)


#### Municipality

In [None]:

## There's a municipality calledfull_text
  # Democracy 

# indices = to_tag_df.concat_text_user_description.str.contains('|'.join(muni), case = False) # 6754

# to_tag_df[indices]

In [None]:
# 653 tweets with hashtags Out of 2228 tweets



# Rows whose tweet text + user description contain the word "venezuela".
# `na=False` keeps the mask strictly boolean, so rows with missing text are
# excluded instead of making the boolean indexing below fail.
indices = to_tag_df.concat_text_user_description.str.contains('venezuela', case = False, na = False) # 6754

to_tag_df[indices]

## Tagging State


Incluimos cuentas que reportan a nivel nacional? Es ruido porque lo que hacen es repetir lo que otros usuarios dicen? O captura señal porque son reportes de fallas de luz?

~~~
print(tags_df.loc[8122,'full_text'])

#Ahora Reportan más zonas #SinLuz: 

Catia, Distrito Capital ❌💡
Guatire y Guarenas, Edo. Miranda ❌💡
Estado Mérida ❌💡
Estado Aragua ❌💡

Comenta si hay fallas en tu zona #2Oct

~~~

In [None]:

# State names to look for.
# NOTE(review): assumes the `state` list used here before was
# geo_df.estado.unique().tolist(); it is rebuilt locally so this cell does not
# depend on a leftover kernel variable. 'null' is the missing-value placeholder.
state_names = [name for name in geo_df.estado.unique().tolist() if name != 'null']

_multiple = []   # indices of tweets mentioning more than one state
_single = []     # indices of tweets mentioning exactly one state

venezuela_texts = to_tag_df.loc[to_tag_df.label_country == 'venezuela', 'full_text'].dropna()

for index, row in venezuela_texts.items():
  # Count the matching states first and classify afterwards, so each tweet lands
  # in exactly one list and is listed only once.
  n_states = sum(state_name in row for state_name in state_names)

  if n_states == 1:
    _single.append(index)
  elif n_states > 1:
    _multiple.append(index)

print(_single)

# tst = to_tag_df.loc[8122,'full_text']
# tst = cleaner(pd.DataFrame({'col':[tst]}), 'col').to_string()
# # If there's more than one element in state then it's multiple
# for i in state:
#   if i in tst:
#     print(i)
#   else:
#     next


In [None]:
from nltk import word_tokenize 
from nltk.util import ngrams
import nltk

# import nltk 
# nltk.download('punkt')

# One list of n-grams per tweet tagged as Venezuela, in the same row order as the
# selection below (a later cell zips this list with that selection's index).
# NOTE(review): the variable is called `bigram` but n=3, i.e. these are trigrams -
# confirm which one is intended.
# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
bigram = [
    list(ngrams(nltk.word_tokenize(row), 3))
    for _, row in to_tag_df.loc[to_tag_df.label_country == 'venezuela', 'full_text'].items()
]

#     print(index)
#     print(bigram)
    

    



In [None]:
# to_tag_df.loc[6378,'full_text']

# Map each Venezuela-tagged tweet index to the n-grams computed for it in the previous cell.
_vzla_index = to_tag_df.loc[to_tag_df.label_country == 'venezuela', 'full_text'].index.tolist()
bigram_dict = {tweet_index: grams for tweet_index, grams in zip(_vzla_index, bigram)}

# Print every n-gram that contains both the word 'municipio' and the word 'miranda'
_wanted_words = {'municipio', 'miranda'}
for tweet_index, grams in bigram_dict.items():
    for gram in grams:
        if _wanted_words <= set(gram):
            print(gram)