<a href="https://colab.research.google.com/github/dieko95/blackouts-C4V/blob/diego-first-iter/twitter_pretagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automated Tagging

## Libraries



In [0]:
from google.colab import auth
import gspread
from oauth2client.client import GoogleCredentials
import pandas as pd

## Accessing Data

In [0]:
## Installing package to access
!pip install --upgrade -q google-auth
!pip install --upgrade -q gspread

# For better visualization of text in Pandas DF
pd.set_option('display.max_colwidth', None)

In [0]:
# Authentication to access the untagged spreadhseet
auth.authenticate_user()

# Authenticating to open google sheets
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Open File's sheet 1
worksheet = gc.open('Training Set - #SinLuz Country Classifier').sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()

In [0]:
# Convert to a DataFrame and render.

tagOriginalDf = pd.DataFrame(rows)

# First row has the column names
  # Rename columns
tagOriginalDf.columns = tagOriginalDf.iloc[0,:]

# Drop Columns
tagOriginalDf = tagOriginalDf.drop(index = 0)

In [5]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Corresponding Section
tags_df = tagOriginalDf.iloc[6001:8499,:].copy()

# Tagged Tweets
pre_tag_df = tags_df[['full_text','concat_text_user_description', 'label_country', 'label_state', 'label_type']].iloc[:271,:].copy()

# Tweets to tag
to_tag_df = tags_df[['full_text','concat_text_user_description', 'label_country', 'label_state', 'label_type']].iloc[271:,:].copy()

## Cleaning Text

In [0]:
def cleaner(df,text_col):
  # to lower

  df[text_col] = df[text_col].str.lower()

  # Convert common spanish accents

  df[text_col] = df[text_col].str.replace("ú", "u")
  df[text_col] = df[text_col].str.replace("ù", "u")
  df[text_col] = df[text_col].str.replace("ü", "u")
  df[text_col] = df[text_col].str.replace("ó", "o")
  df[text_col] = df[text_col].str.replace("ò", "o")
  df[text_col] = df[text_col].str.replace("í", "i")
  df[text_col] = df[text_col].str.replace("ì", "i")
  df[text_col] = df[text_col].str.replace("é", "e")
  df[text_col] = df[text_col].str.replace("è", "e")
  df[text_col] = df[text_col].str.replace("á", "a")
  df[text_col] = df[text_col].str.replace("à", "a")
  df[text_col] = df[text_col].str.replace("ñ", "gn")

  return df


to_tag_df = cleaner(to_tag_df, 'concat_text_user_description')
to_tag_df = cleaner(to_tag_df, 'full_text')

## Functions 

- Classify label_type (service reported)
  - Extracting pound signs (\#)
- Classify Country
  - Matches any state? 
  - has keyword 'edo' or 'estado' in it?
  - Follows any of the common accounts?
- Classify State
  - Match with list of venezuela states
  - We can use a list of venezuelan cities as well 

### Classifying Label Type

#### Hashtags

* \#SinLuz






In [0]:
import re

# sinluz         670
# ahora           68
# sinagua         66
# apagon          65
# singasolina     53

hashtags = pd.Series(re.findall('#(\w+)', to_tag_df.concat_text_user_description.to_string())).copy()

# hashtags.value_counts()[hashtags.value_counts() > 10]

# hashtags[hashtags == 'sinl']



### Tagging Country

  - Matches any state? 
  - has keyword 'edo' or 'estado' in it?
  - Follows any of the common accounts?

*Notes*
  - For this section I will use the tweet's original text. If I include the user description it can add noise because a user can be reporting about a power outage of another state (e.g., I'm from caracas and reporting a power outage in Zulia)
  

In [61]:
# Read csv with Venezuela's administrative distribution
geo_df = pd.read_csv('/content/drive/My Drive/monitor_ciudad/cod.csv')

# Clean Columns
cols = ['parroquia', 'nombrepob', 'estado','nom_mun']

for col in cols:
  geo_df = cleaner(geo_df, col)

# # Unique pob name
#   # CHECK ENCODING ERROR WITH JESUS  car�pano
# # geo_df.nombrepob.unique()

state = geo_df.estado.unique().tolist()

muni = geo_df.nom_mun.unique().tolist()


indices = to_tag_df.concat_text_user_description.str.contains('|'.join(state), case = False) # 6754

indices = to_tag_df.concat_text_user_description.str.contains('\sedo', case = False)

## View results
to_tag_df[indices]

## Tag Venezuela 
to_tag_df.loc[indices,'label_country'] = 'venezuela'

_multiple = []
for index,row in to_tag_df.loc[to_tag_df.label_country == 'venezuela', 'full_text'].iteritems():
  _lst = []

  for state_name in state:
    if state_name in row:
      _lst.append(1)
      if len(_lst) > 1:
        _multiple.append(index)
      else:
        next
      # print(len(_lst), '------', index)
    else:
      next

print(_multiple)


# Quality Control 



# tst = 'Todavía #SinLuz y son las 6:30  pm nueve horas sin luz en San Cristóbal  estado tachira sector los #kioskos #SinLuz vía seguro social exigimos el servicio ya dejen la burla y el cinismo al pueblo hijos de putas @CORPOELECinfo @corpoelectachir @NicolasMaduro @dcabellor "Luis Suarez" "LuisSua45172692" "soy abogado egresado de la universidad Javeriana de Bogotá Colombia  especialista en derecho internacional"'
# tst = cleaner(pd.DataFrame({'col':[tst]}), 'col').to_string()
# # If there's more than one element in state then it's multiple
# for i in state:
  # if i in tst:
  #   print(i)
  # else:
  #   next




[7724, 7791, 8011, 8122, 8122, 8122, 8153]


#### Municipality

In [0]:

## There's a municipality called
  # Democracy 

# indices = to_tag_df.concat_text_user_description.str.contains('|'.join(muni), case = False) # 6754

# to_tag_df[indices]

In [0]:
# 653 tweets with hashtags Out of 2228 tweets



indices = to_tag_df.concat_text_user_description.str.contains('venezuela', case = False) # 6754

to_tag_df[indices]