In [None]:
# NOTE: You must include swifter in your lock file (multiprocessing is part of the Python standard library and needs no entry). The lock file won't be merged to master
import pandas as pd
import unidecode
import requests
from bs4 import BeautifulSoup
import multiprocessing as mp
import numpy as np
import swifter


# Reading data and normalizing columns

In [None]:
# Load the raw OVSP workbook, then normalise the headers: surrounding
# whitespace removed, lower-cased, and spaces replaced by underscores.
df = pd.read_excel("../../data/raw/ovsp_bdd_octubre.xlsx")
column_names_normalized = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)
df.columns = column_names_normalized

# Quick and Dirty EDA: Confirming that I'm subsetting the correct number of elpitazo news

There are 37 duplicated `el pitazo` rows

In [None]:

# Keep only the rows whose link points at elpitazo.net.
# Result recorded by the author: 2483 El Pitazo links.
# regex=False makes this a literal substring match; with the default
# (regex=True) every "." in the URL prefix would match any character.
# na=False treats rows without a link as non-matches.
pitazo_mask = df.link_de_la_noticia.str.contains(
    "https://elpitazo.net", na=False, regex=False
)
df_elpitazo = df[pitazo_mask]
# To inspect the domains actually selected, uncomment:
# df.link_de_la_noticia[pitazo_mask].str.split("/", expand = True)[2].value_counts()

In [None]:
# Flag rows that are exact copies of an earlier row (all columns compared,
# first occurrence left unflagged).
mask_duplicated = df_elpitazo.duplicated(subset=None, keep="first")

# Uncomment to look at the duplicated rows themselves:
# df_elpitazo[mask_duplicated].sort_values("link_de_la_noticia")

# Number of duplicated rows (shown as the cell output).
mask_duplicated.sum()

# Elpitazo webscraper

Common classes from `elpitazo`:

- Título de la noticia: `tdb-title-text`
- Info general de la noticia: `tdb-title-text`
- Texto completo de la noticia: `class="tdb-block-inner td-fix-index"`, `id="bsf_rt_marker"`

## Testing with different classes

In [None]:
# Exploring which selectors expose the article fields on one sample page.
# The easiest way to extract the news content is to extract all <p> tags and
# then clean them; that approach is implemented in `webscraper_elpitazo`.

headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    }

URL = "https://elpitazo.net/los-llanos/vecinos-de-varias-comunidades-de-acarigua-tienen-siete-anos-sin-agua/"
page = requests.get(URL, headers=headers, timeout=20)

soup = BeautifulSoup(page.content, "html.parser")

soup.find_all("h1", {"class": "tdb-title-text"}) # This snippet works well to find the news article title
# Alternatives that were tried for the article body:
# soup.find_all("div", {"class": "tdb-block-inner td-fix-index"})
# soup.find_all("p", {"class": "tdb-block-inner td-fix-index"})

# BeautifulSoup has no `get_element_text` method and no `response` object
# exists in this notebook, so the fields are read with CSS selectors instead.
# `select_one` returns None when nothing matches; fall back to "" in that case.
title_node = soup.select_one(".tdb-title-text")
date_node = soup.select_one(".entry-date")
author_node = soup.select_one(".tdb-author-name")

title = title_node.get_text() if title_node is not None else ""
date = date_node.get_text() if date_node is not None else ""
author = author_node.get_text() if author_node is not None else ""

# Show the extracted fields as the cell output.
title, date, author


### Webscraper first pass

TODO: Use multiprocessing instead of pandas.apply. There are too many links to do it in a sequential fashion 

In [None]:
def webscraper_elpitazo(url:str):
    """Download one El Pitazo article and extract its main fields.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.

    Returns
    -------
    dict
        Keys ``url``, ``title``, ``date``, ``author``, ``tags``, ``category``
        and ``text``. A field whose HTML element is not found on the page is
        returned as an empty string. ``tags`` is a comma-separated string and
        ``text`` is the space-joined text of the page's ``<p>`` elements.

    Notes
    -----
    Errors raised by ``requests.get`` (for example a timeout after the
    20-second limit) are not caught here and propagate to the caller. The
    HTTP status code is not checked.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    }
    # <p> elements whose first CSS class is one of these are left out of the text.
    skipped_classes = ('contacto-datos', 'text-white', '__cf_email__')

    page = requests.get(url, headers=headers, timeout=20)
    soup = BeautifulSoup(page.content, "html.parser")

    def _text_of(tag_name, css_class):
        """Return the text of the first matching element, or "" if there is none."""
        node = soup.find(tag_name, {"class": css_class})
        return "" if node is None else node.get_text()

    title = _text_of("h1", "tdb-title-text")
    date = _text_of("time", "entry-date")
    author = _text_of("a", "tdb-author-name")
    # Category. E.g., los llanos
    category = _text_of("a", "tdb-entry-category")

    # Tags. E.g., Acarigua, falta de agua, portuguesa
    tag_list = soup.find("ul", {"class": "tdb-tags"})
    etiquetas = "" if tag_list is None else ", ".join(tag_list.stripped_strings)

    # News text: every <p> on the page except the skipped classes.
    paragraphs = []
    for paragraph in soup.find_all("p"):
        # An attribute such as class="" parses to an empty list, so guard the
        # [0] lookup instead of indexing unconditionally.
        css_classes = paragraph.get("class") or []
        if css_classes and css_classes[0] in skipped_classes:
            continue
        paragraphs.append(paragraph.get_text())

    text = ' '.join(paragraphs)

    return {
        "url": url,
        "title": title,
        "date": date,
        "author": author,
        "tags": etiquetas,
        "category": category,
        "text": text,
    }

In [None]:
# Smoke test on one known article.
URL = "https://elpitazo.net/los-llanos/el-gas-domestico-en-acarigua-araure-cuesta-entre-10-y-20-dolares/"
webscraper_elpitazo(url=URL)

# Then run the scraper over the first five tagged links.
elpitazo_text = df_elpitazo["link_de_la_noticia"].head().apply(webscraper_elpitazo)

## Multiprocessing implementation

In [None]:
# Parallel processing.
#
# First try (kept for reference): did not work.
# NOTE(review): `p.map(func, df_split)` hands each chunk of links to `func`,
# while `webscraper_elpitazo` expects a single URL string - presumably the
# reason it failed; confirm before retrying this approach.
#
# def parallelize_dataframe(df, func):
#     num_processes = mp.cpu_count()
#     df_split = np.array_split(df, num_processes)
#     with mp.Pool(num_processes) as p:
#         df = pd.concat(p.map(func, df_split))
#     return df
#
# parallelize_dataframe(df_elpitazo.link_de_la_noticia.head(),webscraper_elpitazo )

# Second try, with the swifter library: this worked, although there must be a
# faster solution. According to the author's notes swifter ends up using a
# normal pandas apply here, and the full run takes around 30 minutes.
elpitazo_textfull = df_elpitazo["link_de_la_noticia"].swifter.apply(webscraper_elpitazo)

In [None]:
# Each element of `elpitazo_textfull` is the dict returned by the scraper;
# turn that list of records into a table with one column per key.
elpitazo_webscraped_df = pd.DataFrame(list(elpitazo_textfull))

In [None]:
# Save webscraped text

# Raw per-link results (a Series of dicts) are saved as well, just in case.
elpitazo_textfull.to_csv("../../data/interim/webscraping/json_links_elpitazo_04242021.csv") # I'm saving the json file just in case
elpitazo_webscraped_df.to_csv("../../data/interim/webscraping/links_elpitazo_04242021.csv")

# Merge webscraped data with tagged dataframe
#   In the original tagged data the URLs are not unique, because one URL may
#   carry several event types (falta de servicio, protesta). The scraper ran
#   once per tagged row, so the scraped table repeats those URLs too. Joining
#   the two as-is would be a many-to-many merge that multiplies rows, so keep
#   a single scraped record per URL before joining.
elpitazo_webscraped_unique = elpitazo_webscraped_df.drop_duplicates(subset="url")
df_elpitazo_webscrape_join_tags = pd.merge(
    df_elpitazo,
    elpitazo_webscraped_unique,
    left_on="link_de_la_noticia",
    right_on="url",
    how="left",
    suffixes=("_original", "_scraped"),
    validate="many_to_one",  # fail loudly if the scraped side still repeats a URL
)

# Drop rows that are exact duplicates (these come from the tagged data itself).
df_elpitazo_webscrape_join_tags_nodups = df_elpitazo_webscrape_join_tags.drop_duplicates()
# A bare f-string in the middle of a cell is never displayed, so print it.
print(f"Total amount of duplicates after join: {df_elpitazo_webscrape_join_tags.shape[0] - df_elpitazo_webscrape_join_tags_nodups.shape[0]} records")

df_elpitazo_webscrape_join_tags_nodups.to_csv("../../data/processed/webscraping/elpitazo_positivelabels_devdataset.csv",index = False)

# Legacy export steps from an earlier version of this notebook, kept for
# reference only (`df_elpitazo_fulltext` is not defined in this notebook):
#
# # Save link noticia and full text to check the quality of the web scraper
# df_elpitazo_fulltext[["link_de_la_noticia","link_de_la_noticia_fulltext"]].to_csv("../../data/interim/webscraping/fulltext_links_elpitazo_03132021.csv")
#
# # Remove columns that were empty (this happened because I read the data from excel)
# unnamed_cols = df_elpitazo_fulltext.columns[df_elpitazo_fulltext.columns.str.contains("unnamed")]
# df_elpitazo_fulltext = df_elpitazo_fulltext.drop(unnamed_cols, axis = 1)
# df_elpitazo_fulltext.to_csv("../../data/processed/webscraping/ovsp_bdd_elpitazotext.csv")