In [None]:
import pandas as pd
import unidecode
import requests
from bs4 import BeautifulSoup
import multiprocessing as mp
import numpy as np
import swifter


# Reading data and normalizing columns

In [None]:
# Load the raw OVSP spreadsheet (path is relative to this notebook's folder).
df = pd.read_excel("../../data/raw/ovsp_bdd_octubre.xlsx")

# Normalize the headers: trim surrounding whitespace, lowercase, and turn
# spaces into underscores, so columns can be reached as attributes
# (e.g. `df.link_de_la_noticia`).
column_names_normalized = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)
df.columns = column_names_normalized

# Quick and Dirty EDA: Confirming that I'm subsetting the correct number of elpitazo news

In [None]:

# Subset the rows whose link points to El Pitazo.
#     Result at the time of writing: 2483 El Pitazo links
# `regex=False` makes this a literal substring test; with the default regex
# mode the "." in the pattern would match any character, not just a dot.
# `na=False` treats rows without a link as non-matches.
pitazo_mask = df.link_de_la_noticia.str.contains(
    "https://elpitazo.net", na=False, regex=False
)
df_elpitazo = df[pitazo_mask]
# Sanity check of the domains that were selected:
# df.link_de_la_noticia[pitazo_mask].str.split("/", expand = True)[2].value_counts()

# Elpitazo webscraper

Common classes from `elpitazo`:

- Título de la noticia: `tdb-title-text`
- Información general de la noticia: `tdb-title-text`
- Texto completo de la noticia: `class="tdb-block-inner td-fix-index"`, `id="bsf_rt_marker"`

## Testing with different classes

In [None]:
# Testing different classes on a single article.
# The easiest way to extract the news content is to extract all p tags and then clean them.

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
}

URL = "https://elpitazo.net/los-llanos/vecinos-de-varias-comunidades-de-acarigua-tienen-siete-anos-sin-agua/"
page = requests.get(URL, headers=headers, timeout=20)

soup = BeautifulSoup(page.content, "html.parser")

# This selector works well to find the news article title. It is stored in a
# variable because only the last expression of a cell is displayed.
title_tags = soup.find_all("h1", {"class": "tdb-title-text"})
# Other selectors that were tried:
# # soup.find_all("div", {"class": "tdb-block-inner td-fix-index"})
# soup.find_all("p", {"class": "tdb-block-inner td-fix-index"})

# Paragraphs carrying any of these classes are left out of the article text.
excluded_classes = {'contacto-datos', 'text-white', '__cf_email__'}

ls = []

tags_p = soup.find_all("p")
for tag in tags_p:
    # `class` is a multi-valued attribute: it may be missing (None) or parse to
    # an empty list, and the order of its values carries no meaning. Test every
    # class instead of indexing the first one, which could raise IndexError.
    tag_classes = tag.get("class") or []
    if not excluded_classes.isdisjoint(tag_classes):
        continue

    ls.append(tag.get_text())

' '.join(ls)


### Webscraper first pass

TODO: Use multiprocessing instead of `pandas.apply`. There are too many links to scrape them sequentially.

In [None]:
def webscraper_elpitazo(url: str, timeout: float = 20) -> str:
    """Download an El Pitazo article and return the text of its paragraphs.

    The page is fetched with a browser-like User-Agent, every ``<p>`` tag is
    collected, and paragraphs carrying one of the excluded CSS classes are
    dropped. The remaining paragraph texts are joined with single spaces.

    Parameters
    ----------
    url : str
        Address of the article to scrape.
    timeout : float, default 20
        Seconds passed to ``requests.get`` as its ``timeout`` argument.

    Returns
    -------
    str
        The concatenated paragraph text (empty string if no paragraph is kept).

    Notes
    -----
    Exceptions raised by ``requests.get`` (for example on a timeout) propagate
    to the caller. The HTTP status code is not checked, so an error page is
    parsed like any other page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    }
    # Paragraphs with any of these classes are skipped.
    # NOTE(review): presumably contact/footer boilerplate rather than article
    # text -- confirm against the site's markup.
    excluded_classes = {'contacto-datos', 'text-white', '__cf_email__'}

    page = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    # `class` is a multi-valued attribute: it may be missing (None) or parse to
    # an empty list, and the order of its values carries no meaning. Testing
    # every class avoids the IndexError that indexing `[0]` could raise and
    # also catches an excluded class that is not listed first.
    paragraphs = [
        tag.get_text()
        for tag in soup.find_all("p")
        if excluded_classes.isdisjoint(tag.get("class") or [])
    ]

    return ' '.join(paragraphs)

In [None]:
# Smoke test on one article URL. The returned text is neither stored nor
# displayed, because it is not the last expression of the cell.
URL = "https://elpitazo.net/los-llanos/el-gas-domestico-en-acarigua-araure-cuesta-entre-10-y-20-dolares/"
webscraper_elpitazo(url=URL)

# Scrape the first rows of the El Pitazo subset; the result is a pd.Series of
# article text that keeps the index of the original rows.
elpitazo_text = df_elpitazo["link_de_la_noticia"].head().apply(webscraper_elpitazo)

## Multiprocessing implementation

In [None]:
# Parallel processing.

# First try (kept for reference): didn't work.
# NOTE(review): p.map hands each np.array_split chunk (a Series of links) to
# `func`, whereas webscraper_elpitazo takes a single URL string and returns a
# str -- likely why this failed; confirm before reviving it.
# def parallelize_dataframe(df, func):
#     num_processes = mp.cpu_count()

#     df_split = np.array_split(df, num_processes)

#     with mp.Pool(num_processes) as p:
#         df = pd.concat(p.map(func, df_split))
#     return df

# parallelize_dataframe(df_elpitazo.link_de_la_noticia.head(),webscraper_elpitazo )

# Second try, with the swifter library: THIS WORKED, however there must be a
# faster solution. Scrapes every El Pitazo link, one request per row.
elpitazo_textfull = df_elpitazo["link_de_la_noticia"].swifter.apply(webscraper_elpitazo)

In [None]:
# Persist the scraped text on its own (indexed like the rows of the original df).
elpitazo_textfull.to_csv("../../data/interim/webscraping/links_elpitazo_03132021.csv")

# Left-join the scraped text onto the full dataset by index. The scraped Series
# shares its name with the link column, so it lands in
# `link_de_la_noticia_fulltext`; rows that were not scraped get NaN there.
df_elpitazo_fulltext = df.join(elpitazo_textfull, how="left", rsuffix="_fulltext")

# Export link + scraped text side by side to check the quality of the web scraper.
quality_check_cols = ["link_de_la_noticia", "link_de_la_noticia_fulltext"]
df_elpitazo_fulltext[quality_check_cols].to_csv(
    "../../data/interim/webscraping/fulltext_links_elpitazo_03132021.csv"
)

# Drop the empty columns that came from reading the Excel file (their names
# contain "unnamed" after the header normalization), then save the final table.
is_unnamed = df_elpitazo_fulltext.columns.str.contains("unnamed")
unnamed_cols = df_elpitazo_fulltext.columns[is_unnamed]
df_elpitazo_fulltext = df_elpitazo_fulltext.drop(columns=unnamed_cols)
df_elpitazo_fulltext.to_csv("../../data/processed/webscraping/ovsp_bdd_elpitazotext.csv")