# Python Web Scraping
https://www.analyticsvidhya.com/blog/2015/10/beginner-guide-web-scraping-beautiful-soup-python/?utm_source=blog&utm_medium=5-popular-python-libraries-web-scraping

Web scraping is a software technique for extracting information from websites. It mostly focuses on transforming unstructured data on the web (HTML) into structured data (a database or a spreadsheet).

In [1]:
#import the library used to query a website
import urllib.request

In [2]:
#!pip install beautifulsoup4
#https://www.crummy.com/software/BeautifulSoup/bs4/doc/
#import the Beautiful soup functions to parse the data returned from the website
from bs4 import BeautifulSoup

In [None]:
import re

In [109]:
from datetime import datetime

In [147]:
import numpy as np
import pandas as pd
import warnings
# Silence every warning category so the notebook output stays uncluttered.
warnings.filterwarnings("ignore", category=Warning)

# Pandas display settings to make DataFrames easier to read.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html
# For max_columns and max_rows a value of None means unlimited.
_display_options = {
    'display.max_rows': 100,
    # min_rows is worth setting too: once max_rows is exceeded, pandas only
    # shows the number of rows given by min_rows.
    'display.min_rows': 10,
    'display.max_columns': 500,
    'display.width': 1000,
    'display.max_colwidth': 100,
    # Render floats with three decimal places.
    'display.float_format': '{:.3f}'.format,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [1]:
import os
# Show the directory the notebook kernel is currently running in.
cwd = os.getcwd()
print('getcwd:      ', cwd)

getcwd:       C:\Users\COLMO\OneDrive\0-Data Science\data_science\datascience\web_scrapping


In [45]:
# Specify the URL of the page to scrape.
url = "https://www.enargas.gob.ar/secciones/transporte-y-distribucion/despacho-diario-indicador-itm1-demanda-prioritaria.php"
# Query the website and store the raw HTML (bytes) in the variable 'page'.
# The context manager closes the HTTP connection as soon as the body has been
# read, and the timeout keeps the cell from hanging forever on a stalled server.
with urllib.request.urlopen(url, timeout=30) as response:
    page = response.read()

In [46]:
# Parse the HTML in the 'page' variable and store it in Beautiful Soup format.
# The parser is named explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and warns about it), so the resulting tree
# could differ from one machine to another.
soup = BeautifulSoup(page, 'html.parser')
# Prints the structure of the HTML (it is very long).
#print(soup.prettify())

### Tables
Extract the information into a DataFrame: here we need to iterate through each row (`<tr>`), assign each cell of the row (`<td>`) to a variable and append it to a list. Let's first look at the HTML structure of the table (I am not going to extract information from the table heading `<th>`).
    
Table HTML 101:
    <table>
        <thead>
            <tr> --> table row (in this case, the header row)
                <th> header column 1 </th>
            </tr>
        </thead>
        <tbody>
            <tr> --> table row (a data row)
                <td> data cell 1 </td>
            </tr>
        </tbody>
    </table>
            

In [47]:
# Identify the right table through its CSS classes.
# A CSS selector matches the classes regardless of their order in the HTML,
# whereas find(class_='a b c') only matches that exact attribute string.
# In Chrome you can check the class names by right-clicking the table on the
# web page -> Inspect element, or by going through the output of
# soup.find_all('table') and looking for the classes of the right table.
right_table = soup.select_one('table.table.tablesaw.tablesaw-stack.tablesaw-row-zebra')
right_table

<table class="table tablesaw tablesaw-stack tablesaw-row-zebra" data-tablesaw-mode="stack">
<thead>
<tr>
<th class="col-xs-3" style="width:1%"> </th>
<th class="col-xs-3" style="width:29%">Licenciataria</th>
<th class="col-xs-3" style="width:14%">23/06/22<sup>(1)</sup></th>
<th bgcolor="999999" class="col-xs-3" style="width:14%">24/06/22<sup>(2)</sup></th>
<th class="col-xs-3" style="width:14%">25/06/22<sup>(2)</sup></th>
<th class="col-xs-3" style="width:14%">26/06/22<sup>(2)</sup></th>
<th class="col-xs-3" style="width:14%">27/06/22<sup>(2)</sup></th>
</tr>
</thead>
<tbody>
<tr>
<td bgcolor="FF0000" class="text-left"> </td>
<td class="text-left">T<sub>MED</sub> GBA</td>
<td class="text-right">7.7 °</td>
<td class="text-right">9.5 °</td>
<td class="text-right">7.5 °</td>
<td class="text-right">9 °</td>
<td class="text-right">10 °</td>
</tr>
<tr>
<td class="text-left"> </td>
<td class="text-left"><b>TOTALES:</b></td>
<td class="text-right"><b>90.3909</b></td>
<td class="text-right"><b>

In [148]:
# Header --> extract the dates.
# Only the header cells of the target table are searched, so <th> cells that
# belong to any other table on the page cannot leak into the result.
header = right_table.find_all('th')
dates = []
# The first two header cells (a blank cell and 'Licenciataria') are not dates.
for cell in header[2:]:
    # find(string=True) returns the first text node of the cell, i.e. the
    # 'dd/mm/yy' text that precedes the <sup> footnote marker.
    date = datetime.strptime(cell.find(string=True).strip(), '%d/%m/%y').date()
    print(date)
    dates.append(date)
# Build the frame directly from the collected dates.
df_table = pd.DataFrame({'date': dates})
df_table

2022-06-23
2022-06-24
2022-06-25
2022-06-26
2022-06-27


Unnamed: 0,date
0,2022-06-23
1,2022-06-24
2,2022-06-25
3,2022-06-26
4,2022-06-27


In [149]:
# Temperatures:
# Locate the row through the text node that contains 'GBA'; its parent is the
# label <td> and the grandparent is the <tr> holding the temperature cells.
rows = soup.find(string=re.compile('GBA')).parent.parent.contents
temps = []
# Skip the first four child nodes of the row; this assumes they are the
# whitespace nodes plus the color and label cells -- TODO confirm if the page
# layout changes.
for row in rows[4:]:
    # Whitespace between the cells shows up as plain text nodes, not <td> tags.
    if row.name != 'td':
        continue
    # Drop the degree sign; strip() also removes the non-breaking space that
    # separates it from the number.
    text = row.get_text().replace('°', '').strip()
    # NOTE: some cells only carry font/color settings with a '\n' as text,
    # hence the emptiness check.
    if text:
        temps.append(float(text))
df_table['temp'] = temps
df_table

Unnamed: 0,date,temp
0,2022-06-23,7.7
1,2022-06-24,9.5
2,2022-06-25,7.5
3,2022-06-26,9.0
4,2022-06-27,10.0


In [150]:
# Distribution companies whose rows are extracted, one DataFrame column each.
# ('TOTALES:' was left commented out of this list.)
columns = ['MetroGAS', 'Naturgy BAN', 'Camuzzi Gas Pampeana', 'Camuzzi Gas del Sur', 'Distribuidora de Gas Cuyana'
        , 'Distribuidora de Gas del Centro', 'Litoral Gas', 'Gasnor', 'GasNea']
for column in columns:
    # Escape the name so it is matched literally rather than as a regex pattern.
    label_cell = soup.find('td', string=re.compile(re.escape(column)))
    if label_cell is None:
        raise ValueError('No table row found for {!r}'.format(column))
    # children yields an iterator, whereas contents returns a list.
    rows = label_cell.parent.children
    rows_x_df = []
    for row in rows:
        # Whitespace between the cells shows up as plain text nodes, not <td> tags.
        if row.name != 'td':
            continue
        text = row.get_text().strip()
        # NOTE: some cells only carry font/color settings with a '\n' as text,
        # hence the emptiness check; the label cell itself is skipped as well.
        if text and text != column:
            rows_x_df.append(float(text))
    df_table[column] = rows_x_df
df_table

Unnamed: 0,date,temp,MetroGAS,Naturgy BAN,Camuzzi Gas Pampeana,Camuzzi Gas del Sur,Distribuidora de Gas Cuyana,Distribuidora de Gas del Centro,Litoral Gas,Gasnor,GasNea
0,2022-06-23,7.7,15.57,14.73,16.684,20.033,6.737,6.877,6.63,2.05,1.079
1,2022-06-24,9.5,15.827,14.7,16.722,20.553,6.862,6.75,6.6,1.95,1.075
2,2022-06-25,7.5,14.842,14.76,15.881,19.485,6.649,6.602,6.1,1.98,1.08
3,2022-06-26,9.0,13.592,13.83,14.86,18.192,6.171,6.694,5.8,1.8,1.027
4,2022-06-27,10.0,15.941,15.07,16.101,19.786,6.268,6.931,6.1,2.09,1.074
