# Download ENEE Monthly Data

This code downloads the monthly reports on power generation and demand published by ENEE (Honduras's power company). It downloads the monthly PDFs for every month available on ENEE's website and stores them in a specified location. It uses the BeautifulSoup library to parse ENEE's website; BeautifulSoup can be found here: https://www.crummy.com/software/BeautifulSoup/

**SYPA 2019**<br/>
**Authors**: Danny Barjum, Yalda Amini

<hr style="height:2pt">

In [1]:
import os
import re
import sys
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
"""
Notebook-wide settings: ENEE web addresses, the local download folder,
the Spanish month names used to recognise report links, and the report year.
"""
# Scheme and host of the ENEE website; prepended to site-relative links.
ENEE_DOMAIN = 'http://www.enee.hn'

# Page that lists the monthly statistical bulletins.
ENEE_HTTP_SITE = ENEE_DOMAIN + '/index.php/planificacionicono/182-boletines-estadisticos'

# Folder the PDFs are written to ('~' is expanded to the user's home directory).
PATH_TO_WRITE = os.path.expanduser('~/Downloads/ENEE/')

# Upper-case Spanish month names, January through December.
MONTHS = [
    'ENERO', 'FEBRERO', 'MARZO',
    'ABRIL', 'MAYO', 'JUNIO',
    'JULIO', 'AGOSTO', 'SEPTIEMBRE',
    'OCTUBRE', 'NOVIEMBRE', 'DICIEMBRE',
]

# Year used to name the files for the first links in the scraped list.
YEAR = 2018

In [3]:
"""
Download ENEE webpage, try 10 times at most.

A non-200 response or a network error counts as a failed attempt. The cell
waits between attempts and raises RuntimeError once every attempt has failed,
so later cells never run against a missing or error page.
"""
MAX_ATTEMPTS = 10
RETRY_DELAY_SECONDS = 10
REQUEST_TIMEOUT_SECONDS = 60

html_page = None
for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
        # A timeout keeps an unresponsive server from hanging the cell forever.
        html_page = requests.get(ENEE_HTTP_SITE, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as error:
        # Connection problems are retried just like bad status codes.
        print(error)
        html_page = None

    if html_page is not None and html_page.status_code == 200:
        print("Success requesting page")
        break

    # Only wait when another attempt will actually follow.
    if attempt < MAX_ATTEMPTS:
        print("Could not get page, trying again in {} seconds".format(RETRY_DELAY_SECONDS))
        time.sleep(RETRY_DELAY_SECONDS)
else:
    # The loop finished without a break: no attempt returned status 200.
    print("Could not get page after {} attempts, quitting".format(MAX_ATTEMPTS))
    raise RuntimeError("Could not download " + ENEE_HTTP_SITE)

Success requesting page


In [4]:
# Text body of the bulletin-page response stored in html_page.
enee_stats_page = html_page.text
# Parse the page with Python's built-in "html.parser" backend so links can be queried.
enee_soup = BeautifulSoup(enee_stats_page, "html.parser")

In [5]:
# Print the parsed page, re-indented by BeautifulSoup, to inspect its structure.
print(enee_soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="es-es" xml:lang="es-es" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="Juergen Koller - http://www.lernvid.com" name="designer"/>
  <meta content="Creative Commons 3.0" name="licence"/>
  <link href="/templates/allrounder-j1.6/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <!--- ADD CSS Files -->
  <link href="/templates/allrounder-j1.6/css/template.css" media="all" rel="stylesheet" type="text/css"/>
  <link href="/templates/allrounder-j1.6/css/joomla.css" media="all" rel="stylesheet" type="text/css"/>
  <link href="/templates/allrounder-j1.6/css/colors.css" media="all" rel="stylesheet" type="text/css"/>
  <link href="/templates/allrounder-j1.6/css/lvdropdown.css" media="all" rel="stylesheet" type="text/css"/>
  <link href="/templates/allrounder-j1.6/css/typo.css" media="all" rel="styleshee

In [6]:
"""
Extract Links from HTML

Keep every <a> element whose HTML (tag, attributes and text) contains one of
the month names in MONTHS. Anchors without an href attribute are skipped,
because the following cells read anchor['href'] and would raise KeyError.
"""
links = enee_soup.find_all('a')
links_2_pdfs = []

for anchor in links:
    if not anchor.has_attr('href'):
        continue
    # Render the anchor once instead of once per month name.
    anchor_html = str(anchor)
    if any(month in anchor_html for month in MONTHS):
        links_2_pdfs.append(anchor)

In [7]:
"""
Report how many report links were kept, then list the href of each one
"""
link_count = len(links_2_pdfs)
print(str(link_count)+" links have been found:")
print()
for anchor in links_2_pdfs:
    print(anchor['href'])

140 links have been found:

/planificacion/2018/boletines/Boletin%20Estadistico%20Enero%2018.pdf
/planificacion/2018/boletines/Boletin%20Estadistico%20Febrero%2018.pdf
/planificacion/2018/boletines/Boletin%20Estadistico%20Marzo%202018.pdf
/planificacion/2018/boletines/Boletin%20Estadistico%20Abril%202018.pdf
/Bid/2018/agosto/Boletin%20estadistico%20Mayo%2018.pdf
/planificacion/2018/boletines/Boletin%20Estadistico%20Junio%2018.pdf
/planificacion/2018/boletines/BOLETIN%20ESTADISTICO%20JULIO%202018%20PDF.pdf
/planificacion/2018/boletines/BOLETIN%20ESTADISTICO%20AGOSTO%202018%20PDF.pdf
/planificacion/2017/boletines/Boletin%20Estadistico%20(Enero%202017).pdf
/planificacion/2017/boletines/Boletin%20Estadistico%20(Febrero%202017).pdf
/planificacion/2017/boletines/Boletin%20Estadistico%20(MARZO%202017).pdf
/planificacion/2017/boletines/Boletin%20Estadistico%20(Abril%202017).pdf
/planificacion/2017/boletines/Boletin%20Estadistico%20Mayo%202017.pdf
/planificacion/2017/boletines/Boletin%20Estadis

In [8]:
"""
Clean links for ease of use

Turn every href into an absolute URL. Each href is resolved against the page
it was scraped from: absolute URLs are kept, site-relative paths ("/...")
get the ENEE scheme and host, and page-relative paths are resolved correctly
rather than being glued straight onto the domain.
"""
clean_links = [
    urljoin(ENEE_HTTP_SITE, str(anchor['href']))
    for anchor in links_2_pdfs
]

In [9]:
"""
Display Clean Links
"""
for i in clean_links:
    print(i)

http://www.enee.hn/planificacion/2018/boletines/Boletin%20Estadistico%20Enero%2018.pdf
http://www.enee.hn/planificacion/2018/boletines/Boletin%20Estadistico%20Febrero%2018.pdf
http://www.enee.hn/planificacion/2018/boletines/Boletin%20Estadistico%20Marzo%202018.pdf
http://www.enee.hn/planificacion/2018/boletines/Boletin%20Estadistico%20Abril%202018.pdf
http://www.enee.hn/Bid/2018/agosto/Boletin%20estadistico%20Mayo%2018.pdf
http://www.enee.hn/planificacion/2018/boletines/Boletin%20Estadistico%20Junio%2018.pdf
http://www.enee.hn/planificacion/2018/boletines/BOLETIN%20ESTADISTICO%20JULIO%202018%20PDF.pdf
http://www.enee.hn/planificacion/2018/boletines/BOLETIN%20ESTADISTICO%20AGOSTO%202018%20PDF.pdf
http://www.enee.hn/planificacion/2017/boletines/Boletin%20Estadistico%20(Enero%202017).pdf
http://www.enee.hn/planificacion/2017/boletines/Boletin%20Estadistico%20(Febrero%202017).pdf
http://www.enee.hn/planificacion/2017/boletines/Boletin%20Estadistico%20(MARZO%202017).pdf
http://www.enee.hn/p

In [10]:
def _iter_report_periods(link_count, newest_year):
    """
    Yield the (year, month) pair used to name each downloaded file, in link order.

    The first ``link_count % 12`` links are numbered as months 1, 2, ... of
    ``newest_year``; every following group of 12 links is numbered 1-12 of the
    preceding year. When ``link_count`` is a multiple of 12, ``newest_year``
    itself receives a full 12 links.
    """
    months_in_year = link_count % 12 or 12
    year = newest_year
    month = 1
    for _ in range(link_count):
        yield year, month
        month += 1
        if month > months_in_year:
            # Step back to the previous year, which gets a full 12 links.
            year -= 1
            month = 1
            months_in_year = 12


def download_pdfs(links, timeout=60):
    """
    Download the PDF behind every link and store it in PATH_TO_WRITE.

    Files are named ``<year><two-digit month>_ENEEstats.pdf``. The year and
    month come from the link's position in ``links``, not from the URL: the
    list is assumed to start with the months of YEAR in ascending order,
    followed by 12 links for each earlier year (newest year first). A month is
    consumed even when its download fails, so later files keep their names.

    Args:
        links: sequence of absolute URLs to download.
        timeout: seconds to wait for the server before a request is
            abandoned (passed to ``requests.get``).

    Returns:
        The links whose request returned a status code other than 200.

    Raises:
        SystemExit: with exit code 1 when a request raises
            ``requests.exceptions.RequestException`` (the error is printed
            first).
    """
    # Create the target folder up front so open() does not fail on a fresh machine.
    os.makedirs(PATH_TO_WRITE, exist_ok=True)

    failed_links = []
    periods = _iter_report_periods(len(links), YEAR)

    for link, (year, month) in zip(links, periods):
        try:
            response = requests.get(link, allow_redirects=True, timeout=timeout)
        except requests.exceptions.RequestException as error:
            print(error)
            sys.exit(1)

        if response.status_code != 200:
            # Flag the link and move on.
            failed_links.append(link)
            continue

        filename = '{}{:02d}_ENEEstats.pdf'.format(year, month)
        with open(os.path.join(PATH_TO_WRITE, filename), 'wb') as fp:
            fp.write(response.content)

    return failed_links

In [11]:
# Run the download and report any links that could not be fetched.
missing_files = download_pdfs(clean_links)

if not missing_files:
    print("Success downloading files")
else:
    print("Could not retreive " + str(len(missing_files)) + " file(s):")
    for failed_link in missing_files:
        print(failed_link)

Success downloading files
