In [179]:
import requests
import re
from collections import defaultdict
from unidecode import unidecode
from bs4 import BeautifulSoup

In [184]:
class BillFinder:
    """Scrape lists of utility companies (energy, water, gas) from public web pages.

    Each ``get_*_companies`` method downloads the page configured for the
    requested country, extracts company names from it, prints how many names
    that call found, merges them into ``collected_companies`` and returns
    ``self`` so that calls can be chained.

    Attributes:
        source_urls_energy, source_urls_water, source_urls_gas:
            Mappings of country -> page URL, one per utility type.
        stop_words: Lowercase words dropped when company names are rebuilt
            from free text (used for the UK gas list).
        collected_companies: Mapping of result key (``'energy_companies'``,
            ``'water_companies'``, ``'gas_companies'``) -> sorted list of names.
    """

    # Parser handed to BeautifulSoup. Naming it explicitly keeps the parse tree
    # independent of which optional parsers happen to be installed.
    html_parser = 'html.parser'

    # Seconds to wait for a page before giving up (requests waits forever by default).
    request_timeout = 30

    def __init__(self):

        self.source_urls_energy = {'Canada': 'https://en.wikipedia.org/wiki/List_of_Canadian_electric_utilities',
                                   'USA': 'https://en.wikipedia.org/wiki/List_of_United_States_electric_companies',
                                   'UK': 'https://www.utilitysavingexpert.com/energy/suppliers/'}

        self.source_urls_water = {'USA': 'https://en.wikipedia.org/wiki/List_of_United_States_water_companies',
                                  'UK': 'https://en.wikipedia.org/wiki/United_Kingdom_water_companies'}

        self.source_urls_gas = {'USA': 'https://en.wikipedia.org/wiki/List_of_United_States_natural_gas_companies',
                                'UK': 'https://en.wikipedia.org/wiki/List_of_British_natural_gas_companies'}

        self.stop_words = 'company inc corporation ltd'.split()

        # A plain dict: the former factory-less defaultdict behaved exactly like one.
        self.collected_companies = {}

    # ------------------------------------------------------------------ helpers

    def _fetch_soup(self, source_urls, country):
        """Download and parse the page registered for ``country`` in ``source_urls``.

        The country is validated before any network access so that a typo
        fails immediately with a message listing the supported countries.
        """
        if country not in source_urls:
            supported = ', '.join(sorted(source_urls))
            raise KeyError(f'unsupported country {country!r}; supported countries: {supported}')

        response = requests.get(source_urls[country], timeout=self.request_timeout)
        # Fail loudly on 4xx/5xx instead of trying to scrape an error page.
        response.raise_for_status()
        return BeautifulSoup(response.text, self.html_parser)

    @staticmethod
    def _require(element, description):
        """Return ``element``, or raise ``LookupError`` if the page lacks it."""
        if element is None:
            raise LookupError(f'could not find {description}; the page layout may have changed')
        return element

    def _first_sibling_after_heading(self, soup, span_id, tag_name):
        """Return the first ``tag_name`` sibling following the heading that holds ``span_id``."""
        span = self._require(soup.find('span', id=span_id), f'section heading with id {span_id!r}')
        for sibling in span.parent.next_siblings:
            if sibling.name == tag_name:
                return sibling
        return None

    @staticmethod
    def _add_name(company_set, raw_text):
        """Add ``raw_text`` to ``company_set`` after stripping it; blank text is ignored."""
        name = raw_text.strip()
        if name:
            company_set.add(name)

    def _proper_name(self, fragment):
        """Keep only the capitalised / upper-case words of ``fragment`` that are not stop words."""
        kept = [word for word in fragment.split()
                if (word.isupper() or word.istitle()) and word.lower() not in self.stop_words]
        return ' '.join(kept)

    def _record(self, key, company_set):
        """Report this call's count and merge its names into ``collected_companies[key]``."""
        print(f'found {len(company_set):,} companies')

        # Merge rather than overwrite, so chained calls for several countries
        # accumulate instead of each call discarding the previous result.
        merged = company_set.union(self.collected_companies.get(key, ()))
        self.collected_companies[key] = sorted(merged)

        return self

    # --------------------------------------------------------------- public API

    def get_energy_companies(self, country: str = None):
        """Collect electricity supplier names for ``country``.

        Args:
            country: One of the keys of ``source_urls_energy``
                ('Canada', 'USA' or 'UK' by default).

        Returns:
            ``self``; names are merged into ``collected_companies['energy_companies']``.

        Raises:
            KeyError: ``country`` has no configured source page.
            LookupError: an expected anchor element is missing from the page.
        """
        soup = self._fetch_soup(self.source_urls_energy, country)
        company_set = set()

        if country == 'Canada':
            for table in soup.find_all('table', role="presentation"):
                for item in table.find_all('li'):
                    for link in item.find_all('a'):
                        self._add_name(company_set, link.text)
        elif country == 'USA':
            heading = self._require(soup.find('span', id="List_of_US_electric_companies_by_state"),
                                    'the "List of US electric companies by state" heading')
            # Walk the document forward from the heading; the per-state bullet
            # lists end where the first table begins.
            for element in heading.next_elements:
                if element.name == 'table':
                    break
                if element.name == 'ul':
                    for item in element.find_all('li'):
                        self._add_name(company_set, item.text)
        elif country == 'UK':
            for header_text in ['List of UK energy suppliers', 'List of business energy suppliers']:
                header = self._require(soup.find('h2', string=header_text),
                                       f'the {header_text!r} heading')
                for wrapper in header.parent.find_all('div', class_="table-responsive"):
                    for row in wrapper.find_all('tr'):
                        cells = row.find_all('td')
                        # The supplier name is in the second column; header rows
                        # (no <td>) and single-cell rows are skipped.
                        if len(cells) > 1:
                            self._add_name(company_set, cells[1].text)

        return self._record('energy_companies', company_set)

    def get_water_companies(self, country: str = None):
        """Collect water company names for ``country``.

        Args:
            country: One of the keys of ``source_urls_water`` ('USA' or 'UK' by default).

        Returns:
            ``self``; names are merged into ``collected_companies['water_companies']``.

        Raises:
            KeyError: ``country`` has no configured source page.
            LookupError: an expected anchor element is missing from the page.
        """
        soup = self._fetch_soup(self.source_urls_water, country)
        company_set = set()

        if country == 'USA':
            for heading in soup.find_all('h2'):
                # Only headings that contain a link are followed by a company list.
                if heading.find_all('a', href=True):
                    name_list = heading.find_next_sibling('ul')
                    if name_list is not None:
                        for item in name_list.find_all('li'):
                            self._add_name(company_set, item.text)
        elif country == 'UK':
            # England & Wales: the first table after each heading, name in column one.
            for section_id in ['Water_and_sewerage', 'Water_only']:
                table = self._first_sibling_after_heading(soup, section_id, 'table')
                if table is not None:
                    for row in table.find_all('tr'):
                        cells = row.find_all('td')
                        if cells:
                            self._add_name(company_set, cells[0].text)

            # Other regions: the first bullet list after each heading.
            for section_id in ['Scotland', 'Northern_Ireland', 'Crown_dependencies']:
                name_list = self._first_sibling_after_heading(soup, section_id, 'ul')
                if name_list is not None:
                    for item in name_list.find_all('li'):
                        self._add_name(company_set, item.text)

        return self._record('water_companies', company_set)

    def get_gas_companies(self, country: str = None):
        """Collect natural-gas company names for ``country``.

        Args:
            country: One of the keys of ``source_urls_gas`` ('USA' or 'UK' by default).

        Returns:
            ``self``; names are merged into ``collected_companies['gas_companies']``.

        Raises:
            KeyError: ``country`` has no configured source page.
            LookupError: an expected anchor element is missing from the page.
        """
        soup = self._fetch_soup(self.source_urls_gas, country)
        company_set = set()

        if country == 'USA':
            heading = self._require(soup.find('span', id='List'), 'the "List" heading')
            name_list = self._require(heading.parent.find_next_sibling('ul'),
                                      'the bullet list after the "List" heading')
            for item in name_list.find_all('li'):
                # Lines look like "<state> - <company>, <company>, ...". Split on
                # the FIRST dash only so hyphenated company names stay intact,
                # and skip lines that carry no separator at all.
                _, separator, companies = unidecode(item.text).partition('-')
                if not separator:
                    continue
                for candidate in companies.split(','):
                    self._add_name(company_set, candidate)
        elif country == 'UK':
            intro = self._require(soup.find(lambda t: (t.name == 'p') and ('Utilities' in t.text)),
                                  'the paragraph mentioning "Utilities"')
            for sibling in intro.next_siblings:
                if sibling.name == 'ul':
                    for item in sibling.find_all('li'):
                        text = item.text.strip()
                        if not text:
                            continue
                        if '(' not in text:
                            company_set.add(text)
                        else:
                            # Entries with parentheses bundle several names in free
                            # text; split them apart and keep the proper-noun words.
                            for fragment in re.split(r'[(),]| and ', text):
                                self._add_name(company_set, self._proper_name(fragment))
                    # Only the first list after the paragraph is of interest.
                    break

        return self._record('gas_companies', company_set)

In [185]:
# Create the scraper; nothing is downloaded until a get_*_companies method is called.
bf = BillFinder()

In [186]:
# Scrape UK gas, US gas and US water suppliers in one chain; every call returns bf.
(
    bf.get_gas_companies(country='UK')
    .get_gas_companies(country='USA')
    .get_water_companies('USA')
)

found 23 companies
found 159 companies
found 558 companies


<__main__.BillFinder at 0x108df9970>

In [None]:
# Show the names gathered so far, keyed by utility type (e.g. 'gas_companies').
bf.collected_companies