## Project: Lawyer namecard collector
    - short description: Scrape public information of Hungarian lawyers from the official website
    - Opens website, goes to first page of list, then Selenium clicks on next page until the last one
    - Several inconsistencies throughout the original webpage, so I had to figure out a few workarounds
    - webpage has been updated, last successful run: 2016.11.01

In [2]:
# [1] Packages
import numpy as np

import requests
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import csv
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from lxml import etree
try:
    from urlparse import urljoin
except:
    from urllib.parse import urljoin
    
mainurl = "http://www.magyarugyvedikamara.hu/tart/lawyerdata?status=aktiv&functional=sf.lawyer"

In [4]:
# [2] downloads all the data for one lawyer in one cell, variables are in Hungarian
def lawyer_card (driver,k):
    """Collect every lawyer card found on the page currently loaded in ``driver``.

    The page source is parsed with BeautifulSoup; each ``div.ugyvedcard``
    inside ``div#content`` becomes one dictionary. The ``name`` entry is the
    text of the card's second ``th`` with its last character removed. Every
    ``td`` cell after the first is checked for a known label, and the text of
    the following cell is stored under the matching key.

    Args:
        driver: object exposing the current page as ``page_source``.
        k: caller's page counter, only forwarded to ``page_error`` on failure.

    Returns:
        A list of dictionaries, one per card parsed before any failure.
    """
    # (label substring, output key) pairs; order matters because the first
    # matching label wins, as with the original priority list.
    label_keys = (
        ('kamara', 'megye'),
        ('iroda cím', 'cim'),
        ('jogterület', 'jogkor'),
        ('irodai e-mail cím:', 'mail'),
        ('honlap:', 'honlap'),
        ('telefon:', 'tel'),
    )
    page_list = list()
    try:
        soup = BeautifulSoup(driver.page_source, "lxml")
        main = soup.find('div', attrs={'id':'main'})
        content = main.find_next('div', attrs={'id':'content'})
        for card in content.find_all("div", class_="ugyvedcard"):
            lawyer_card_dict = dict()
            headers = card.find_all('th')
            lawyer_card_dict['name'] = (headers[1].text)[:-1]
            rows = card.find_all('td')
            # Stop one cell short of the end: a label's value is read from
            # the cell that follows it, so the last cell cannot be a label.
            for idx in range(1, len(rows) - 1):
                cell_text = rows[idx].text
                for needle, key in label_keys:
                    if needle in cell_text:
                        lawyer_card_dict[key] = rows[idx + 1].text
                        break
            page_list.append(lawyer_card_dict)
    except Exception:
        # The loop index no longer shadows ``k``, so the caller's counter
        # is what gets reported.
        page_error(driver, k)
    return page_list

In [5]:
# [3] A few random pages differ from the others; this debug function reports which page acts weird.
def page_error (driver,k):
    """Report which page failed to parse.

    Reads the first 12 characters of the ``ul.pucu`` element that follows
    ``div#content`` in the current page source and uses them as the page
    label. If that lookup fails, the caller's counter ``k`` is reported
    instead.

    Args:
        driver: object exposing the current page as ``page_source``.
        k: caller's page counter, used only in the fallback message.

    Returns:
        The error message when the page label was found, otherwise ``None``.
    """
    try:
        soup = BeautifulSoup(driver.page_source, "lxml")
        main = soup.find('div', attrs={'id':'main'})
        content = main.find_next('div', attrs={'id':'content'})
        page_num = content.find_next("ul", class_="pucu")
        message = 'Error on the following page: ' + str(page_num.text[0:12])
    except Exception:
        print ("Error without page number counter is at: " + str(k))
        return None
    # Callers ignore the return value, so print the message as well;
    # otherwise a located page error would never be shown.
    print (message)
    return message

In [6]:
#[4] Reads url, opens website, collects data by calling previous functions
# Absolute XPath of the pager links; a 1-based position is appended per click.
pager_xpath = '/html/body/div/div[3]/div[2]/ul/li/h4/a'

driver=webdriver.Firefox()
solution_list = list()
try:
    driver.get(mainurl)
    # find_element("xpath", ...) is the locator form that works on both old
    # and current Selenium releases (find_element_by_xpath was removed in 4.x).
    driver.find_element("xpath", '/html/body/div/div[3]/div[2]/div[1]/form/table/tbody/tr[6]/td[2]/button').click()
    print ("11:01")
    # First four passes: the pager link to click is addressed by the pass index.
    # NOTE(review): XPath positions start at 1, so a[0] on the first pass
    # matches nothing and the same page is scraped again on the next pass;
    # presumably the cleanup step that skips index 0 relies on this - confirm.
    for j in range(0,4):
        print (j)
        solution_list.append(lawyer_card(driver,j))
        try:
            driver.find_element("xpath", pager_xpath + '[' + str(j) + ']').click()
        except Exception:
            page_error(driver, j)
    # Remaining passes: always click the third pager link.
    for k in range(0,120):
        print (k)
        solution_list.append(lawyer_card(driver,k))
        try:
            driver.find_element("xpath", pager_xpath + '[3]').click()
        except Exception:
            page_error(driver,k)
    # Finally try the fourth pager link and scrape whatever page it leads to.
    try:
        driver.find_element("xpath", pager_xpath + '[4]').click()
        solution_list.append(lawyer_card(driver,k=4))
    except Exception:
        print ("Is there last page?")
finally:
    # Always end the browser session, even when a step above raised.
    driver.quit()

WebDriverException: Message: 'geckodriver' executable needs to be in PATH. 


In [48]:
#[5] cleanse database when all data got scraped

# Shallow copy of the per-page results (the original referenced the
# undefined name ``solution`` here, which raised NameError).
clean_list = list(solution_list)

# Flatten the pages into one list of lawyer dictionaries. As before, the
# first page and the first entry of every remaining page are skipped.
full_list = list()
for page in clean_list[1:]:
    full_list.extend(page[1:])

if full_list:
    print ("Fist name to found: " + full_list[0]['name'])
    print ("Last name to found: " + full_list[-1]['name'])
else:
    # Nothing was scraped; report it instead of failing on full_list[0].
    print ("No lawyer records were collected.")

Az első név akit beolvasott: Dr ABAY PÉTER
Az utolsó név akit beolvasott: Dr BÁKONYI LÁSZLÓ


In [44]:
#[6] export it to a csv
# newline='' lets the csv module control line endings itself; without it
# an extra blank line appears after every row on Windows.
with open('lawyer_results.csv', encoding='utf-8', mode='w', newline='') as csvfile:
    # Column names of the csv file
    fieldnames = ['name', 'megye', 'cim', 'tel', 'mail','jogkor','honlap']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # Same skipping as the cleanup step: the first page and the first entry
    # of every remaining page are left out.
    for page in clean_list[1:]:
        writer.writerows(page[1:])