<h1>AIA Webscraping Firms</h1>

This Jupyter notebook contains the code used to web-scrape the AIA firm directory.

<h2>Importing webscraping libraries</h2>

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

<h2>Defining the Webscraping Spider</h2>

This code block defines what our web spider will do - note that it doesn't run it, it just defines it. We extend the existing scrapy.Spider class rather than doing everything from scratch, so we only have minimal coding to do.

We tested this code using one page:

http://www.aiaworldwide.com/find-an-international-accountant

AIA are helpful enough to just list all of their members on one page, but the details are relatively limited. Luckily each member has a link to their full profile, so we can scrape those links as stage one and then use those links to get the remaining details.

#Ran this code once to get url list, which is saved

class MySpider(scrapy.Spider):
    """Stage-one spider: collect the link hrefs listed on the directory page.

    Requests the single directory page and appends the whitespace-normalised
    ``href`` values found in its table rows, as one list, to the module-level
    ``full_url_output`` list (which must exist before the crawl starts).
    """
    name = "AIA"

    def start_requests(self):
        """Yield the request for the one directory page."""
        directory_pages = ['http://www.aiaworldwide.com/find-an-international-accountant']

        for page in directory_pages:
            yield scrapy.Request(url=page, callback=self.parse)

    def parse(self, response):
        """Store every link href found in the table rows of ``response``."""
        hrefs = response.xpath('//tbody//tr//a//@href').extract()

        # split() with no argument already breaks on any run of whitespace
        # (newlines, tabs and carriage returns included), so re-joining the
        # pieces with single spaces collapses and trims in one step.
        cleaned = []
        for href in hrefs:
            cleaned.append(' '.join(href.split()))

        full_url_output.append(cleaned)
        
# Collector that MySpider.parse appends to: one list of hrefs per crawled page.
full_url_output = []

process = CrawlerProcess()
process.crawl(MySpider)
process.start()

# Flatten the links of every crawled page. Iterating (instead of indexing
# page 0) means a crawl that returned nothing produces an empty list rather
# than an IndexError. Non-ASCII characters are replaced with '?'.
final_url_output = []

for page_links in full_url_output:
    for link in page_links:
        final_url_output.append(link.encode('ascii', 'replace'))

# Write one link per line. The with-block closes the file even if a write
# fails; binary mode plus a bytes newline writes the encoded links unchanged
# on both Python 2 and Python 3.
with open('/home/de-admin/Documents/Webscraping/AIA_links.txt', 'wb') as f:
    for url in final_url_output:
        f.write(url + b'\n')

<h2>Defining the Firm Profile Spider</h2>

We now have the list of urls and can proceed to the final webscraping

In [2]:
# Reload the links written by the first crawl; each line of the file becomes
# one entry of url_saved.
links_path = '/home/de-admin/Documents/Webscraping/AIA_links.txt'
with open(links_path) as f:
    saved_text = f.read()
url_saved = saved_text.splitlines()

In [3]:
class MySpider2(scrapy.Spider):
    """Stage-two spider: scrape the detail fields from each profile page.

    Pages are requested from the relative links held in the module-level
    ``url_saved`` list. For every page, ``parse`` appends one record to the
    module-level ``full_firm_output`` list (which must exist before the crawl
    starts). A record is a list of eleven lists of cleaned strings, in this
    order: title, initials, name, qualifications, company, town, country,
    telephone, fax, website, status.
    """
    name = "AIA_FIRMS"

    custom_settings = {
        'DOWNLOAD_DELAY': 1,
    }

    # Site root that the saved relative links are appended to.
    base_url = 'http://www.aiaworldwide.com'

    # Portion of ``url_saved`` that is crawled.
    # NOTE(review): only three links are selected, which looks like a
    # leftover test range - confirm, then widen (e.g. slice(None)) for a
    # full crawl.
    url_slice = slice(10, 13)

    def start_requests(self):
        """Yield one request per selected profile link."""
        for relative_link in url_saved[self.url_slice]:
            yield scrapy.Request(url=self.base_url + relative_link, callback=self.parse)

    @staticmethod
    def _field_xpath(css_class):
        """Return the XPath selecting the text of the field wrapped in ``css_class``."""
        # normalize-space() makes the class comparison ignore leading and
        # trailing blanks, so a stray space in the page markup or in a
        # selector literal cannot silently leave a column empty.
        return (
            '//div[normalize-space(@class)="' + css_class + '"]'
            '//div[@class="field-item even"]//text()'
        )

    @staticmethod
    def _clean(values):
        """Collapse every run of whitespace to one space and trim both ends."""
        return [' '.join(value.split()) for value in values]

    def parse(self, response):
        """Extract the eleven fields of one page and store them as a record."""
        field = self._field_xpath

        # One selector per output column, in record order.
        selectors = [
            # title
            field('field field-name-field-person-title field-type-taxonomy-term-reference field-label-inline clearfix'),
            # initials
            field('field field-name-field-person-initials field-type-text field-label-inline clearfix'),
            # name
            '//h1//text()',
            # qualifications
            field('field field-name-field-designatory-letters field-type-text field-label-above'),
            # company
            field('field field-name-field-company-name field-type-text field-label-inline clearfix'),
            # town
            field('field field-name-field-city field-type-text field-label-inline clearfix'),
            # country
            field('field field-name-field-country field-type-country field-label-inline clearfix'),
            # telephone
            field('field field-name-field-telephone field-type-text field-label-inline clearfix'),
            # fax
            field('field field-name-field-fax field-type-text field-label-inline clearfix'),
            # website
            field('field field-name-field-website field-type-link-field field-label-inline clearfix'),
            # status (taken from the alt text of the image field)
            '//div[@class="field field-name-field-image field-type-image field-label-hidden"]//img//@alt',
        ]

        record = [self._clean(response.xpath(selector).extract()) for selector in selectors]
        full_firm_output.append(record)

<h2>Running the Webscraping</h2>

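Note: you can't re-run the code below within a single session, because Scrapy's `CrawlerProcess` runs on the Twisted reactor, which cannot be restarted once it has stopped (a second `process.start()` raises `ReactorNotRestartable`). Restart the kernel between runs.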

This code creates a lightweight container for our webspider and then runs it - to be honest understanding this is probably optional unless it breaks.

In [None]:
#add fax

In [4]:
# Module-level collector that MySpider2.parse appends one record to per page.
full_firm_output = []

# Create the crawler process, register the spider class, then run the crawl.
# start() comes last so the collector above already exists when parse runs.
process = CrawlerProcess()
process.crawl(MySpider2)
process.start()

2017-09-25 12:23:55 [scrapy.utils.log] INFO: Scrapy 1.3.3 started (bot: scrapybot)
2017-09-25 12:23:55 [scrapy.utils.log] INFO: Overridden settings: {}
2017-09-25 12:23:55 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.corestats.CoreStats']
2017-09-25 12:23:55 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddle

We've now downloaded all the pages that we want to scrape. The first thing to do is to examine what we got back

In [5]:
# Dump every scraped record so the extracted fields can be checked by eye.
print(full_firm_output)

[[[u'Mr'], [u'GY'], [u'Njini'], [u'FAIA ACIS FCEA FIPFM MBA'], [u'Accountancy Services & Audit (ASA Group)'], [u'Bamenda'], [u'Cameroon'], [u'00237 7743 9468'], [u'Member in Practice']], [[u'Mr'], [u'SD'], [u'Bowman'], [u'FAIA FFA FMAAT'], [u'Stephen Bowman Associates'], [u'Bangor'], [u'United Kingdom'], [u'02891 459 455'], [u'Member in Practice']], [[u'Mr'], [u'GG'], [u'Gench'], [u'MIL FAIA'], [u'Gench & Company'], [u'Barking'], [u'United Kingdom'], [u'020 8220 6614'], []]]


def _anchor_positions(name_pages, value_pages):
    """For each page, locate that page's names inside the page's values.

    Entry ``i`` of the result lists, for every name on page ``i`` whose
    ASCII-encoded form ('?' replacing non-ASCII characters) occurs in
    ``value_pages[i]``, the index of its first occurrence there.
    """
    positions = []
    for page_no in range(len(name_pages)):
        hits = []
        for name in name_pages[page_no]:
            key = name.encode('ascii', 'replace')
            if key in value_pages[page_no]:
                hits.append(value_pages[page_no].index(key))
        positions.append(hits)
    return positions


# NOTE(review): full_nm_output, full_ad_output, full_co_output,
# full_wb_output and full_em_output are not assigned in the visible part of
# this notebook - confirm where they come from before running this cell.
row_index_ad = _anchor_positions(full_nm_output, full_ad_output)
row_index_co = _anchor_positions(full_nm_output, full_co_output)
row_index_wb = _anchor_positions(full_nm_output, full_wb_output)
row_index_em = _anchor_positions(full_nm_output, full_em_output)

def _split_after_anchors(value_pages, anchor_pages):
    """Cut each page's values into the segments that follow each anchor.

    For page ``i`` the first anchor position is skipped and the cut starts at
    index 0 (presumably the first name sits at index 0 - TODO confirm). Each
    later anchor closes the segment opened after the previous one, and the
    values after the last anchor form the final segment of that page.
    Segments of all pages are returned as one flat list.
    """
    segments = []
    for page_no in range(len(anchor_pages)):
        page_values = value_pages[page_no]
        start = 0
        for index in anchor_pages[page_no][1:]:
            segments.append(page_values[start + 1:index])
            start = index

        # The tail must be sliced from this page's own values; slicing the
        # outer list of pages here would append whole pages, not the
        # remaining values of the page.
        segments.append(page_values[start + 1:])
    return segments


# Flatten the names of every page into one list of ASCII-encoded strings.
split_output_nm = [
    name.encode('ascii', 'replace')
    for page_names in full_nm_output
    for name in page_names
]

split_output_ad = _split_after_anchors(full_ad_output, row_index_ad)
split_output_co = _split_after_anchors(full_co_output, row_index_co)
split_output_wb = _split_after_anchors(full_wb_output, row_index_wb)
split_output_em = _split_after_anchors(full_em_output, row_index_em)

# Quick sanity check: size and first three entries of each list.
print(len(split_output_nm),split_output_nm[0:3])
print(len(split_output_ad),split_output_ad[0:3])
print(len(split_output_co),split_output_co[0:3])
print(len(split_output_wb),split_output_wb[0:3])
print(len(split_output_em),split_output_em[0:3])

# Split each "name, qualifications" entry: the text before the first comma
# goes to split_output_nm2, and the text between the first and second comma
# is broken on spaces into qualification tokens.
split_output_nm2 = []
split_output_ql = []
split_output_ql2 = []

for entry in split_output_nm:
    parts = entry.split(",")
    split_output_nm2.append(parts[0])
    # An entry without a comma has no qualifications part; treat it as empty
    # instead of raising IndexError and aborting the whole run.
    quals = parts[1] if len(parts) > 1 else ""
    split_output_ql.append(quals.split(" "))

# Splitting on a single space leaves empty strings for leading or repeated
# spaces; drop them so only real tokens remain.
for tokens in split_output_ql:
    split_output_ql2.append([token for token in tokens if token])

# ad: keep one list per segment, ASCII-encoding every value ('?' replaces
# non-ASCII characters).
split_output_ad2 = [
    [value.encode('ascii', 'replace') for value in segment]
    for segment in split_output_ad
]

# co: flatten - every value of a non-empty segment becomes its own entry,
# while an empty segment contributes a single "" placeholder.
# NOTE(review): a segment holding more than one value adds more than one
# entry, so this list may not stay index-aligned with the name list - confirm.
split_output_co2 = []
for segment in split_output_co:
    if not segment:
        split_output_co2.append("")
        continue
    split_output_co2.extend(value.encode('ascii', 'replace') for value in segment)

# em: same flattening rule as for co.
split_output_em2 = []
for segment in split_output_em:
    if not segment:
        split_output_em2.append("")
        continue
    split_output_em2.extend(value.encode('ascii', 'replace') for value in segment)

# wb: glue all encoded parts of a segment into one string; an empty segment
# yields "".
split_output_wb2 = [
    "".join(part.encode('ascii', 'replace') for part in segment) if segment else ""
    for segment in split_output_wb
]

# Assemble one row per name by picking the same position from every column
# list: name, qualification tokens, ad, co, wb, em.
firms_matrix_IFA = [
    [
        split_output_nm2[row],
        split_output_ql2[row],
        split_output_ad2[row],
        split_output_co2[row],
        split_output_wb2[row],
        split_output_em2[row],
    ]
    for row in range(len(split_output_nm))
]

# Show the first row and the row count.
print(firms_matrix_IFA[0])
print(len(firms_matrix_IFA))

# Drop exact duplicate rows while keeping first-seen order. Rows contain
# lists, so membership is checked by comparing against the rows kept so far.
final_matrix_IFA = []
for firm_row in firms_matrix_IFA:
    if firm_row in final_matrix_IFA:
        continue
    final_matrix_IFA.append(firm_row)

# Show the first remaining row and the de-duplicated row count.
print(final_matrix_IFA[0])
print(len(final_matrix_IFA))