In [78]:
#imports
from bs4 import BeautifulSoup
import requests
import time

import pandas as pd

#set ups
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [79]:
#visiting the webpage https://aaiasb.gr/publications/investigation-reports
response = requests.get('https://aaiasb.gr/publications/investigation-reports', ) 
response

<Response [200]>

In [80]:
#parsing the webpage
soup = BeautifulSoup(response.text, 'html.parser')
soup

<!DOCTYPE html>

<html dir="ltr" lang="el-gr" vocab="http://schema.org/">
<head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/images/favicon.png" rel="shortcut icon"/>
<link href="/images/apple-touch-icon.png" rel="apple-touch-icon"/>
<meta charset="utf-8">
<base href="https://aaiasb.gr/publications/investigation-reports"/>
<meta content="Hellenic AAIASB,AAIASB,EDAAP, ΕΔΑΑΠ, Διερεύνηση Αεροπορικών Ατυχημάτων , Air Accident Investigation and Aviation Safety Board" name="keywords">
<meta content="Hellenic Air Accident Investigation and Aviation Safety Board (AAIASB) Official Webpage
Επιτροπή Διερεύνησης Ατυχημάτων &amp; Ασφάλειας Πτήσεων (ΕΔΑΑΠ)" name="description"/>
<meta content="Joomla! - Open Source Content Management" name="generator"/>
<title>Πορίσματα / Εκθέσεις</title>
<link href="/media/cck/css/cck.css" rel="stylesheet">
<link href="/media/cck/css/cck.responsive.css" rel="stylesheet">
<l

In [81]:
# we want to create a df that contains all the urls we want to scrape at once
base_url = 'https://aaiasb.gr/publications/investigation-reports'
ending = '?start='
numbers = [50, 100, 150]

In [82]:
urls = [base_url]
for n in numbers:
    url = base_url+ending+str(n)
    urls.append(url)
  

In [83]:
print(urls)

['https://aaiasb.gr/publications/investigation-reports', 'https://aaiasb.gr/publications/investigation-reports?start=50', 'https://aaiasb.gr/publications/investigation-reports?start=100', 'https://aaiasb.gr/publications/investigation-reports?start=150']


In [84]:
# checking our df
df1 = pd.DataFrame(urls) 
df1 = df1.rename(columns={df1.columns[0]:'url'})
df1

Unnamed: 0,url
0,https://aaiasb.gr/publications/investigation-r...
1,https://aaiasb.gr/publications/investigation-r...
2,https://aaiasb.gr/publications/investigation-r...
3,https://aaiasb.gr/publications/investigation-r...


In [85]:
df1.url[3]

'https://aaiasb.gr/publications/investigation-reports?start=150'

In [89]:
# #create an empty list named entries, so as to store the table's data there later
entries = []

for url in urls:  
    
    response = requests.get(url)
    time.sleep(3)
    soup_doc = BeautifulSoup(response.text, 'html.parser')
 # after inspecting, we grab the part of the page we really need
    page=soup.select('div.cck_page_items')[0]

#after inspecting, we grab all "tr" in the "table" located in our "page". 
#we store those tr in a variable called rows
#we don't need the header, so we gonna scrape all the items of the list from the second to the last one
    rows = page.find('table').find_all('tr')[1:]
 #loop through those tr

    for tr in rows:
    #we are gonna scrape its element we need seperately. So we will inspect the page and scrape it through the html elements
    
    #starting with the first date mentioned under the "Τελικό Πόρισμα" verbatim, which is located in the first cell of each row (td[0])
    #each first cell contains three 'div'. The second 'div' found in the first cell of each row is the fist thing we will grab
    
        conclusion_date1 = tr.find_all('td')[0].find_all('div')[1].text.strip()
  
        conclusion_date2 = tr.find_all('td')[0].find_all('div')[2].text.strip()
    
    #incident info, including date and category, is found in the second cell of each row and
    #is the first div in it
        incident_info = tr.find_all('td')[1].find_all('div')[0].text.strip()
    #incident type is the second div in it
        incident_type = tr.find_all('td')[1].find_all('div')[1].text.strip()
    #incident description found in a tooltip is a 'span' and, specifically, the first span found in each row
        incident_description = str(tr.find_all('td')[1].find_all('span', attrs={'uk-icon':'info'})[0])
    #if that incident had fatalities, that is found in the second cell of each row and
    #is the third div in it
        fatalities = tr.find_all('td')[1].find_all('div')[2].text.strip()
    #fatalities description is found in a tooltip and is a 'span' -specifically, the second span found in each row
        fatalities_description = str(tr.find_all('td')[1].find_all('span', attrs={'uk-icon':'info'})[1])
    #area is found in the third cell of each row and is the first div in it
        area = tr.find_all('td')[2].find_all('div')[0].text.strip()
    #registry is found in the third cell of each row and is the second div in it
        registry = tr.find_all('td')[2].find_all('div')[1].text.strip()
        #aircraft type is found in the third cell of each row and is in the second to last div in it
        aircraft_type = tr.find_all('td')[2].find_all('div')[-2].text.strip()
    #more aircraft info is found in the third cell of each row and is the last div in it
        aircraft_info = tr.find_all('td')[2].find_all('div')[-1].text.strip()
    #because the structure of the last column of the table changes sometimes, we gonna scrape 
    #all info found in the third cell of each row also
    #so, we can extract missing info in the cleaning stage of the project
        area_info = tr.find_all('td')[2].text.strip()
  
    #now that we've scraped the desired data and we've stored them in variables
    #we gonna create a dictionary
    #each one of the dict keys is the column name of our future df
    #each key holds the respective variable with our scraped data as a value
    dict = {'conclusion_date1': conclusion_date1,
            'conclusion_date2': conclusion_date2,
            'incident_info': incident_info,
            'incident_type': incident_type,
            'incident_description': incident_description,
            'fatalities': fatalities,
            'fatalities_description': fatalities_description,
            'area': area,
            'registry': registry,
            'aircraft_type': aircraft_type,
            'aircraft_info': aircraft_info,
            'area_info': area_info}
    
    #now, we append our initially empty list called entries with the dictionary we created
    entries.append(dict)
         
    #let's turn our entries list into a df
    df =pd.DataFrame(entries)

#check out how our df looks like!
    df


In [90]:
df = pd.DataFrame(entries)
df.incident_info.value_counts()

17/07/2015\nΑΤΥΧΗΜΑ    4
Name: incident_info, dtype: int64

In [88]:
df.to_csv('all_incidents.csv',index=False)