In [1]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import requests
import joblib
import re

In [2]:
url = 'https://www.dha.gov.za/index.php/immigration-services/south-african-ports-of-entry'

In [3]:
page = requests.get(url)
soup = BeautifulSoup(page.text, "html")

In [4]:
tables = soup.find_all('table')
all_ports_of_entries_sa = []

In [5]:
def replace_escape_seq(text):
    return text.replace('\t', '').replace('\n', '').replace('\r', '')
    

In [6]:
for table in tables:
    country = replace_escape_seq(table.find_previous('h4').text.strip()) 
    #headers have bgcolor
    headers = [td.text.strip() for td in table.find_all('td', {'bgcolor': True})]

    #skip first one as header
    for row in table.find_all('tr')[1:]:
        cols =[replace_escape_seq(td.text.strip()) for td in row.find_all('td')]
        all_ports_of_entries_sa.append([country] + cols)

In [7]:
df = pd.DataFrame(all_ports_of_entries_sa, columns=['Country', 'PortControlOffice', 'Hours', 'Tel'])

In [8]:
new = df.rename(columns={'Country':'countryTwo', 'PortControlOffice': 'name'})
new

Unnamed: 0,countryTwo,name,Hours,Tel
0,BOTSWANA,Bray,07:00 - 16:00,Tel: (053) 937 0026
1,BOTSWANA,Derdepoort,06:00 - 19:00,Tel: (014) 778 0725
2,BOTSWANA,Groblersbrug,08:00 – 22:00,Tel: (014) 767 1019Fax: (014) 767 1264
3,BOTSWANA,Kopfontein,06:00 - 24:00,Tel: (018) 365 9055Fax: (018) 365 9026
4,BOTSWANA,Makopong,08:00 - 16:00,Tel: Fax:
...,...,...,...,...
68,SOUTH AFRICAN AIRPORTS,Kruger Mpumalanga International Airport,07:00 - 19:00,Tel: (013) 750 2937 Fax: (013) 750 2971
69,SOUTH AFRICAN AIRPORTS,Pilanesberg,07:00 - 19:00,Tel: (014) 552 2320
70,SOUTH AFRICAN AIRPORTS,Port Elizabeth,24 Hours standby,Tel: (041) 404 8323 082 809 5237/38Fax: (041) ...
71,SOUTH AFRICAN AIRPORTS,Upington,24hrs on call-out,Tel: (054) 332 3117/8Standby No.:076 987 3944


In [9]:
df_country_one_sa = pd.DataFrame(columns = ['name', 'countryOne', 'countryTwo', 'type'])

In [10]:
final_df = pd.concat([new, df_country_one_sa])
final_df

Unnamed: 0,countryTwo,name,Hours,Tel,countryOne,type
0,BOTSWANA,Bray,07:00 - 16:00,Tel: (053) 937 0026,,
1,BOTSWANA,Derdepoort,06:00 - 19:00,Tel: (014) 778 0725,,
2,BOTSWANA,Groblersbrug,08:00 – 22:00,Tel: (014) 767 1019Fax: (014) 767 1264,,
3,BOTSWANA,Kopfontein,06:00 - 24:00,Tel: (018) 365 9055Fax: (018) 365 9026,,
4,BOTSWANA,Makopong,08:00 - 16:00,Tel: Fax:,,
...,...,...,...,...,...,...
68,SOUTH AFRICAN AIRPORTS,Kruger Mpumalanga International Airport,07:00 - 19:00,Tel: (013) 750 2937 Fax: (013) 750 2971,,
69,SOUTH AFRICAN AIRPORTS,Pilanesberg,07:00 - 19:00,Tel: (014) 552 2320,,
70,SOUTH AFRICAN AIRPORTS,Port Elizabeth,24 Hours standby,Tel: (041) 404 8323 082 809 5237/38Fax: (041) ...,,
71,SOUTH AFRICAN AIRPORTS,Upington,24hrs on call-out,Tel: (054) 332 3117/8Standby No.:076 987 3944,,


In [11]:
final_df['countryOne'] = 'South Africa'
final_df['type'] = 'Land'

In [12]:
final_df = final_df.drop(columns=['Tel', 'Hours'])

Unnamed: 0,countryTwo,name,countryOne,type
0,BOTSWANA,Bray,South Africa,Land
1,BOTSWANA,Derdepoort,South Africa,Land
2,BOTSWANA,Groblersbrug,South Africa,Land
3,BOTSWANA,Kopfontein,South Africa,Land
4,BOTSWANA,Makopong,South Africa,Land
...,...,...,...,...
68,SOUTH AFRICAN AIRPORTS,Kruger Mpumalanga International Airport,South Africa,Land
69,SOUTH AFRICAN AIRPORTS,Pilanesberg,South Africa,Land
70,SOUTH AFRICAN AIRPORTS,Port Elizabeth,South Africa,Land
71,SOUTH AFRICAN AIRPORTS,Upington,South Africa,Land


In [13]:
df.to_csv('final_df.csv')