In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [9]:
#headers to simulate browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

In [10]:
#get request with headers
url = "https://visaguide.world/digital-nomad-visa/digital-nomad-index/"
response = requests.get(url, headers=headers)

In [11]:
print("Status Code:", response.status_code)

Status Code: 200


In [12]:
if response.status_code == 200:
    print("Page fetched successfully!")
else:
    print("Failed to retrieve the page")

Page fetched successfully!


In [13]:
soup = BeautifulSoup(response.content, 'html.parser')

In [14]:
table = soup.find('table', {'class': 'tablepress'})

In [15]:
data = []
for row in table.find_all('tr')[1:]:
    cells = row.find_all('td')
    if len(cells) > 1:  # Ensure there is data in the row
        rank = cells[0].text.strip()
        country = cells[1].text.strip()
        taxes = cells[2].text.strip()
        tax_free_period = cells[3].text.strip()
        internet_speed = cells[4].text.strip()
        minimum_income = cells[5].text.strip()
        living_cost = cells[6].text.strip()
        health_score = cells[7].text.strip()
        digital_nomad_score = cells[8].text.strip()
        
        # Append the extracted data to the list
        data.append([rank, country, taxes, tax_free_period, internet_speed,
                     minimum_income, living_cost, health_score, digital_nomad_score])

# Step 5: Create a pandas DataFrame
visa_nomads_df = pd.DataFrame(data, columns=["Rank", "Country", "Taxes", "Tax-free period", 
                                 "Internet Speed", "Minimum income", "Living cost", 
                                 "Health Score", "Digital Nomad Score"])


In [16]:
visa_nomads_df.head()

Unnamed: 0,Rank,Country,Taxes,Tax-free period,Internet Speed,Minimum income,Living cost,Health Score,Digital Nomad Score
0,1st,Spain,15%,Six months,248.25,2140.0,673.7,60.9,4.5
1,2nd,Argentina,0%,Full visa length,111.23,2500.0,418.5,54.4,3.78
2,3rd,Romania,0%,Full visa length,260.97,3300.0,539.7,45.7,3.74
3,4th,UAE,0%,Full visa length,256.04,5000.0,917.0,39.6,3.65
4,5th,Croatia,0%,Full visa length,109.27,2539.31,686.8,48.8,3.62


In [17]:
countries = [
    'Portugal', 'Estonia', 'Georgia', 'Spain', 'United States', 
    'Thailand', 'Serbia', 'Vietnam', 'Philippines', 'Mexico', 
    'Colombia', 'Uruguay', 'Argentina'
]

In [18]:
visa_nomads_new_df = visa_nomads_df[visa_nomads_df['Country'].isin(countries)]
visa_nomads_new_df

Unnamed: 0,Rank,Country,Taxes,Tax-free period,Internet Speed,Minimum income,Living cost,Health Score,Digital Nomad Score
0,1st,Spain,15%,Six months,248.25,2140,673.7,60.9,4.5
1,2nd,Argentina,0%,Full visa length,111.23,2500,418.5,54.4,3.78
5,6th,Portugal,20%,Six months,205.11,3548,618.0,54.7,3.58
6,7th,Uruguay,0%,Full visa length,177.74,Not required,812.1,40.3,3.55
13,14th,Mexico,30%,No tax free period,77.07,2595,587.2,57.0,3.15
19,20th,Estonia,20%,No tax free period,103.48,3500,795.5,55.5,3.01
23,24th,Georgia,20%,Six months,30.28,2000,566.5,52.6,2.89
32,33rd,Colombia,19%,Six months,125.86,684,439.2,53.2,2.61


In [19]:
visa_nomads_new_df.columns

Index(['Rank', 'Country', 'Taxes', 'Tax-free period', 'Internet Speed',
       'Minimum income', 'Living cost', 'Health Score', 'Digital Nomad Score'],
      dtype='object')

In [20]:
visa_nomads_clean_df=visa_nomads_new_df.drop(columns=['Rank',
        'Living cost', 'Health Score', 'Digital Nomad Score'])


In [21]:
visa_nomads_clean_df = visa_nomads_clean_df.sort_values('Country', ascending=True)
visa_nomads_clean_df= visa_nomads_clean_df.reset_index(drop=True)
visa_nomads_clean_df

Unnamed: 0,Country,Taxes,Tax-free period,Internet Speed,Minimum income
0,Argentina,0%,Full visa length,111.23,2500
1,Colombia,19%,Six months,125.86,684
2,Estonia,20%,No tax free period,103.48,3500
3,Georgia,20%,Six months,30.28,2000
4,Mexico,30%,No tax free period,77.07,2595
5,Portugal,20%,Six months,205.11,3548
6,Spain,15%,Six months,248.25,2140
7,Uruguay,0%,Full visa length,177.74,Not required


In [22]:
# New rows to add
new_rows = [
    {'Country': 'Philippines', 'Taxes': '0%', 'Tax-free period': 'Full visa length', 'Internet Speed': 88, 'Minimum income': 2000},
    {'Country': 'Vietnam', 'Taxes': None, 'Tax-free period': None, 'Internet Speed': 70, 'Minimum income': None},  
    {'Country': 'Thailand', 'Taxes': '5-35%', 'Tax-free period': 'Six months', 'Internet Speed': 100, 'Minimum income': None},  
    {'Country': 'Serbia', 'Taxes': '10%', 'Tax-free period': 'Six months', 'Internet Speed': 69, 'Minimum income': 3500}
]

In [23]:
updated_nomads_df= pd.concat([visa_nomads_clean_df, pd.DataFrame(new_rows)], ignore_index=True)
updated_nomads_df

Unnamed: 0,Country,Taxes,Tax-free period,Internet Speed,Minimum income
0,Argentina,0%,Full visa length,111.23,2500
1,Colombia,19%,Six months,125.86,684
2,Estonia,20%,No tax free period,103.48,3500
3,Georgia,20%,Six months,30.28,2000
4,Mexico,30%,No tax free period,77.07,2595
5,Portugal,20%,Six months,205.11,3548
6,Spain,15%,Six months,248.25,2140
7,Uruguay,0%,Full visa length,177.74,Not required
8,Philippines,0%,Full visa length,88.0,2000.0
9,Vietnam,,,70.0,


In [24]:
visa_nomads_updated_df = updated_nomads_df.sort_values('Country', ascending=True)
visa_nomads_updated_df

Unnamed: 0,Country,Taxes,Tax-free period,Internet Speed,Minimum income
0,Argentina,0%,Full visa length,111.23,2500
1,Colombia,19%,Six months,125.86,684
2,Estonia,20%,No tax free period,103.48,3500
3,Georgia,20%,Six months,30.28,2000
4,Mexico,30%,No tax free period,77.07,2595
8,Philippines,0%,Full visa length,88.0,2000.0
5,Portugal,20%,Six months,205.11,3548
11,Serbia,10%,Six months,69.0,3500.0
6,Spain,15%,Six months,248.25,2140
10,Thailand,5-35%,Six months,100.0,


In [25]:
visa_nomads_updated_df.columns = visa_nomads_updated_df.columns.str.lower().str.replace(' ', '_')

In [26]:
visa_nomads_updated_df.to_csv('../data/clean/nomads_tax_data.csv', index=False)