In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## 1. Get raw data

In [11]:
def crawl(page, data, timeout=10):
    """Scrape one listing page and append its table rows to ``data``.

    Args:
        page: Page number interpolated into the listing URL.
        data: List that receives one entry per ``<tr>`` element; each entry
            is the list of stripped ``<td>`` texts of that row. The list is
            mutated in place.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The same ``data`` list that was passed in.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    url = f"https://geokeo.com/database/state/vn/{page}/"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as a table with no rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    for row in soup.find_all("tr"):
        # A row without <td> cells contributes an empty list; those rows are
        # kept here and show up as all-null rows once loaded into a DataFrame.
        data.append([cell.text.strip() for cell in row.find_all("td")])
    return data

# Gather the rows of listing pages 1 through 4; crawl() appends to the list in place.
data = []
for page in (1, 2, 3, 4):
    crawl(page, data)

# Label the scraped cells; rows that came back empty are filled with nulls.
df = pd.DataFrame(
    data=data,
    columns=["Number", "Province", "Country", "Latitude", "Longitude", "Other"],
)
df


Unnamed: 0,Number,Province,Country,Latitude,Longitude,Other
0,,,,,,
1,1,An Giang province,Vietnam,10.514902499466372,105.11317919999999,"""ref""=>""AG"", ""name""=>""An Giang"", ""name:en""=>""A..."
2,2,Bac Giang province,Vietnam,21.309286300408615,106.61651279999998,"""ref""=>""BG"", ""name""=>""Bắc Giang"", ""name:en""=>""..."
3,3,Bắc Kạn province,Vietnam,22.257170100404753,105.85889579999998,"""ref""=>""BK"", ""name""=>""Bắc Kạn"", ""name:en""=>""Bắ..."
4,4,Bac Lieu,Vietnam,9.347731699411282,105.50970639999998,"""name""=>""Bạc Liêu"", ""name:en""=>""Bac Lieu"", ""na..."
...,...,...,...,...,...,...
60,58,Tra Vinh,Vietnam,9.770460599427402,106.35638059999998,"""name""=>""Trà Vinh"", ""name:en""=>""Tra Vinh"", ""na..."
61,59,Tuyen Quang province,Vietnam,22.125675500406818,105.2089725,"""ref""=>""TQ"", ""name""=>""Tuyên Quang"", ""name:en""=..."
62,60,Vinh Phuc Province,Vietnam,21.31135600040864,105.60329439999998,"""name""=>""Vĩnh Phúc"", ""name:en""=>""Vinh Phuc Pro..."
63,,,,,,


## 2. Clean data

In [None]:
# Count the missing entries in each column
df.isna().sum()

Number       4
Province     4
Country      4
Latitude     4
Longitude    4
Other        4
dtype: int64

In [None]:
# Drop every row that contains at least one missing value (df is modified in place)
df.dropna(axis=0, how="any", inplace=True)

In [14]:
# Remove the columns that are not needed downstream, keeping the remaining ones.
# errors="ignore" makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
df.drop(columns=["Number", "Country", "Other"], inplace=True, errors="ignore")

In [16]:
# Display df so the effect of the preceding cleaning cells can be inspected
df

Unnamed: 0,Province,Latitude,Longitude
1,An Giang province,10.514902499466372,105.11317919999999
2,Bac Giang province,21.309286300408615,106.61651279999998
3,Bắc Kạn province,22.257170100404753,105.85889579999998
4,Bac Lieu,9.347731699411282,105.50970639999998
5,Bac Ninh Province,21.121205100406353,106.0880245
...,...,...,...
59,Thua Thien–Hue province,16.337537400062946,107.55637079999998
60,Tra Vinh,9.770460599427402,106.35638059999998
61,Tuyen Quang province,22.125675500406818,105.2089725
62,Vinh Phuc Province,21.31135600040864,105.60329439999998


In [17]:
# Remove the literal text " Province" / " province" from the 'Province' column.
# regex=False is passed explicitly because the default of `regex` differs
# between pandas versions; both patterns are plain substrings, not regexes.
df["Province"] = (
    df["Province"]
    .str.replace(" Province", "", regex=False)
    .str.replace(" province", "", regex=False)
)

In [18]:
# Display df to inspect the 'Province' column after the text replacement
df

Unnamed: 0,Province,Latitude,Longitude
1,An Giang,10.514902499466372,105.11317919999999
2,Bac Giang,21.309286300408615,106.61651279999998
3,Bắc Kạn,22.257170100404753,105.85889579999998
4,Bac Lieu,9.347731699411282,105.50970639999998
5,Bac Ninh,21.121205100406353,106.0880245
...,...,...,...
59,Thua Thien–Hue,16.337537400062946,107.55637079999998
60,Tra Vinh,9.770460599427402,106.35638059999998
61,Tuyen Quang,22.125675500406818,105.2089725
62,Vinh Phuc,21.31135600040864,105.60329439999998


## 3. Export file

In [19]:
# Write the frame to a CSV file; index=False leaves the row labels out of the file
df.to_csv(path_or_buf="vietnam_provinces.csv", index=False)

In [7]:
import pandas as pd

# Read the exported CSV back into df and display it
df = pd.read_csv(filepath_or_buffer="vietnam_provinces.csv")
df

Unnamed: 0,Province,Latitude,Longitude
0,An Giang,10.514902,105.113179
1,Bac Giang,21.309286,106.616513
2,Bắc Kạn,22.257170,105.858896
3,Bac Lieu,9.347732,105.509706
4,Bac Ninh,21.121205,106.088025
...,...,...,...
56,Thua Thien–Hue,16.337537,107.556371
57,Tra Vinh,9.770461,106.356381
58,Tuyen Quang,22.125676,105.208973
59,Vinh Phuc,21.311356,105.603294
