In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## 1. Get raw data

In [11]:
def crawl(page, data, timeout=10):
    """Scrape one page of the geokeo.com Vietnam state listing into ``data``.

    Downloads ``https://geokeo.com/database/state/vn/{page}/`` and, for every
    ``<tr>`` element in the returned HTML, appends a list holding the stripped
    text of that row's ``<td>`` cells. A ``<tr>`` without any ``<td>`` cell
    (presumably a header row -- confirm against the page markup) is appended
    as an empty list, so callers should expect blank rows and filter them.

    Args:
        page: Page number substituted into the listing URL.
        data: List that is extended in place with one list of strings per row.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The same ``data`` list that was passed in.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    url = f"https://geokeo.com/database/state/vn/{page}/"
    # Without a timeout, requests can block indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into zero or junk rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    for row in soup.find_all("tr"):
        data.append([cell.text.strip() for cell in row.find_all("td")])
    return data

# Scrape listing pages 1 through 4; crawl() appends each page's rows to `data`.
data = []
for page in range(1, 5):
    crawl(page, data)

# Label the six scraped cells of each row; rows that came back empty from
# crawl() end up as all-null records here.
df = pd.DataFrame(
    data,
    columns=["Number", "Province", "Country", "Latitude", "Longitude", "Other"],
)
df


Unnamed: 0,Number,Province,Country,Latitude,Longitude,Other
0,,,,,,
1,1,An Giang province,Vietnam,10.514902499466372,105.11317919999999,"""ref""=>""AG"", ""name""=>""An Giang"", ""name:en""=>""A..."
2,2,Bac Giang province,Vietnam,21.309286300408615,106.61651279999998,"""ref""=>""BG"", ""name""=>""Bắc Giang"", ""name:en""=>""..."
3,3,Bắc Kạn province,Vietnam,22.257170100404753,105.85889579999998,"""ref""=>""BK"", ""name""=>""Bắc Kạn"", ""name:en""=>""Bắ..."
4,4,Bac Lieu,Vietnam,9.347731699411282,105.50970639999998,"""name""=>""Bạc Liêu"", ""name:en""=>""Bac Lieu"", ""na..."
...,...,...,...,...,...,...
60,58,Tra Vinh,Vietnam,9.770460599427402,106.35638059999998,"""name""=>""Trà Vinh"", ""name:en""=>""Tra Vinh"", ""na..."
61,59,Tuyen Quang province,Vietnam,22.125675500406818,105.2089725,"""ref""=>""TQ"", ""name""=>""Tuyên Quang"", ""name:en""=..."
62,60,Vinh Phuc Province,Vietnam,21.31135600040864,105.60329439999998,"""name""=>""Vĩnh Phúc"", ""name:en""=>""Vinh Phuc Pro..."
63,,,,,,


## 2. Cleaning data

In [None]:
# Count the missing values in each column
df.isna().sum()

Number       4
Province     4
Country      4
Latitude     4
Longitude    4
Other        4
dtype: int64

In [None]:
# Drop every row containing at least one missing value (modifies df in place);
# these are the empty rows crawl() appends for <tr> elements without <td> cells.
df.dropna(axis=0, how="any", inplace=True)

In [14]:
# Keep only the province name and its coordinates (modifies df in place)
df.drop(["Number", "Country", "Other"], axis=1, inplace=True)

In [16]:
# Inspect the frame after dropping the blank rows and the unused columns
df

Unnamed: 0,Province,Latitude,Longitude
1,An Giang province,10.514902499466372,105.11317919999999
2,Bac Giang province,21.309286300408615,106.61651279999998
3,Bắc Kạn province,22.257170100404753,105.85889579999998
4,Bac Lieu,9.347731699411282,105.50970639999998
5,Bac Ninh Province,21.121205100406353,106.0880245
...,...,...,...
59,Thua Thien–Hue province,16.337537400062946,107.55637079999998
60,Tra Vinh,9.770460599427402,106.35638059999998
61,Tuyen Quang province,22.125675500406818,105.2089725
62,Vinh Phuc Province,21.31135600040864,105.60329439999998


In [17]:
# Remove the literal text " Province" / " province" from the names.
# regex=False keeps the match literal regardless of the pandas version's default.
df["Province"] = (
    df["Province"]
    .str.replace(" Province", "", regex=False)
    .str.replace(" province", "", regex=False)
)

In [18]:
# Inspect the Province column after the "Province"/"province" text was removed
df

Unnamed: 0,Province,Latitude,Longitude
1,An Giang,10.514902499466372,105.11317919999999
2,Bac Giang,21.309286300408615,106.61651279999998
3,Bắc Kạn,22.257170100404753,105.85889579999998
4,Bac Lieu,9.347731699411282,105.50970639999998
5,Bac Ninh,21.121205100406353,106.0880245
...,...,...,...
59,Thua Thien–Hue,16.337537400062946,107.55637079999998
60,Tra Vinh,9.770460599427402,106.35638059999998
61,Tuyen Quang,22.125675500406818,105.2089725
62,Vinh Phuc,21.31135600040864,105.60329439999998


## 3. Export file

In [19]:
# Write the cleaned table to disk; index=False leaves the row index out of the file
df.to_csv(path_or_buf="vietnam_provinces.csv", index=False)

In [7]:
# NOTE(review): pandas is already imported in the first cell; this repeat only
# matters when the cell is run on its own in a fresh kernel.
import pandas as pd

# Load the exported CSV back in. This rebinds `df`, replacing the scraped frame
# with the file's contents and a fresh 0-based index.
df = pd.read_csv("vietnam_provinces.csv")
df

Unnamed: 0,Province,Latitude,Longitude
0,An Giang,10.514902,105.113179
1,Bac Giang,21.309286,106.616513
2,Bắc Kạn,22.257170,105.858896
3,Bac Lieu,9.347732,105.509706
4,Bac Ninh,21.121205,106.088025
...,...,...,...
56,Thua Thien–Hue,16.337537,107.556371
57,Tra Vinh,9.770461,106.356381
58,Tuyen Quang,22.125676,105.208973
59,Vinh Phuc,21.311356,105.603294


In [8]:
# Print every province name on its own line so the full column can be
# checked by eye (the DataFrame display truncates the middle rows).
pro = df['Province']
for i in pro:
    print(i)

An Giang
Bac Giang
Bắc Kạn
Bac Lieu
Bac Ninh
Ba Ria-Vung Tau
Ben Tre
Ben Tre
Binh Dinh
Binh Duong
Binh Duong
Binh Phuoc
Binh Phuoc
Binh Thuan
Ca Mau
Can Tho
Cao Bằng
Dak Lak
Dak Nong
Dien Bien
Dong Nai
Dong Tháp
Gia Dinh
Gia Lai
Ha Giang
Hai Duong
Ha Nam
Ha Tinh
Hau Giang
Hoa Binh
Hung Yen
Khanh Hoa
Kien Giang
Kon Tum
Lai Chau
Lam Dong
Lang Son
Lao Cai
Nam Dinh
Nghe An
Ninh Binh
Ninh Thuan
Phu Tho
Phu Yen
Quang Binh
Quang Nam
Quang Ngai
Quang Ninh
Quang Tri
Sa Dec
Soc Trang
Son La
Tay Ninh
Thai Binh
Thai Nguyen
Thanh Hoa
Thua Thien–Hue
Tra Vinh
Tuyen Quang
Vinh Phuc
Yen Bai


In [22]:
# test this api https://api.open-meteo.com/v1/forecast?latitude=10.823&longitude=106.6296&hourly=temperature_2m,relative_humidity_2m,rain,wind_speed_10m,wind_gusts_10m,visibility,dew_point_2m,wind_direction_10m,cloud_cover

import requests
import json

def get_weather(latitude, longitude, timeout=10):
    """Fetch a one-day hourly forecast from the Open-Meteo API.

    Requests ``temperature_2m``, ``relative_humidity_2m``, ``rain``,
    ``wind_speed_10m``, ``wind_gusts_10m``, ``visibility``, ``dew_point_2m``,
    ``wind_direction_10m`` and ``cloud_cover`` with ``forecast_days=1``.

    Args:
        latitude: Latitude placed into the query string as-is.
        longitude: Longitude placed into the query string as-is.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    url = (
        "https://api.open-meteo.com/v1/forecast"
        f"?latitude={latitude}&longitude={longitude}"
        "&hourly=temperature_2m,relative_humidity_2m,rain,wind_speed_10m,"
        "wind_gusts_10m,visibility,dew_point_2m,wind_direction_10m,cloud_cover"
        "&forecast_days=1"
    )
    # Without a timeout, requests can block indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Surface API errors here rather than as a confusing missing-key error later.
    response.raise_for_status()
    return response.json()

# Sample coordinates used to try out the request
latitude, longitude = 10.823, 106.6296
weather = get_weather(latitude, longitude)

# Keep only the "hourly" section of the response and display it
hourly = weather["hourly"]
hourly


{'time': ['2025-03-11T00:00',
  '2025-03-11T01:00',
  '2025-03-11T02:00',
  '2025-03-11T03:00',
  '2025-03-11T04:00',
  '2025-03-11T05:00',
  '2025-03-11T06:00',
  '2025-03-11T07:00',
  '2025-03-11T08:00',
  '2025-03-11T09:00',
  '2025-03-11T10:00',
  '2025-03-11T11:00',
  '2025-03-11T12:00',
  '2025-03-11T13:00',
  '2025-03-11T14:00',
  '2025-03-11T15:00',
  '2025-03-11T16:00',
  '2025-03-11T17:00',
  '2025-03-11T18:00',
  '2025-03-11T19:00',
  '2025-03-11T20:00',
  '2025-03-11T21:00',
  '2025-03-11T22:00',
  '2025-03-11T23:00'],
 'temperature_2m': [25.8,
  27.3,
  29.0,
  30.8,
  32.1,
  33.0,
  33.0,
  33.3,
  33.0,
  30.6,
  29.4,
  29.0,
  28.5,
  28.2,
  27.7,
  27.1,
  26.5,
  26.2,
  25.9,
  25.7,
  25.6,
  25.6,
  25.5,
  25.3],
 'relative_humidity_2m': [91,
  84,
  77,
  65,
  59,
  54,
  56,
  54,
  56,
  65,
  74,
  76,
  81,
  82,
  81,
  81,
  83,
  85,
  86,
  87,
  88,
  88,
  89,
  89],
 'rain': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0

In [23]:
# Peek at the second hourly temperature reading (index 1)
da = hourly['temperature_2m']
print(da[1])

27.3


In [24]:
# Number of hourly temperature readings in the response
len(hourly['temperature_2m'])

24