In [335]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import json

In [336]:
# <<<
# Extract
# >>>
# Source page for the scrape: English Wikipedia's "List of countries by GDP (nominal)".
# The parentheses in the page title are percent-encoded (%28 / %29).
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"
def getSoup(url, timeout=10):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block the kernel forever.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of silently handing back None,
    # which would only surface later as an AttributeError on the caller side.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
soup = getSoup(url)
# Stop here with a readable message if no document came back.
if soup is None:
    raise RuntimeError(f"Could not download {url}")

# select_one returns None when nothing matches the selector, e.g. if the
# page layout changes; check before dereferencing.
table = soup.select_one('table.wikitable.sortable')
if table is None:
    raise RuntimeError("No 'table.wikitable.sortable' element found on the page")
# Header rows only (rows carrying the 'static-row-header' class).
head = table.select('tr.static-row-header')
# Every <tr> of the table, header rows included.
body = table.find_all('tr')

In [337]:
# Extract the table's column headers from the first header row
if not head:
    raise RuntimeError("No header rows ('tr.static-row-header') were found in the table")

category = ''
organization = []
for i, item in enumerate(head[0].find_all('th')):
    if i == 0:
        # The first header cell labels the country column; its raw text is
        # printed without an extra newline.
        print(item.text, end='')
        category = item.text.strip()
    else:
        # Remaining header cells name the source organizations. Prefer the
        # link text; fall back to the cell text when a header has no link
        # instead of failing on None.
        link = item.find('a')
        name = link.text if link is not None else item.text.strip()
        print(name)
        organization.append(name)

IndentationError: unexpected indent (1143443539.py, line 5)

In [339]:
# Country-to-region lookup table, read from a CSV next to the notebook.
# The table-parsing cell matches countries on its 'name' column and reads
# the 'region' column.
regionDf = pd.read_csv('./region.csv')

In [344]:
# Table Parsing

# Number of leading <tr> elements to skip before the per-country rows start.
# Presumably the two header rows plus a world-total row; verify against the
# live page if the layout changes.
SKIP_ROWS = 3

# Placeholder the table uses when an organization has no figure for a country.
MISSING_MARK = '—'

# Build the name -> region lookup once instead of scanning regionDf for every
# row. drop_duplicates keeps the first occurrence, so a repeated name still
# resolves to its first region.
regionByName = (
    regionDf.drop_duplicates('name')
    .set_index('name')['region']
    .to_dict()
)


def _parseGdpRow(row):
    """Convert one <tr> of the GDP table into ``[name, gdp, year, gdp, year, ...]``.

    GDP figures become ``float`` and years become ``int``. A cell holding only
    the missing-value mark is expanded to a ``0`` figure and a ``0`` year.
    Returns ``None`` when the row has no non-empty cells.
    """
    # Drop footnote markers (<sup>) so they do not end up in the cell text.
    while row.sup is not None:
        row.sup.decompose()

    texts = []
    for cell in row.find_all(['th', 'td'], recursive=False):
        value = cell.text.strip()
        if value == '':
            # Skip empty cells.
            continue
        if value == MISSING_MARK:
            # One placeholder cell stands for both the figure and its year.
            texts += ['0', '0']
        else:
            texts.append(value)

    if not texts:
        return None

    # The first entry is the country name; the rest alternate figure, year.
    parsed = [texts[0]]
    for pos, text in enumerate(texts[1:]):
        number = text.replace(',', '')
        parsed.append(float(number) if pos % 2 == 0 else int(number))
    return parsed


infoAll = []
unmatched = []
for rank, row in enumerate(body):
    if rank < SKIP_ROWS:
        continue
    info = _parseGdpRow(row)
    if info is None:
        continue
    # Attach the region right after the country name. A country missing from
    # region.csv gets None and is reported below instead of raising IndexError.
    region = regionByName.get(info[0])
    if region is None:
        unmatched.append(info[0])
    info.insert(1, region)
    print(info)
    infoAll.append(info)
    print("---")

if unmatched:
    print(f"No region found in region.csv for: {unmatched}")

['United States', 'Americas', 30337162.0, 2025, 27360935.0, 2023, 25744100.0, 2022]
---
['China', 'Asia', 19534894.0, 2025, 17794782.0, 2023, 17963170.0, 2022]
---
['Germany', 'Europe', 4921563.0, 2025, 4456081.0, 2023, 4076923.0, 2022]
---
['Japan', 'Asia', 4389326.0, 2025, 4212945.0, 2023, 4232173.0, 2022]
---
['India', 'Asia', 4271922.0, 2025, 3549919.0, 2023, 3465541.0, 2022]
---
['United Kingdom', 'Europe', 3730261.0, 2025, 3340032.0, 2023, 3089072.0, 2022]
---
['France', 'Europe', 3283429.0, 2025, 3030904.0, 2023, 2775316.0, 2022]
---
['Italy', 'Europe', 2459597.0, 2025, 2254851.0, 2023, 2046952.0, 2022]
---
['Canada', 'Americas', 2330308.0, 2025, 2140086.0, 2023, 2137939.0, 2022]
---
['Brazil', 'Americas', 2307162.0, 2025, 2173666.0, 2023, 1920095.0, 2022]
---
['Russia', 'Europe', 2195708.0, 2025, 2021421.0, 2023, 2240422.0, 2022]
---
['South Korea', 'Asia', 1947133.0, 2025, 1712793.0, 2023, 1673916.0, 2022]
---
['Australia', 'Oceania', 1881140.0, 2025, 1723827.0, 2023, 1776577.

In [338]:
# Build the list of DataFrame column names:
# the country label, a 'region' column, then the labels of the second header row.
headerText = head[1].get_text()
tempColumn = headerText.strip('\n').split('\n')
columnList = [category, 'region', *tempColumn]

In [345]:
# DataFrame holding the figures from every organization, one row per country
gdpDf = pd.DataFrame(data=infoAll, columns=columnList)
gdpDf

Unnamed: 0,Country/Territory,region,Forecast,Year,Estimate,Year.1,Estimate.1,Year.2
0,United States,Americas,30337162.0,2025,27360935.0,2023,25744100.0,2022
1,China,Asia,19534894.0,2025,17794782.0,2023,17963170.0,2022
2,Germany,Europe,4921563.0,2025,4456081.0,2023,4076923.0,2022
3,Japan,Asia,4389326.0,2025,4212945.0,2023,4232173.0,2022
4,India,Asia,4271922.0,2025,3549919.0,2023,3465541.0,2022
...,...,...,...,...,...,...,...,...
204,Kiribati,Oceania,311.0,2024,279.0,2023,223.0,2022
205,Palau,Oceania,308.0,2024,263.0,2023,225.0,2022
206,Marshall Islands,Oceania,305.0,2024,284.0,2023,279.0,2022
207,Nauru,Oceania,161.0,2024,154.0,2023,147.0,2022


In [346]:
# Keep only the IMF figures: the first four columns
# (country, region, Forecast, Year).
# Built as one non-mutating chain so nothing is assigned into a slice of
# gdpDf (which triggers SettingWithCopyWarning) and re-running the cell
# always yields the same result.
gdpImf = (
    gdpDf.iloc[:, :4]
    # Scale the forecast down by 1000 and round to two decimals
    # (presumably millions -> billions of USD; confirm against the source table).
    .assign(Forecast=lambda frame: (frame['Forecast'] / 1000).round(2))
    # Largest economies first, with a fresh 0..n-1 index.
    .sort_values('Forecast', ascending=False)
    .reset_index(drop=True)
)
gdpImf

Unnamed: 0,Country/Territory,region,Forecast,Year
0,United States,Americas,30337.16,2025
1,China,Asia,19534.89,2025
2,Germany,Europe,4921.56,2025
3,Japan,Asia,4389.33,2025
4,India,Asia,4271.92,2025
...,...,...,...,...
204,Bermuda,Americas,0.00,0
205,Monaco,Europe,0.00,0
206,North Korea,Asia,0.00,0
207,Syria,Asia,0.00,0


In [347]:
# Countries whose GDP is 100B or more.
# The stated requirement is "greater than or equal to", so the comparison is
# inclusive; a strict '>' would drop a country sitting exactly on the threshold.
GDP_THRESHOLD = 100  # same unit as the scaled 'Forecast' column
gdpImf[gdpImf['Forecast'] >= GDP_THRESHOLD]

Unnamed: 0,Country/Territory,region,Forecast,Year
0,United States,Americas,30337.16,2025
1,China,Asia,19534.89,2025
2,Germany,Europe,4921.56,2025
3,Japan,Asia,4389.33,2025
4,India,Asia,4271.92,2025
...,...,...,...,...
67,Uzbekistan,Asia,112.65,2024
68,Guatemala,Americas,112.37,2024
69,Oman,Asia,109.99,2024
70,Bulgaria,Europe,108.42,2024


In [417]:
# Mean GDP of the five largest economies in each region
gdpImfGrouped = gdpImf.set_index('region')
# Order rows by region (A-Z), then by forecast from largest to smallest,
# so the first five rows of every region are its top five.
ranked = gdpImfGrouped.sort_values(
    by=['region', 'Forecast'],
    ascending=[True, False],
)
temp = ranked.groupby('region').head(5)['Forecast']
# Average those top-five forecasts per region (the index is the region name).
gdpRegion = temp.groupby(level='region').mean()
gdpRegion

region
Africa       285.184
Americas    7473.330
Asia        6327.178
Europe      3318.112
Oceania      436.658
Name: Forecast, dtype: float64