In [1]:
import unicodedata
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Source page for the prefecture table.
wikiurl = "https://en.wikipedia.org/wiki/Prefectures_of_Japan"
# NOTE(review): table_class is not referenced by any visible cell; kept in
# case something outside this view relies on it.
table_class = "wikitable sortable jquery-tablesorter"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(wikiurl, timeout=30)
# Fail here on a 4xx/5xx reply instead of parsing an error page further down.
response.raise_for_status()
print(response.status_code)

200


In [3]:
# Build a BeautifulSoup tree from the downloaded HTML and pick the first
# <table> element carrying the "wikitable" CSS class.
soup = BeautifulSoup(response.text, "html.parser")
kentable = soup.find("table", class_="wikitable")

In [4]:
# pandas deprecated passing a literal HTML string to read_html (2.1+), so the
# markup is wrapped in a file-like StringIO object.
tables = pd.read_html(StringIO(str(kentable)))
# read_html returns a list of DataFrames; kentable is a single <table>, so the
# first entry is the prefecture table.
df = tables[0]
print(df.head())

  Prefecture Prefecture.1    Capital Capital.1   Region Major Island  \
0      Aichi          愛知県     Nagoya      名古屋市    Chūbu       Honshū   
1      Akita          秋田県      Akita       秋田市   Tōhoku       Honshū   
2     Aomori          青森県     Aomori       青森市   Tōhoku       Honshū   
3      Chiba          千葉県      Chiba       千葉市    Kantō       Honshū   
4      Ehime          愛媛県  Matsuyama       松山市  Shikoku      Shikoku   

   Population(December 2022)  Area(km2)[15]  Density(per km2)  Distr.  \
0                    7571000        5173.07            1458.0       7   
1                    1011000       11637.52              82.4       6   
2                    1250000        9645.64             128.3       8   
3                    6311000        5157.57            1218.5       6   
4                    1353000        5676.19             235.2       7   

   Municipalities    ISO Areacode  
0              54  JP-23      052  
1              25  JP-05      018  
2              40  J

In [5]:
# Keep only the descriptive columns. Selecting what we need (instead of
# dropping the rest by name) avoids a KeyError when volatile headers such as
# "Area(km2)[15]" or "Population(December 2022)" change on the source page.
wanted_columns = ["Prefecture", "Prefecture.1", "Capital", "Capital.1", "Region", "Major Island"]
# rename the duplicated headers that hold the Japanese spellings
data = df[wanted_columns].rename(
    columns={"Prefecture.1": "Prefecture_JPN", "Capital.1": "Capital_JPN"}
)
print(data.head())

  Prefecture Prefecture_JPN    Capital Capital_JPN   Region Major Island
0      Aichi            愛知県     Nagoya        名古屋市    Chūbu       Honshū
1      Akita            秋田県      Akita         秋田市   Tōhoku       Honshū
2     Aomori            青森県     Aomori         青森市   Tōhoku       Honshū
3      Chiba            千葉県      Chiba         千葉市    Kantō       Honshū
4      Ehime            愛媛県  Matsuyama         松山市  Shikoku      Shikoku


In [6]:
# Work in progress: swap the long-vowel (macron) romanisations for their
# plain-ASCII spellings in the Prefecture column.
long_vowel_fixes = {
    "Hyōgo": "Hyogo",
    "Kōchi": "Kochi",
    "Kyōto": "Kyoto",
    "Ōita": "Oita",
    "Ōsaka": "Osaka",
    "Tōkyō": "Tokyo",
}
data["Prefecture"] = data["Prefecture"].replace(long_vowel_fixes)

data["Prefecture"]

0         Aichi
1         Akita
2        Aomori
3         Chiba
4         Ehime
5         Fukui
6       Fukuoka
7     Fukushima
8          Gifu
9         Gunma
10    Hiroshima
11     Hokkaido
12        Hyogo
13      Ibaraki
14     Ishikawa
15        Iwate
16       Kagawa
17    Kagoshima
18     Kanagawa
19        Kochi
20     Kumamoto
21        Kyoto
22          Mie
23       Miyagi
24     Miyazaki
25       Nagano
26     Nagasaki
27         Nara
28      Niigata
29         Oita
30      Okayama
31      Okinawa
32        Osaka
33         Saga
34      Saitama
35        Shiga
36      Shimane
37     Shizuoka
38      Tochigi
39    Tokushima
40        Tokyo
41      Tottori
42       Toyama
43     Wakayama
44     Yamagata
45    Yamaguchi
46    Yamanashi
Name: Prefecture, dtype: object

In [6]:
# Visit history keyed by the macron-free prefecture spelling, so the lookup
# agrees with the Prefecture column after long vowels have been replaced.
_VISIT_BY_PREFECTURE = {
    "Hokkaido": "Yes",
    "Aomori": "No",
    "Iwate": "No",
    "Miyagi": "No",
    "Akita": "No",
    "Yamagata": "No",
    "Fukushima": "No",
    "Ibaraki": "No",
    "Tochigi": "No",
    "Gunma": "No",
    "Saitama": "Yes",
    "Chiba": "Yes",
    "Tokyo": "Yes",
    "Kanagawa": "Yes",
    "Niigata": "No",
    "Toyama": "No",
    "Ishikawa": "No",
    "Fukui": "No",
    "Yamanashi": "No",
    "Nagano": "Yes",
    "Gifu": "No",
    "Shizuoka": "No",
    "Aichi": "Yes",
    "Mie": "No",
    "Shiga": "No",
    "Kyoto": "Yes",
    "Osaka": "Yes",
    "Hyogo": "No",
    "Nara": "Yes",
    "Wakayama": "No",
    "Tokushima": "No",
    "Kagawa": "No",
    "Ehime": "No",
    "Kochi": "No",
    "Fukuoka": "No",
    "Saga": "No",
    "Nagasaki": "Yes",
    "Kumamoto": "Yes",
    "Oita": "No",
    "Miyazaki": "Yes",
    "Kagoshima": "No",
    "Okinawa": "No",
    "Hiroshima": "Yes",
    "Okayama": "No",
    "Shimane": "No",
    "Tottori": "No",
    "Yamaguchi": "Yes",
}


def _strip_macrons(text):
    """Return ``text`` with combining marks removed (e.g. "Tōkyō" -> "Tokyo")."""
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def assign_visit_dict(Prefecture):
    """Look up whether a prefecture has been visited.

    The name is matched regardless of macrons, so both "Tōkyō" and "Tokyo"
    resolve to the same entry. Previously the table was keyed by the macron
    spellings only, which returned 'Unknown' once the Prefecture column had
    been converted to plain ASCII.

    Parameters
    ----------
    Prefecture : str
        Romanised prefecture name, with or without long-vowel macrons.

    Returns
    -------
    str
        "Yes" or "No" for a known prefecture; 'Unknown' for any other value,
        including non-string input such as NaN.
    """
    # Missing values (NaN/None) cannot be normalised; report them as unknown.
    if not isinstance(Prefecture, str):
        return 'Unknown'
    return _VISIT_BY_PREFECTURE.get(_strip_macrons(Prefecture), 'Unknown')
# Add a Visit_hx column by looking up each prefecture name.
data['Visit_hx'] = data['Prefecture'].map(assign_visit_dict)

In [8]:
# Show the first five rows of `data` as plain text.
print(data.head(n=5))

  Prefecture Prefecture_JPN    Capital Capital_JPN   Region Major Island  \
0      Aichi            愛知県     Nagoya        名古屋市    Chūbu       Honshū   
1      Akita            秋田県      Akita         秋田市   Tōhoku       Honshū   
2     Aomori            青森県     Aomori         青森市   Tōhoku       Honshū   
3      Chiba            千葉県      Chiba         千葉市    Kantō       Honshū   
4      Ehime            愛媛県  Matsuyama         松山市  Shikoku      Shikoku   

  Visit_hx  
0      Yes  
1       No  
2       No  
3      Yes  
4       No  


In [7]:
# Write the final table to an Excel workbook in the working directory,
# leaving out the DataFrame index column.
data.to_excel("pref_visited.xlsx", index=False)