In [15]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pathlib import Path

In [3]:
cwd = Path.cwd()
html_directory = cwd / 'html'

if not html_directory.exists():
    html_directory.mkdir()

### Get response and save to a file

In [14]:
# Get the HTML and cache it in local disk
url = 'https://en.wikipedia.org/wiki/list_of_companies_of_Japan'
output_file = html_directory / 'wikipedia_page.html'

if not output_file.exists():
    html_content = requests.get(url).content
    html_content = html_content.decode()
    output_file.write_text(html_content, encoding='UTF-8')

output_file.read_text()

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of companies of Japan - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"dffa3b4a-084b-4919-bacb-8fc4595e4869","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_companies_of_Japan","wgTitle":"List of companies of Japan","wgCurRevisionId":1056573292,"wgRevisionId":1056573292,"wgArticleId":76562,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Lists of companies by country

### Parse HTML page

In [18]:
soup = BeautifulSoup(output_file.read_text(), 'html.parser')
type(soup)

bs4.BeautifulSoup

In [22]:
tables = soup.find_all(class_='wikitable')
print(len(tables))
tables

26


[<table class="wikitable">
 <tbody><tr>
 <th>Rank</th>
 <th>Company</th>
 <th>Revenue ($ million)
 </th></tr>
 <tr>
 <td>6</td>
 <td><a class="mw-redirect" href="/wiki/Toyota_Motor" title="Toyota Motor">Toyota Motor</a></td>
 <td>$265,172
 </td></tr>
 <tr>
 <td>30</td>
 <td><a class="mw-redirect" href="/wiki/Honda_Motor" title="Honda Motor">Honda Motor</a></td>
 <td>$138,646
 </td></tr>
 <tr>
 <td>45</td>
 <td><a href="/wiki/Japan_Post_Holdings" title="Japan Post Holdings">Japan Post Holdings</a></td>
 <td>$116,616
 </td></tr>
 <tr>
 <td>54</td>
 <td><a class="mw-redirect" href="/wiki/Nissan_Motor" title="Nissan Motor">Nissan Motor</a></td>
 <td>$107,868
 </td></tr>
 <tr>
 <td>55</td>
 <td><a class="mw-redirect" href="/wiki/Nippon_Telegraph_%26_Telephone" title="Nippon Telegraph &amp; Telephone">Nippon Telegraph &amp; Telephone</a></td>
 <td>$106,500
 </td></tr>
 <tr>
 <td>79</td>
 <td><a href="/wiki/Hitachi" title="Hitachi">Hitachi</a></td>
 <td>$84,559
 </td></tr>
 <tr>
 <td>85</td>


In [46]:
filtered_table_list = []
target_column_names = ['English', 'Japanese', 'Rōmaji', 'TSE']

print(f'Unfiltered table list: {len(tables)}')

for table in tables:
    column_names = table.find_all('th')
    column_names = [c.text.strip() for c in column_names]
    column_names = set(column_names)

    if column_names == set(target_column_names):
        filtered_table_list.append(table)

print(f'Filtered table list: {len(filtered_table_list)}')

Unfiltered table list: 26
Filtered table list: 24


In [95]:
# Get all the data points we need from the rows
company_list = []

for table in filtered_table_list:
    row_list = table.find_all('tr')[1:]
    for row in row_list:
        cells = row.find_all('td')
        cells = [c.get_text().strip() for c in cells]

        def get_company_dict(cells):
            def get_value(idx):
                try:
                    return cells[idx]
                except IndexError:
                    return None

            try:
                company = {
                    'English': get_value(0),
                    'Japanese': get_value(1),
                    'Romaji': get_value(2),
                    'TSE': get_value(3),
                }
                return company
            except IndexError as e:
                print(cells)
                print(str(e))

        company_list.append(get_company_dict(cells))

print(len(company_list))
company_list

601


[{'English': '77 Bank',
  'Japanese': '株式会社七十七銀行',
  'Romaji': 'Kabushiki-gaisha Shichijūshichi Ginkō',
  'TSE': 'TYO: 8341'},
 {'English': '81 Produce',
  'Japanese': '81プロデュース',
  'Romaji': 'Eitiwan Purodyūsu',
  'TSE': ''},
 {'English': 'Acom',
  'Japanese': 'アコム株式会社',
  'Romaji': 'Akomu Kabushiki-gaisha',
  'TSE': 'TYO: 8572'},
 {'English': 'Advantest',
  'Japanese': '株式会社アドバンテスト',
  'Romaji': 'Kabushiki-gaisha Adobantesuto',
  'TSE': 'TYO: 6857'},
 {'English': 'Æon',
  'Japanese': 'イオン株式会社',
  'Romaji': 'Ion Kabushiki-gaisha',
  'TSE': 'TYO: 8267'},
 {'English': 'AGC',
  'Japanese': 'AGC',
  'Romaji': 'Ei Jī Shī',
  'TSE': 'TYO: 5201'},
 {'English': 'Aichi Bank',
  'Japanese': '株式会社愛知銀行',
  'Romaji': 'Kabushiki-gaisha Aichi Ginkō',
  'TSE': 'TYO: 8527'},
 {'English': 'Aichi Steel',
  'Japanese': '愛知製鋼株式会社',
  'Romaji': 'Aichi Seikō Kabushiki-gaisha',
  'TSE': 'TYO: 5482'},
 {'English': 'Aiful',
  'Japanese': 'アイフル株式会社',
  'Romaji': 'Aifuru Kabushiki-gaisha',
  'TSE': 'TYO: 8515'},

### Convert company list to a pandas dataframe

In [97]:
df = pd.DataFrame(company_list, dtype='object')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   English   601 non-null    object
 1   Japanese  601 non-null    object
 2   Romaji    601 non-null    object
 3   TSE       598 non-null    object
dtypes: object(4)
memory usage: 18.9+ KB


Unnamed: 0,English,Japanese,Romaji,TSE
0,77 Bank,株式会社七十七銀行,Kabushiki-gaisha Shichijūshichi Ginkō,TYO: 8341
1,81 Produce,81プロデュース,Eitiwan Purodyūsu,
2,Acom,アコム株式会社,Akomu Kabushiki-gaisha,TYO: 8572
3,Advantest,株式会社アドバンテスト,Kabushiki-gaisha Adobantesuto,TYO: 6857
4,Æon,イオン株式会社,Ion Kabushiki-gaisha,TYO: 8267


Clean data

In [98]:
df['TSE'] = df['TSE'].str.replace('TYO: ', '').str.strip()
df.head()

Unnamed: 0,English,Japanese,Romaji,TSE
0,77 Bank,株式会社七十七銀行,Kabushiki-gaisha Shichijūshichi Ginkō,8341.0
1,81 Produce,81プロデュース,Eitiwan Purodyūsu,
2,Acom,アコム株式会社,Akomu Kabushiki-gaisha,8572.0
3,Advantest,株式会社アドバンテスト,Kabushiki-gaisha Adobantesuto,6857.0
4,Æon,イオン株式会社,Ion Kabushiki-gaisha,8267.0


In [99]:
df.to_csv('japanese_companies_wiki.csv', index=False, encoding='UTF-8')
print('Finished')

Finished
