# import packages
Let's start by importing the following packages!
* requests
* BeautifulSoup
* seaborn
* matplotlib
* pandas
* re

In [1]:
# import package
import requests
from bs4 import BeautifulSoup
import seaborn as sns
import matplotlib.pyplot as plt
# % matplotlib inline
import pandas as pd
import re

# crawler THSR information
Let's start to crawl the imformation of each station of THSR!
* Target web: [THSR Homepage](https://www.thsrc.com.tw)  
* Choose each station:
    1. Homepage -> Travel Information -> Stations
    2. choose different stations by changing url
* Task:
    1. crawler the name, address, operation hours and ticketing hours of each station
    2. organize into a DataFrame

In [2]:
# setting each station name and their url
url="https://www.thsrc.com.tw/"
response=requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

step1 = soup.find_all('ul',class_='menu-lv3')[6]
step2=step1.find_all('li')

# ['南港站', '台北站', '板橋站', '桃園站', '新竹站', '苗栗站', '台中站', '彰化站', '雲林站', '嘉義站', '台南站', '左營站']
station_name_list=[]
for names in step2:
    station_name_list.append(names.find('a').string)
print(station_name_list)

station_url_list=[]
for each_urls in step2:
    station_url_list.append(each_urls.select_one("a").get("href"))
print(station_url_list)

['南港站', '台北站', '板橋站', '桃園站', '新竹站', '苗栗站', '台中站', '彰化站', '雲林站', '嘉義站', '台南站', '左營站']
['/ArticleContent/2f940836-cedc-41ef-8e28-c2336ac8fe68', '/ArticleContent/977abb69-413a-4ccf-a109-0272c24fd490', '/ArticleContent/e6e26e66-7dc1-458f-b2f3-71ce65fdc95f', '/ArticleContent/fbd828d8-b1da-4b06-a3bd-680cdca4d2cd', '/ArticleContent/a7a04c89-900b-4798-95a3-c01c455622f4', '/ArticleContent/e8fc2123-2aaf-46ff-ad79-51d4002a1ef3', '/ArticleContent/3301e395-46b8-47aa-aa37-139e15708779', '/ArticleContent/38b8c40b-aef0-4d66-b257-da96ec51620e', '/ArticleContent/5f4c7bb0-c676-4e39-8d3c-f12fc188ee5f', '/ArticleContent/60831846-f0e4-47f6-9b5b-46323ebdcef7', '/ArticleContent/9c5ac6ca-ec89-48f8-aab0-41b738cb1814', '/ArticleContent/f2519629-5973-4d08-913b-479cce78a356']


In [3]:
station_address_list=[]
for urls in station_url_list:
    response=requests.get('https://www.thsrc.com.tw'+urls)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    address=soup.find(class_='google-map-link orange')
    station_address_list.append(address.text)
print(station_address_list)

['台北市南港區南港路一段313號', '台北市北平西路3號', '新北市板橋區縣民大道二段7號', '桃園市中壢區高鐵北路一段6號', '新竹縣竹北市高鐵七路6號', '苗栗縣後龍鎮高鐵三路268號', '台中市烏日區站區二路8號', '彰化縣田中鎮站區路二段99號', '雲林縣虎尾鎮站前東路301號', '嘉義縣太保市高鐵西路168號', '台南市歸仁區歸仁大道100號', '高雄市左營區高鐵路105號']


In [4]:
# crawler the operation hours and ticketing hours of each station
station_operation_list=[]
station_ticketing_list=[]

for urls in station_url_list:
    response=requests.get('https://www.thsrc.com.tw'+urls)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    operation1=soup.find_all(class_='gray')[0]
    operation2=operation1.string
    re_operation=operation2.replace('營業時間：','')
    
    ticketing1=soup.find_all(class_='gray')[1]
    ticketing2=ticketing1.string
    re_ticketing=ticketing2.replace('售票時間：','')
    
    station_operation_list.append(re_operation)
    station_ticketing_list.append(re_ticketing)
    
print(station_operation_list)
print(station_ticketing_list)


['05:50 ~ 24:00', '06:00 ~ 24:00', '06:00 ~ 24:00', '06:20 ~ 23:45', '06:15 ~ 23:45', '06:05 ~ 23:45', '05:40 ~ 24:00', '06:20 ~ 24:00', '06:15 ~ 23:45', '06:00 ~ 23:45', '05:45 ~ 24:00', '05:25 ~ 24:00']
['05:50 ~ 末班車發車時間', '06:00 ~ 末班車發車時間', '06:00 ~ 末班車發車時間', '06:20 ~ 末班車發車時間 ', '06:15 ~ 末班車發車時間', '06:05 ~ 末班車發車時間', '05:40 ~ 末班車發車時間', '06:20~末班車發車時間', '06:15~末班車發車時間', '06:00~末班車發車時間', '05:45~末班車發車時間 ', '05:25~末班車發車時間 ']


In [5]:
# create a DataFrame
df_station=pd.DataFrame(station_name_list,columns=['station'])
df_address=pd.DataFrame(station_address_list,columns=['address'])
df_operation_hours=pd.DataFrame(station_operation_list,columns=['operation_hours'])
df_ticketing_hours=pd.DataFrame(station_ticketing_list,columns=['ticketing_hours'])

df_all=pd.concat([df_station,df_address,df_operation_hours,df_ticketing_hours],axis=1)
df_all

Unnamed: 0,station,address,operation_hours,ticketing_hours
0,南港站,台北市南港區南港路一段313號,05:50 ~ 24:00,05:50 ~ 末班車發車時間
1,台北站,台北市北平西路3號,06:00 ~ 24:00,06:00 ~ 末班車發車時間
2,板橋站,新北市板橋區縣民大道二段7號,06:00 ~ 24:00,06:00 ~ 末班車發車時間
3,桃園站,桃園市中壢區高鐵北路一段6號,06:20 ~ 23:45,06:20 ~ 末班車發車時間
4,新竹站,新竹縣竹北市高鐵七路6號,06:15 ~ 23:45,06:15 ~ 末班車發車時間
5,苗栗站,苗栗縣後龍鎮高鐵三路268號,06:05 ~ 23:45,06:05 ~ 末班車發車時間
6,台中站,台中市烏日區站區二路8號,05:40 ~ 24:00,05:40 ~ 末班車發車時間
7,彰化站,彰化縣田中鎮站區路二段99號,06:20 ~ 24:00,06:20~末班車發車時間
8,雲林站,雲林縣虎尾鎮站前東路301號,06:15 ~ 23:45,06:15~末班車發車時間
9,嘉義站,嘉義縣太保市高鐵西路168號,06:00 ~ 23:45,06:00~末班車發車時間


# Crawler GDP and CPI
Let's start to crawl GDP and CPI!
* Target web: 
    1. [Wiki GDP](https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal))  
    2. [Wiki CPI](https://en.wikipedia.org/wiki/Corruption_Perceptions_Index)
* Task:
    1. crawler GDP table (top 50: United States ~ New Zealand)
    2. organize into a DataFrame (columns: Country, Region, IMF_Estimate, IMF_Year, United_Nations_Estimate, United_Nations_Year, World_Bank_Estimate, World_Bank_Year)
    3. crawler CPI table which contains country and 2020 CPI (top 100: Denmark ~ Suriname)
    4. organize into a DataFrame (columns: Country, CPI_2020)
    5. merge GDP(DataFrame) and CPI(DataFrame), based on Country of GDP
    6. plot and text the names of GDP top 10 countrys

In [37]:
# crawler GDP
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
whole_table=soup.find('table',class_='static-row-numbers')

gdp_country=[]
region=[]
imf_estimate=[]
imf_year=[]
united_nations_estimate=[]
united_nations_year=[]
world_bank_estimate=[]
world_bank_year=[]

# td=whole_table.find_all('td')
# td

# 八個td一個循環
# 8*50=400
# https://docs.python.org/zh-tw/3/tutorial/introduction.html
for i in range(0,400,8):
    td=whole_table.find_all('td')[i]
    # 去掉\xa0
    gdp_country.append(td.text[1:])
# print(gdp_country)
    
for i in range(1,400,8):
    td=whole_table.find_all('td')[i]
    region.append(td.text)

for i in range(2,400,8):
    td=whole_table.find_all('td')[i]
    imf_estimate.append(td.text)

for i in range(3,400,8):
    td=whole_table.find_all('td')[i]
    # 去掉[n 4]
    imf_year.append(td.text[-4:])
# print(imf_year)
    
for i in range(4,400,8):
    td=whole_table.find_all('td')[i]
    united_nations_estimate.append(td.text)

for i in range(5,400,8):
    td=whole_table.find_all('td')[i]
    united_nations_year.append(td.text[-4:])
    
for i in range(6,400,8):
    td=whole_table.find_all('td')[i]
    world_bank_estimate.append(td.text)
    
for i in range(7,400,8):
    td=whole_table.find_all('td')[i]
    # 去掉最後\n
    world_bank_year.append(td.text[:4])
# print(world_bank_year)


# gdp_dict={'country':[],'region':[],'imf_estimate':[],'imf_year':[],'united_nations_estimate':[],
#           'united_nations_year':[],'world_bank_estimate':[],'world_bank_year':[]}
# # 返回字典所有的鍵
# keys = list(gdp_dict.keys())

# # 所有表格內容
# gdp_table=soup.find_all(['table'],class_='static-row-numbers')[0]
# gdp_content=gdp_table.find_all('td')

# for i in gdp_content:
# #     print(i.text.strip())
#     i.text.strip()


# for i ,content in enumerate(gdp_content):
#         gdp_dict[keys[i % 8]].extend(re.findall('[\w\,]+',content.text))
# print(gdp_dict)    


In [38]:
# table = soup.find_all(['table','tbody','tr'],class_='static-row-numbers')[0]
# rows = table.find_all('tr')

# region=[]
# # # List of all links
# for row in rows:
#     cells = row.find_all('td')
#     if len(cells) > 1:
#         country_link = cells[1].find('a')
#         region.append(country_link)
        
# for i in range(0,50):
#     print(region[i].string)


In [39]:
# create GDP DataFrame
df_gdp_country=pd.DataFrame(gdp_country,columns=['Country'])
df_region=pd.DataFrame(region,columns=['Region'])
df_imf_estimate=pd.DataFrame(imf_estimate,columns=['IMF_Estimate'])
df_imf_year=pd.DataFrame(imf_year,columns=['IMF_Year'])
df_united_nations_estimate=pd.DataFrame(united_nations_estimate,columns=['United_Nations_Estimate'])
df_united_nations_year=pd.DataFrame(united_nations_year,columns=['United_Nations_Year'])
df_world_bank_estimate=pd.DataFrame(world_bank_estimate,columns=['World_Bank_Estimate'])
df_world_bank_year=pd.DataFrame(world_bank_year,columns=['World_Bank_Year'])

df_gdp=pd.concat([df_gdp_country,df_region,df_imf_estimate,df_imf_year,df_united_nations_estimate,
                 df_united_nations_year,df_world_bank_estimate,df_world_bank_year],axis=1)
df_gdp

Unnamed: 0,Country,Region,IMF_Estimate,IMF_Year,United_Nations_Estimate,United_Nations_Year,World_Bank_Estimate,World_Bank_Year
0,United States,Americas,22675271,2021,21433226,2019,20936600,2020
1,China,Asia,16642318,2021,14342933,2019,14722731,2020
2,Japan,Asia,5378136,2021,5082465,2019,4975415,2020
3,Germany,Europe,4319286,2021,3861123,2019,3806060,2020
4,United Kingdom,Europe,3124650,2021,2826441,2019,2707744,2020
5,India,Asia,3049704,2021,2891582,2019,2622984,2020
6,France,Europe,2938271,2021,2715518,2019,2603004,2020
7,Italy,Europe,2106287,2021,2003576,2019,1886445,2020
8,Canada,Americas,1883487,2021,1741496,2019,1643408,2020
9,South Korea,Asia,1806707,2021,1646539,2019,1630525,2020


In [53]:
# crawler CPI
url = 'https://en.wikipedia.org/wiki/Corruption_Perceptions_Index'
response = requests.get(url)

cpi_dict={'Rank':[],'CPI':[],'CPI_2020':[]}
keys = list(cpi_dict.keys())

soup = BeautifulSoup(response.text, 'html.parser')
table_soup=soup.find_all(['table'])[4]
# table_content=table_soup.find_all(['td'])
# table_content
cpi_country=[]
cpi_2020=[]

for i in range(1,1900,19):
    td=table_soup.find_all('td')[i]
    cpi_country.append(td.text[1:])
# print(cpi_country)

for i in range(2,1900,19):
    td=table_soup.find_all('td')[i]
    cpi_2020.append(td.text)
# print(cpi_2020)

# # 20個一個循環
# for i in table_content:
#     print(i.text.strip())
    
# for i ,content in enumerate(table_content):
#     if i % 19 <3:
#         cpi_dict[keys[i % 19]].extend(re.findall('[\w]+',content.text))
# print(cpi_dict)

In [54]:
# # CPI country
# cpi_table=soup.find('table',{'class','wikitable sortable'})
# cpi_country=cpi_table.find_all('a')
# cpi_country_list=[]
# for i in cpi_country:
#     cpi_country_list.append(i.get("title"))

# for i in range(10,110):
#     print(cpi_country_list[i])

In [109]:
# CPI DataFrame
# 抓下來再做資料清洗
df_cpi_country=pd.DataFrame(cpi_country,columns=['Country'])
df_cpi_2020=pd.DataFrame(cpi_2020,columns=['CPI_2020'])

df_cpi=pd.concat([df_cpi_country,df_cpi_2020],axis=1)
df_cpi

Unnamed: 0,Country,CPI_2020
0,Denmark,88
1,New Zealand,88
2,Finland,85
3,Singapore,85
4,Sweden,85
...,...,...
95,Kazakhstan,38
96,Peru,38
97,Serbia,38
98,Sri Lanka,38


In [60]:
# merge GDP and CPI DataFrame
# 依照GDP的國家merge
df_merge=df_gdp.merge(df_cpi)
df_merge

Unnamed: 0,Country,Region,IMF_Estimate,IMF_Year,United_Nations_Estimate,United_Nations_Year,World_Bank_Estimate,World_Bank_Year,CPI_2020
0,United States,Americas,22675271,2021,21433226,2019,20936600,2020,67
1,China,Asia,16642318,2021,14342933,2019,14722731,2020,42
2,Japan,Asia,5378136,2021,5082465,2019,4975415,2020,74
3,Germany,Europe,4319286,2021,3861123,2019,3806060,2020,80
4,United Kingdom,Europe,3124650,2021,2826441,2019,2707744,2020,77
5,India,Asia,3049704,2021,2891582,2019,2622984,2020,40
6,France,Europe,2938271,2021,2715518,2019,2603004,2020,69
7,Italy,Europe,2106287,2021,2003576,2019,1886445,2020,53
8,Canada,Americas,1883487,2021,1741496,2019,1643408,2020,77
9,South Korea,Asia,1806707,2021,1646539,2019,1630525,2020,61


In [104]:
# plot
# 用seaborn畫，標出前十名GDP的點
# CPI用2020，GDP也用2020(world_bank_year)

int_list=[]
int_list.append(df_merge['World_Bank_Estimate'].str.replace(',','').astype(int))
new_world_bank_estimate=[x/10000000 for x in int_list]

new_cpi_2020=[]
new_cpi_2020.append(df_merge['CPI_2020'])

df_new_world_bank_estimate=pd.DataFrame(new_world_bank_estimate,columns=['World_Bank_Estimate'])
df_new_cpi_2020=pd.DataFrame(new_cpi_2020,columns=['CPI_2020'])
new_cpi_2020

# df_plot=pd.concat([df_new_world_bank_estimate,df_new_cpi_2020],axis=1)
# df_plot
# sns.scatterplot(x=df_cpi_2020, y=df_new_world_bank_estimate, hue=df_merge['Region'])

# df_cpi_country=pd.DataFrame(cpi_country,columns=['Country'])
# df_cpi_2020=pd.DataFrame(cpi_2020,columns=['CPI_2020'])

# df_cpi=pd.concat([df_cpi_country,df_cpi_2020],axis=1)
# df_cpi

[0     67
 1     42
 2     74
 3     80
 4     77
 5     40
 6     69
 7     53
 8     77
 9     61
 10    77
 11    38
 12    62
 13    82
 14    53
 15    40
 16    65
 17    56
 18    85
 19    76
 20    76
 21    72
 22    60
 23    84
 24    42
 25    71
 26    88
 27    51
 28    85
 29    77
 30    44
 31    67
 32    85
 33    39
 34    44
 35    54
 36    61
 37    88
 Name: CPI_2020, dtype: object]