In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm

In [7]:
cities = pd.read_csv("worldcities.csv")
cities

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37977000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,34540000.0,1360771077
2,Delhi,Delhi,28.6600,77.2300,India,IN,IND,Delhi,admin,29617000.0,1356872604
3,Mumbai,Mumbai,18.9667,72.8333,India,IN,IND,Mahārāshtra,admin,23355000.0,1356226629
4,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,23088000.0,1608618140
...,...,...,...,...,...,...,...,...,...,...,...
26564,Nord,Nord,81.7166,-17.8000,Greenland,GL,GRL,Sermersooq,,10.0,1304217709
26565,Timmiarmiut,Timmiarmiut,62.5333,-42.2167,Greenland,GL,GRL,Kujalleq,,10.0,1304206491
26566,Cheremoshna,Cheremoshna,51.3894,30.0989,Ukraine,UA,UKR,Kyyivs’ka Oblast’,,0.0,1804043438
26567,Ambarchik,Ambarchik,69.6510,162.3336,Russia,RU,RUS,Sakha (Yakutiya),,0.0,1643739159


In [8]:
wiki_base_url = "https://en.wikipedia.org/wiki/"
# for saving the areas of the cities
city_areas = {}

### Crawling the Areas

1. Problem: The box on the right city of each of these city Wikipedia articles is a html table and contains rows and areas. The problem here is, that there are sometimes multiple entries for the area, denoting different area properties. In order to solve this problem, we will simply take the largest of these entries.

In [15]:


# go through every city
for city in tqdm(cities["city"]):
    if not city in city_areas.keys():
        # get the html
        page = requests.get(wiki_base_url + city.replace(" ", "_"))

        # transform to soup object
        soup = BeautifulSoup(page.content, 'html.parser')

        # find the table with all information (right, top of wikipedia article)
        table = soup.find('table', class_='infobox geography vcard')


        # if no table was found, something wrent wong and we insert None and deal with it later
        if not table:
            print("Not Found: ", city)
            city_areas[city] = None
        else:
            # for saving all tale entries of one webpage that containg km²
            temporary_list = []

            # get all rows of the table on the right of the wikipedia article
            rows = table.find_all("tr")

            # go over all rows
            for indx, row in enumerate(rows):
                # get all the cells withing the rows
                cells = row.find_all('td')
                # go through each cell
                for cell in cells:
                    # finding all km² entries and putting them into the temporary list, together with 
                    # the index of the corresponding row
                    # also delete the comma that separates thousand from hundreds
                    if "km2" in cell.text:
                        temporary_list.append((indx, float(re.sub("[^0-9.]", "", re.findall(r"\d{1,6}\.?\d{0,2}", cell.text.replace(",", ""))[0]))))
                        if not float(re.sub("[^0-9.]", "", re.findall(r"\d{1,6}\.?\d{0,2}", cell.text.replace(",", ""))[0])):
                            print("problem", city, ":", cell.text)

            temporary_list_2 = []
            found_indx = False
            for indx, cell_v in temporary_list:

                if not found_indx:
                    found_indx = indx
                    temporary_list_2.append(cell_v)
                else:
                    if indx != found_indx + 1:
                        break
                    else:
                        temporary_list_2.append(cell_v)
                        found_indx += 1
            if not temporary_list_2:
                city_areas[city] = None
                continue
            city_areas[city] = max(temporary_list_2)


  2%|▏         | 484/26569 [00:00<00:21, 1210.03it/s]

Not Found:  Yucheng
Not Found:  Ximeicun
Not Found:  Jianguang
Not Found:  Xushan
Not Found:  Huangshan
Not Found:  Huazhou
problem Shīrāz : 0 km2 (0 sq mi)  0%


  2%|▏         | 511/26569 [00:16<1:51:46,  3.89it/s]

Not Found:  Kawanakajima
Not Found:  Lianjiang


  2%|▏         | 516/26569 [00:18<3:10:21,  2.28it/s]

Not Found:  Birstall


  2%|▏         | 524/26569 [00:22<4:07:14,  1.76it/s]

Not Found:  Farīdābād


  2%|▏         | 533/26569 [00:27<2:29:12,  2.91it/s]

Not Found:  Yutan


  2%|▏         | 539/26569 [00:30<3:43:44,  1.94it/s]

Not Found:  Adelaide


  2%|▏         | 543/26569 [00:31<2:26:03,  2.97it/s]

Not Found:  Córdoba


  2%|▏         | 545/26569 [00:32<2:24:32,  3.00it/s]

Not Found:  Juárez


  2%|▏         | 552/26569 [00:36<3:00:22,  2.40it/s]

Not Found:  Saitama


  2%|▏         | 555/26569 [00:37<2:24:37,  3.00it/s]

Not Found:  Yushu


  2%|▏         | 556/26569 [00:37<2:43:31,  2.65it/s]

Not Found:  Rongcheng


  2%|▏         | 558/26569 [00:38<2:14:04,  3.23it/s]

Not Found:  Haicheng


  2%|▏         | 561/26569 [00:38<2:11:12,  3.30it/s]

Not Found:  Huaiyin


  2%|▏         | 562/26569 [00:39<2:38:57,  2.73it/s]

Not Found:  Wuzhong


  2%|▏         | 564/26569 [00:40<3:35:35,  2.01it/s]

Not Found:  Thāne


  2%|▏         | 569/26569 [00:43<4:45:17,  1.52it/s]

Not Found:  Yangshe


  2%|▏         | 571/26569 [00:44<3:58:39,  1.82it/s]

Not Found:  Dhanbād


  2%|▏         | 575/26569 [00:45<3:26:29,  2.10it/s]

Not Found:  Dayan


  2%|▏         | 584/26569 [00:49<3:13:40,  2.24it/s]

Not Found:  Beidao


  2%|▏         | 586/26569 [00:50<3:44:57,  1.93it/s]

Not Found:  Shuangshui


  2%|▏         | 594/26569 [00:55<3:07:02,  2.31it/s]

Not Found:  Providence


  2%|▏         | 612/26569 [01:04<3:38:52,  1.98it/s]

Not Found:  Guankou


  2%|▏         | 618/26569 [01:07<3:39:44,  1.97it/s]

Not Found:  Kaiyuan


  2%|▏         | 629/26569 [01:12<2:22:12,  3.04it/s]

Not Found:  Yingchuan


  2%|▏         | 634/26569 [01:14<2:50:43,  2.53it/s]

Not Found:  Nezahualcóyotl


  2%|▏         | 658/26569 [01:24<2:12:06,  3.27it/s]

Not Found:  Yicheng


  3%|▎         | 666/26569 [01:28<2:02:57,  3.51it/s]

Not Found:  Richmond
Not Found:  São Luís


  3%|▎         | 669/26569 [01:29<2:46:35,  2.59it/s]

Not Found:  Memphis


  3%|▎         | 671/26569 [01:30<2:56:58,  2.44it/s]

Not Found:  Bezwāda


  3%|▎         | 673/26569 [01:31<3:15:41,  2.21it/s]

Not Found:  Xishan


  3%|▎         | 683/26569 [01:35<3:52:02,  1.86it/s]

Not Found:  Cartagena


  3%|▎         | 690/26569 [01:40<4:01:46,  1.78it/s]

Not Found:  E’zhou


  3%|▎         | 697/26569 [01:43<3:27:11,  2.08it/s]

Not Found:  Shubrā al Khaymah


  3%|▎         | 716/26569 [01:52<2:33:14,  2.81it/s]

Not Found:  Shuizhai
Not Found:  Kota


  3%|▎         | 745/26569 [02:11<5:01:57,  1.43it/s]

Not Found:  Kingston


  3%|▎         | 750/26569 [02:20<9:47:26,  1.37s/it] 

Not Found:  Kitchener


  3%|▎         | 755/26569 [02:24<5:37:17,  1.28it/s]

Not Found:  Canberra


  3%|▎         | 759/26569 [02:25<3:31:30,  2.03it/s]

Not Found:  Halifax


  3%|▎         | 764/26569 [02:27<2:41:32,  2.66it/s]

Not Found:  Victoria


  3%|▎         | 770/26569 [02:29<2:00:41,  3.56it/s]

Not Found:  San José


  3%|▎         | 773/26569 [02:31<2:47:15,  2.57it/s]

Not Found:  Nassau


  3%|▎         | 779/26569 [02:33<2:16:10,  3.16it/s]

Not Found:  Georgetown


  3%|▎         | 782/26569 [02:35<3:29:15,  2.05it/s]

Not Found:  Male


  3%|▎         | 793/26569 [02:41<3:13:46,  2.22it/s]

Not Found:  Moroni


  3%|▎         | 808/26569 [02:49<4:04:28,  1.76it/s]

Not Found:  Tarawa


  3%|▎         | 813/26569 [02:51<3:14:04,  2.21it/s]

Not Found:  Saint John’s


  3%|▎         | 823/26569 [02:55<3:54:53,  1.83it/s]

Not Found:  Saint George’s


  3%|▎         | 826/26569 [02:58<5:08:23,  1.39it/s]

Not Found:  Capitol Hill


  3%|▎         | 831/26569 [03:01<4:26:21,  1.61it/s]

Not Found:  Kitaku


  3%|▎         | 839/26569 [03:06<3:57:02,  1.81it/s]

Not Found:  Chiba


  3%|▎         | 840/26569 [03:06<3:51:30,  1.85it/s]

Not Found:  Danyang


  3%|▎         | 841/26569 [03:07<3:18:46,  2.16it/s]

Not Found:  Natal


  3%|▎         | 844/26569 [03:08<3:34:24,  2.00it/s]

Not Found:  Xibeijie


  3%|▎         | 846/26569 [03:09<2:44:17,  2.61it/s]

Not Found:  Huilong


  3%|▎         | 847/26569 [03:10<3:23:41,  2.10it/s]

Not Found:  Tongjin


  3%|▎         | 853/26569 [03:13<1:36:59,  4.42it/s]

Not Found:  Luocheng





ConnectionError: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Guw%C4%81h%C4%81ti (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f31ea6a61f0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [5]:
print(city_areas)

{}
