## Data Collection and Data Cleaning

### Finding data for Graz, Austria

#### Installing and importing Beautiful Soup for web scraping

In [1]:
!pip install beautifulsoup4



In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

__Web scraping__

The website https://www.graz.at/cms/beitrag/10034856/7769112/Die_Bezirke.html lists the post codes, areas and populations of all boroughs of the city of Graz, Austria.


In [3]:
# Download the Graz district page.
response = requests.get(
    "https://www.graz.at/cms/beitrag/10034856/7769112/Die_Bezirke.html",
    timeout=30,  # do not hang the notebook forever if the site is unreachable
)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page later on.
response.raise_for_status()
# Raw HTML bytes; kept under the name `url` because the next cell passes it to BeautifulSoup.
url = response.content

In [4]:
# Build the parse tree of the downloaded page with the 'html.parser' backend.
soup = BeautifulSoup(markup=url, features='html.parser')

In [5]:
# All <div> elements matched by the class string "txtblock-content standard".
table = soup.find_all('div', attrs={'class': "txtblock-content standard"})

In [6]:
# Take the first matching <div> and keep only its first <ol> element.
table = table[0].ol

In [7]:
# Every <li> of the ordered list is one entry of the district list.
lis = table.find_all("li")
# Target list for the colon-separated pieces of each entry.
gplz_list = []

# Peek at the first entry to see how the text after the colon is tokenised.
a = lis[0].text.strip().split(":")
a[1].split(" ")

['', '1,16', 'Quadratkilometer,', '3.899', '(3.933)', 'EinwohnerInnen']

In [8]:
# Split the stripped text of every list item on ":" and collect the pieces.
gplz_list.extend(item.text.strip().split(":") for item in lis)


In [9]:
# Split every colon-separated entry into two parallel lists.
# Comprehensions keep the loop variable local, so it cannot clash with the
# `plz` list that is created in the following cell.
plz_city = [entry[0] for entry in gplz_list]  # text before the colon
area_num = [entry[1] for entry in gplz_list]  # text after the colon
# Only the last expression of a cell is displayed, so show just that one.
area_num

[' 1,16 Quadratkilometer, 3.899 (3.933) EinwohnerInnen',
 ' 1,83 Quadratkilometer, 16.235 (16.123) EinwohnerInnen',
 ' 5,50 Quadratkilometer, 25.300 (24.990) EinwohnerInnen',
 ' 3,70 Quadratkilometer, 30.966 (30.891) EinwohnerInnen',
 ' 5,05 Quadratkilometer, 28.735 (27.732) EinwohnerInnen',
 ' 4,06 Quadratkilometer, 33.283 (33.082) EinwohnerInnen',
 ' 7,99 Quadratkilometer, 14.417 (14.170) EinwohnerInnen',
 ' 8,86 Quadratkilometer, 15.139 (14.937) EinwohnerInnen',
 ' 4,48 Quadratkilometer, 11.906 (11.869) EinwohnerInnen',
 ' 10,16 Quadratkilometer,\xa0 5.910 (5.886) EinwohnerInnen',
 ' 13,99 Quadratkilometer, 9.756 (9.647) EinwohnerInnen',
 ' 18,47 Quadratkilometer, 19.197 (19.022) EinwohnerInnen',
 ' 10,83 Quadratkilometer, 11.129 (10.900) EinwohnerInnen',
 ' 7,79 Quadratkilometer, 20.553 (20.075) EinwohnerInnen',
 ' 5,77 Quadratkilometer, 15.630 (15.215) EinwohnerInnen',
 ' 11,75 Quadratkilometer, 16.003 (15.590) EinwohnerInnen',
 ' 6,18 Quadratkilometer, 8.628 (8.417) EinwohnerInne

In [10]:
# The first four characters of each entry are taken as the post code,
# the remainder (whitespace-stripped) as the borough name.
plz = [entry[:4] for entry in plz_city]
borough = [entry[4:].strip() for entry in plz_city]

In [11]:
# Tokenise each description string on single spaces.
area_num = [text.split(" ") for text in area_num]
# Target lists for the parsed populations and areas.
population = []
area = []

In [12]:
# Getting the area of each borough.
# Token 1 holds the area written with a decimal comma (e.g. '1,16').
# The list is rebuilt from scratch, so re-running this cell cannot append duplicates.
area = [float(tokens[1].replace(",", ".")) for tokens in area_num]
area

[1.16,
 1.83,
 5.5,
 3.7,
 5.05,
 4.06,
 7.99,
 8.86,
 4.48,
 10.16,
 13.99,
 18.47,
 10.83,
 7.79,
 5.77,
 11.75,
 6.18]

In [13]:
# Getting the population of each borough.
# Token 3 holds the population with '.' as thousands separator (e.g. '3.899').
# The list is rebuilt from scratch, so re-running this cell cannot append duplicates.
population = [int(tokens[3].replace(".", "")) for tokens in area_num]
population

[3899,
 16235,
 25300,
 30966,
 28735,
 33283,
 14417,
 15139,
 11906,
 5910,
 9756,
 19197,
 11129,
 20553,
 15630,
 16003,
 8628]

### Making Pandas Dataframe from Scraped Data

In [14]:
# One row per borough, assembled from the four parallel lists scraped above.
graz = pd.DataFrame(
    {
        'Post Code': plz,
        'Borough': borough,
        'Area [km2]': area,
        'Population': population,
    }
)

In [15]:
graz

Unnamed: 0,Post Code,Borough,Area [km2],Population
0,8010,Innere Stadt,1.16,3899
1,8010,St. Leonhard,1.83,16235
2,8010,Geidorf,5.5,25300
3,8020,Lend,3.7,30966
4,8020,Gries,5.05,28735
5,8010,Jakomini,4.06,33283
6,8041,Liebenau,7.99,14417
7,8042,St. Peter,8.86,15139
8,8010,Waltendorf,4.48,11906
9,8010,Ries,10.16,5910


### Getting latitudes and longitudes of boroughs in Graz

__Installing and importing geopy library__

In [16]:
!pip install geopy



__From geopy I use Nominatim to get latitude and longitude__

In [17]:
import geopy
from geopy.geocoders import Nominatim

In [18]:
# Geocoder used for all address lookups in this notebook.
locator = Nominatim(user_agent="myGeocoder")
# Example lookup: locator.geocode("Gries, Graz, Austria")

In [19]:
latitude = []
longitude = []

# Geocode every borough. A borough the geocoder cannot resolve gets None in
# both lists, so the lists stay aligned with `borough`.
for bor in borough:
    query = bor + ", Graz, Austria"
    print(query)
    location = locator.geocode(query)
    if location is not None:  # identity test is the idiomatic None check
        latitude.append(location.latitude)
        longitude.append(location.longitude)
    else:
        latitude.append(None)
        longitude.append(None)

Innere Stadt, Graz, Austria
St. Leonhard, Graz, Austria
Geidorf, Graz, Austria
Lend, Graz, Austria
Gries, Graz, Austria
Jakomini, Graz, Austria
Liebenau, Graz, Austria
St. Peter, Graz, Austria
Waltendorf, Graz, Austria
Ries, Graz, Austria
Mariatrost, Graz, Austria
Andritz, Graz, Austria
Gösting, Graz, Austria
Eggenberg, Graz, Austria
Wetzelsdorf, Graz, Austria
Straßgang, Graz, Austria
Puntigam, Graz, Austria


__Adding latitude and longitude to the pandas dataframe__

In [20]:
# Attach the geocoded coordinates. Both lists were filled while iterating over
# `borough`, the same list the 'Borough' column was built from.
graz['Latitude'] = latitude
graz['Longitude'] = longitude

In [21]:
graz

Unnamed: 0,Post Code,Borough,Area [km2],Population,Latitude,Longitude
0,8010,Innere Stadt,1.16,3899,47.074261,15.438466
1,8010,St. Leonhard,1.83,16235,47.068287,15.456344
2,8010,Geidorf,5.5,25300,47.084668,15.442896
3,8020,Lend,3.7,30966,47.079675,15.420325
4,8020,Gries,5.05,28735,47.061222,15.42737
5,8010,Jakomini,4.06,33283,47.059623,15.444707
6,8041,Liebenau,7.99,14417,47.040169,15.449265
7,8042,St. Peter,8.86,15139,47.058701,15.469985
8,8010,Waltendorf,4.48,11906,47.067741,15.477172
9,8010,Ries,10.16,5910,47.088113,15.49718


## Now I need to find all the data for Stuttgart, Germany

__Web scraping with beautifulsoup__

In [22]:
# Download the page that lists the Stuttgart post codes.
response = requests.get(
    "https://www.suche-postleitzahl.org/stuttgart-plz-70173-70629.608e",
    timeout=30,  # do not hang the notebook forever if the site is unreachable
)
# Stop here on a 4xx/5xx answer instead of parsing an error page.
response.raise_for_status()
# Raw HTML bytes; kept under the name `url` because the next cell reads it.
url = response.content

The pandas library has a function `read_html`, which returns a list containing one dataframe for each table found in the HTML.

In [23]:
# Parse the tables in the downloaded HTML; the next cell selects one of them by index.
stuttgart_plz = pd.read_html(url)

In [24]:
# Keep the second parsed table (index 1). Later cells read its 'Stadtteil' and
# 'Postleitzahl' columns.
# NOTE(review): the index depends on the page layout - confirm if the site changes.
stuttgart_plz = stuttgart_plz[1]

In [25]:
# Download the German Wikipedia list of Stuttgart's boroughs.
response = requests.get(
    "https://de.wikipedia.org/wiki/Liste_der_Stadtbezirke_und_Stadtteile_von_Stuttgart",
    timeout=30,  # do not hang the notebook forever if the site is unreachable
)
# Stop here on a 4xx/5xx answer instead of parsing an error page.
response.raise_for_status()
# Raw HTML bytes; kept under the name `url` because the next cell passes it to BeautifulSoup.
url = response.content

In [26]:
# Parse the Wikipedia page and collect the tables matched by the class string below.
soup = BeautifulSoup(markup=url, features='html.parser')
table = soup.find_all('table', attrs={'class': "wikitable sortable mw-datatable"})

In [27]:
# All rows of the first matching table.
trs = table[0].find_all('tr')

# One list of cell texts per row; a row without any <td> yields an empty list.
stutt = [[td.text for td in tr.find_all("td")] for tr in trs]

# Target lists for borough name, population and area (in hectares).
s_borough = []
s_pop = []
s_area_ha = []

In [28]:
# Rows without <td> cells (empty lists) carry no data and are skipped.
data_rows = [st for st in stutt if len(st) != 0]

# The lists are rebuilt from scratch on every run, so re-executing this cell
# cannot duplicate entries.
s_borough = [st[1] for st in data_rows]
s_pop = [st[2].replace(".", "") for st in data_rows]       # remove the '.' separators
s_area_ha = [st[3].replace(",", ".") for st in data_rows]  # decimal comma -> decimal point

In [29]:
len(s_borough)

23

In [30]:
len(s_pop)

23

In [31]:
len(s_area_ha)

23

In [32]:
# One row per borough; the values are still strings at this point.
stuttgart = pd.DataFrame(
    {
        'Borough': s_borough,
        'Area [ha]': s_area_ha,
        'Population': s_pop,
    }
)

__Converting data from String to float and int__

In [33]:
# Cast both numeric columns in a single call.
stuttgart = stuttgart.astype({'Area [ha]': float, 'Population': int})

In [34]:
stuttgart.dtypes

Borough        object
Area [ha]     float64
Population      int32
dtype: object

__Converting Area from [ha] to [km2]__

In [35]:
# 100 ha = 1 km2
stuttgart['Area [km2]'] = stuttgart['Area [ha]'].div(100)

In [36]:
# Drop the hectare column now that the km2 column exists.
# Returning a new frame with errors='ignore' keeps the cell re-runnable: a second
# execution is a no-op instead of raising KeyError on the already-removed column.
stuttgart = stuttgart.drop(columns='Area [ha]', errors='ignore')

In [37]:
stuttgart

Unnamed: 0,Borough,Population,Area [km2]
0,Stuttgart-Mitte,23956,3.808
1,Stuttgart-Nord,27629,6.815
2,Stuttgart-Ost,48730,9.035
3,Stuttgart-Süd,44050,9.586
4,Stuttgart-West,52668,18.643
5,Bad Cannstatt,71285,15.713
6,Birkach,7149,3.089
7,Botnang,13165,2.135
8,Degerloch,16686,8.021
9,Feuerbach,30417,11.554


In [38]:
# Keep only the post-code rows whose 'Stadtteil' also appears as a borough in `stuttgart`.
is_known_borough = stuttgart_plz['Stadtteil'].isin(stuttgart['Borough'])
stuttgart_plz = stuttgart_plz.loc[is_known_borough]

In [39]:
# Rename both columns in one call so they match the names used in `stuttgart`/`graz`.
# A new frame is returned rather than renaming in place: `stuttgart_plz` is a
# filtered selection, and in-place changes on such a selection can emit
# SettingWithCopyWarning.
stuttgart_plz = stuttgart_plz.rename(
    columns={'Stadtteil': 'Borough', 'Postleitzahl': 'Post Code'}
)

In [40]:
stuttgart_plz

Unnamed: 0,Borough,Post Code
1,Bad Cannstatt,"70191, 70372, 70374, 70376, 70378"
3,Birkach,70599
4,Botnang,70195
8,Degerloch,70597
11,Feuerbach,"70192, 70469, 70499"
17,Hedelfingen,"70327, 70329"
25,Möhringen,"70565, 70567, 70597"
27,Mühlhausen,70378
28,Münster,70376
31,Obertürkheim,70329


In [41]:
stuttgart

Unnamed: 0,Borough,Population,Area [km2]
0,Stuttgart-Mitte,23956,3.808
1,Stuttgart-Nord,27629,6.815
2,Stuttgart-Ost,48730,9.035
3,Stuttgart-Süd,44050,9.586
4,Stuttgart-West,52668,18.643
5,Bad Cannstatt,71285,15.713
6,Birkach,7149,3.089
7,Botnang,13165,2.135
8,Degerloch,16686,8.021
9,Feuerbach,30417,11.554


__Joining two dataframes together__

In [42]:
# Index the post-code frame by borough, then look each borough of `stuttgart` up in it.
plz_by_borough = stuttgart_plz.set_index('Borough')
stuttgart = stuttgart.join(plz_by_borough, on='Borough')

In [43]:
latitude = []
longitude = []

# Geocode every borough. A borough the geocoder cannot resolve gets None in
# both lists, so the lists stay aligned with the 'Borough' column.
for bor in stuttgart['Borough']:
    query = bor + ", Stuttgart, Germany"
    print(query)
    location = locator.geocode(query)
    if location is not None:  # identity test is the idiomatic None check
        latitude.append(location.latitude)
        longitude.append(location.longitude)
    else:
        latitude.append(None)
        longitude.append(None)

Stuttgart-Mitte, Stuttgart, Germany
Stuttgart-Nord, Stuttgart, Germany
Stuttgart-Ost, Stuttgart, Germany
Stuttgart-Süd, Stuttgart, Germany
Stuttgart-West, Stuttgart, Germany
Bad Cannstatt, Stuttgart, Germany
Birkach, Stuttgart, Germany
Botnang, Stuttgart, Germany
Degerloch, Stuttgart, Germany
Feuerbach, Stuttgart, Germany
Hedelfingen, Stuttgart, Germany
Möhringen, Stuttgart, Germany
Mühlhausen, Stuttgart, Germany
Münster, Stuttgart, Germany
Obertürkheim, Stuttgart, Germany
Plieningen, Stuttgart, Germany
Sillenbuch, Stuttgart, Germany
Stammheim, Stuttgart, Germany
Untertürkheim, Stuttgart, Germany
Vaihingen, Stuttgart, Germany
Wangen, Stuttgart, Germany
Weilimdorf, Stuttgart, Germany
Zuffenhausen, Stuttgart, Germany


In [44]:
# Attach the geocoded coordinates. Both lists were filled while iterating over
# stuttgart['Borough'], so they are in row order.
stuttgart['Latitude'] = latitude
stuttgart['Longitude'] = longitude

In [45]:
# Put the columns in the same order as in the `graz` frame.
# .copy() makes the result an independent frame: a later cell adds a new column
# to it, which on a column selection can emit SettingWithCopyWarning.
stuttgart = stuttgart[
    ['Post Code', 'Borough', 'Area [km2]', 'Population', 'Latitude', 'Longitude']
].copy()
stuttgart

Unnamed: 0,Post Code,Borough,Area [km2],Population,Latitude,Longitude
0,"70173, 70174, 70176, 70178, 70180, 70182, 7018...",Stuttgart-Mitte,3.808,23956,48.7759,9.1798
1,"70174, 70191, 70192, 70193",Stuttgart-Nord,6.815,27629,48.796661,9.176252
2,"70184, 70186, 70188, 70190, 70327",Stuttgart-Ost,9.035,48730,48.776972,9.207365
3,"70178, 70180, 70184, 70199",Stuttgart-Süd,9.586,44050,48.753021,9.132492
4,"70174, 70176, 70178, 70193, 70197",Stuttgart-West,18.643,52668,48.777659,9.151351
5,"70191, 70372, 70374, 70376, 70378",Bad Cannstatt,15.713,71285,48.804883,9.21468
6,70599,Birkach,3.089,7149,48.728574,9.203406
7,70195,Botnang,2.135,13165,48.778495,9.129532
8,70597,Degerloch,8.021,16686,48.749597,9.170345
9,"70192, 70469, 70499",Feuerbach,11.554,30417,48.812305,9.159031


## Let us compare these two cities

In [46]:
stuttgart.describe()

Unnamed: 0,Area [km2],Population,Latitude,Longitude
count,23.0,23.0,23.0,23.0
mean,9.015087,26762.73913,48.778547,9.186186
std,5.227646,17346.05574,0.038593,0.047979
min,2.135,6796.0,48.711395,9.088648
25%,4.898,12833.5,48.751309,9.154328
50%,8.021,24067.0,48.776972,9.1798
75%,12.274,35885.0,48.808594,9.221109
max,20.893,71285.0,48.849798,9.268515


In [47]:
graz.describe()

Unnamed: 0,Area [km2],Population,Latitude,Longitude
count,17.0,17.0,17.0,17.0
mean,7.504118,16863.882353,47.069096,15.439408
std,4.495399,8570.006395,0.023491,0.03025
min,1.16,3899.0,47.027102,15.394168
25%,4.48,11129.0,47.058701,15.420325
50%,6.18,15630.0,47.068287,15.438466
75%,10.16,20553.0,47.084668,15.456344
max,18.47,33283.0,47.114287,15.49718


__So what is the difference in area between these two cities?__

In [48]:
# Total area of each city = sum over its boroughs.
stuttgart_area_total = stuttgart['Area [km2]'].sum()
graz_area_total = graz['Area [km2]'].sum()
print("Area of Stuttgart is " + str(stuttgart_area_total) + " km2")
print("Area of Graz is " + str(graz_area_total) + " km2")

Area of Stuttgart is 207.347 km2
Area of Graz is 127.57 km2


In [49]:
# Ratio of the two total areas, rounded to two decimals; the percentage is
# derived from that rounded ratio.
stuttgart_area_sum = stuttgart['Area [km2]'].sum()
graz_area_sum = graz['Area [km2]'].sum()
area_diff = round(stuttgart_area_sum / graz_area_sum, 2)
percent_bigger = round((area_diff - 1) * 100)
print("Area of Stuttgart is " + str(area_diff) + " times or " + str(percent_bigger) + "% bigger than area of Graz")

Area of Stuttgart is 1.63 times or 63.0% bigger than area of Graz


__So what is the biggest and the smallest borough in each city?__

In [50]:
# idxmin() gives the row label of the first minimum; .loc then reads that row's borough name.
stuttgart_areas = stuttgart['Area [km2]']
graz_areas = graz['Area [km2]']
print("Smallest area in Stuttgart is in " + stuttgart.loc[stuttgart_areas.idxmin(), 'Borough'] + " with area of " + str(stuttgart_areas.min()) + " km2")
print("Smallest area in Graz is in " + graz.loc[graz_areas.idxmin(), 'Borough'] + " with area of " + str(graz_areas.min()) + " km2")

Smallest area in Stuttgart is in Botnang with area of 2.135 km2
Smallest area in Graz is in Innere Stadt with area of 1.16 km2


In [51]:
# idxmax() gives the row label of the first maximum; .loc then reads that row's borough name.
stuttgart_areas = stuttgart['Area [km2]']
graz_areas = graz['Area [km2]']
print("Biggest area in Stuttgart is in " + stuttgart.loc[stuttgart_areas.idxmax(), 'Borough'] + " with area of " + str(stuttgart_areas.max()) + " km2")
print("Biggest area in Graz is in " + graz.loc[graz_areas.idxmax(), 'Borough'] + " with area of " + str(graz_areas.max()) + " km2")

Biggest area in Stuttgart is in Vaihingen with area of 20.893 km2
Biggest area in Graz is in Andritz with area of 18.47 km2


__How many people live in each town?__

In [52]:
# Format the totals with ',' as thousands separator, then swap the commas for spaces.
stuttgart_people = "{:,} people".format(stuttgart['Population'].sum()).replace(",", " ")
graz_people = "{:,} people".format(graz['Population'].sum()).replace(",", " ")
print("In Stuttgart lives " + stuttgart_people)
print("In Graz lives " + graz_people)

In Stuttgart lives 615 543 people
In Graz lives 286 686 people


In [53]:
# idxmax() gives the row label of the first maximum; .loc then reads that row's borough name.
stuttgart_pop = stuttgart['Population']
graz_pop = graz['Population']
print("So most populated area of Stuttgart is {} with population of {:,}".format(
    stuttgart.loc[stuttgart_pop.idxmax(), 'Borough'], stuttgart_pop.max()).replace(",", " "))
print("So most populated area of Graz is {} with population of {:,}".format(
    graz.loc[graz_pop.idxmax(), 'Borough'], graz_pop.max()).replace(",", " "))

So most populated area of Stuttgart is Bad Cannstatt with population of 71 285
So most populated area of Graz is Jakomini with population of 33 283


In [54]:
# idxmin() gives the row label of the first minimum; .loc then reads that row's borough name.
stuttgart_pop = stuttgart['Population']
graz_pop = graz['Population']
print("So least populated area of Stuttgart is {} with population of {:,}".format(
    stuttgart.loc[stuttgart_pop.idxmin(), 'Borough'], stuttgart_pop.min()).replace(",", " "))
print("So least populated area of Graz is {} with population of {:,}".format(
    graz.loc[graz_pop.idxmin(), 'Borough'], graz_pop.min()).replace(",", " "))

So least populated area of Stuttgart is Münster with population of 6 796
So least populated area of Graz is Innere Stadt with population of 3 899


__One interesting aspect is also population density of the area__

In [55]:
# Population density of every Stuttgart borough: inhabitants per square kilometre.
stuttgart['Population/km2'] = stuttgart['Population'].div(stuttgart['Area [km2]'])

In [56]:
# Population density of every Graz borough: inhabitants per square kilometre.
graz['Population/km2'] = graz['Population'].div(graz['Area [km2]'])

In [57]:
graz

Unnamed: 0,Post Code,Borough,Area [km2],Population,Latitude,Longitude,Population/km2
0,8010,Innere Stadt,1.16,3899,47.074261,15.438466,3361.206897
1,8010,St. Leonhard,1.83,16235,47.068287,15.456344,8871.584699
2,8010,Geidorf,5.5,25300,47.084668,15.442896,4600.0
3,8020,Lend,3.7,30966,47.079675,15.420325,8369.189189
4,8020,Gries,5.05,28735,47.061222,15.42737,5690.09901
5,8010,Jakomini,4.06,33283,47.059623,15.444707,8197.783251
6,8041,Liebenau,7.99,14417,47.040169,15.449265,1804.380476
7,8042,St. Peter,8.86,15139,47.058701,15.469985,1708.690745
8,8010,Waltendorf,4.48,11906,47.067741,15.477172,2657.589286
9,8010,Ries,10.16,5910,47.088113,15.49718,581.692913


## Creating the maps of Graz and Stuttgart

In [58]:
import folium #import Folium to create the map

__Creating the map of Graz__

In [76]:
def draw_a_map(city, city_name, country):
    """Build a folium map of a city with one circle marker per borough.

    Parameters
    ----------
    city : pandas.DataFrame
        Frame with 'Latitude', 'Longitude' and 'Borough' columns.
    city_name : str
        City name; used for geocoding the map centre and in the popups.
    country : str
        Country name appended to `city_name` for geocoding.

    Returns
    -------
    folium.Map
        Map centred on the geocoded city, with a blue circle for every borough
        that has coordinates and a pin at the city centre.

    Raises
    ------
    ValueError
        If the geocoder cannot resolve "<city_name>, <country>".
    """
    loc = locator.geocode(city_name + ', ' + country)
    if loc is None:
        raise ValueError("Could not geocode {}, {}".format(city_name, country))

    # `city_map` rather than `map`, so the builtin is not shadowed.
    city_map = folium.Map(location=[loc.latitude, loc.longitude], zoom_start=12)

    # One circle per borough.
    for lat, lon, name in zip(city['Latitude'], city['Longitude'], city['Borough']):
        # Boroughs that could not be geocoded carry None; they cannot be drawn.
        if pd.isna(lat) or pd.isna(lon):
            continue
        # Replace German special characters in the popup text.
        for special, plain in (("ä", "a"), ("ü", "u"), ("ö", "o"), ("ß", "ss")):
            name = name.replace(special, plain)
        label = folium.Popup("{}, {}".format(name, city_name), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=15,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7).add_to(city_map)

    # Pin at the city centre. This uses `loc` from this call, not the global
    # `location` left over from an earlier geocoding cell, and gets its own
    # popup instead of re-using the last borough's.
    folium.Marker(
        [loc.latitude, loc.longitude],
        popup=folium.Popup(city_name, parse_html=True)).add_to(city_map)
    return city_map

In [77]:
graz_map = draw_a_map(graz, 'Graz', 'Austria')

In [78]:
graz_map

In [79]:
stuttgart_map = draw_a_map(stuttgart, 'Stuttgart', 'Germany')

In [83]:
stuttgart_map