# Parsing city names from a Wikipedia table

## Scraping Wikipedia

In [1]:
import requests
# Download the German Wikipedia list of large and medium-sized cities.
# NOTE: despite its name, `url` holds the page's HTML text, not the address;
# the name is kept because the parsing cell reads it.
response = requests.get(
    'https://de.wikipedia.org/wiki/Liste_der_Gro%C3%9F-_und_Mittelst%C3%A4dte_in_Deutschland',
    timeout=30,  # do not hang forever if the server never answers
)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
url = response.text

In [2]:
from bs4 import BeautifulSoup
# Parse the downloaded markup with the 'html.parser' backend.
html_all = BeautifulSoup(markup=url, features='html.parser')
# Debug aid: print(html_all.prettify())

In [3]:
# Pick the second <table> element of the page.
# `select()` takes a CSS selector; a dict passed as its second positional
# argument is interpreted as `namespaces`, not as an attribute filter, so the
# former {'class': ...} argument never restricted the match. It is dropped
# here, which keeps the selected table unchanged.
html_cities = html_all.select('table')[1]
# Debug aid: print(html_cities)

## Data preparation

In [4]:
from io import StringIO

import pandas as pd
#pd.set_option('display.max_rows', None)

# read_html returns a list of DataFrames; take the first one parsed from the
# snippet. The markup is wrapped in StringIO because passing literal HTML
# strings to read_html is deprecated in recent pandas releases.
cities = pd.read_html(StringIO(str(html_cities)))[0]

# Keep only the columns of interest. `.copy()` makes the result an independent
# frame so the column assignments below do not trigger SettingWithCopyWarning.
cities = cities[['Rang', 'Name', '2019', 'Bundesland']].copy()
cities.columns = ['rank', 'name', 'pop_2019', 'bundesland']

# Remove the '.' characters literally (regex=False). As a regular expression
# '.' matches every character, which would empty the strings and make the
# integer conversion fail.
cities['pop_2019'] = cities['pop_2019'].str.replace('.', '', regex=False).astype('int64')

# Clean up the city names: drop digit runs, then quotes/parentheses/asterisks,
# pad slashes with spaces, and hyphenate "Porta ".
cities['name'] = cities['name'].str.replace(r'\d+', '', regex=True)
cities['name'] = cities['name'].str.replace('["(*)"]', '', regex=True)
cities['name'] = cities['name'].str.replace('/', ' / ', regex=False)
cities['name'] = cities['name'].str.replace('Porta ', 'Porta-', regex=False)
cities['name'] = cities['name'].str.rstrip()

cities

Unnamed: 0,rank,name,pop_2019,bundesland
0,1.0,Berlin,3669491,Berlin
1,2.0,Hamburg,1847253,Hamburg
2,3.0,München,1484226,Bayern
3,4.0,Köln,1087863,Nordrhein-Westfalen
4,5.0,Frankfurt am Main,763380,Hessen
...,...,...,...,...
697,698.0,Bad Salzungen,20097,Thüringen
698,699.0,Wilnsdorf,20086,Nordrhein-Westfalen
699,700.0,Bad Schwartau,20044,Schleswig-Holstein
700,701.0,Oberkirch,20036,Baden-Württemberg


## Looking for common suffixes
### -ingen

In [5]:
import collections
from collections import Counter

def count_ingen(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'ingen'.

    Words of six characters or fewer and the word 'Thüringen' are skipped.
    Nothing is returned; matches are written to standard output.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    for token in dict.fromkeys(string.split()):
        if len(token) <= 6 or token == 'Thüringen':
            continue
        if not token.endswith('ingen'):
            continue
        print(token)
            
# Run count_ingen on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_ingen(city)

Solingen
Göttingen
Reutlingen
Esslingen
Tübingen
Ratingen
Villingen-Schwenningen
Sindelfingen
Göppingen
Waiblingen
Hattingen
Böblingen
Memmingen
Bietigheim-Bissingen
Nürtingen
Leinfelden-Echterdingen
Völklingen
Ettlingen
Tuttlingen
Balingen
Vaihingen
Geislingen
Leichlingen
Emmendingen
Ehingen
Ditzingen
Meiningen
Öhringen
Überlingen
Kissingen
Büdingen
Donaueschingen
Metzingen
Kitzingen
Eppingen
Schwetzingen
Westoverledingen
Eislingen
Nördlingen
Mössingen
Krozingen


### -dorf

In [6]:
def count_dorf(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'dorf'.

    Nothing is returned; matches are written to standard output.
    """
    distinct_words = dict.fromkeys(string.split())  # ordered de-duplication
    hits = [candidate for candidate in distinct_words if candidate.endswith('dorf')]
    for hit in hits:
        print(hit)
            
# Run count_dorf on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_dorf(city)

Düsseldorf
Troisdorf
Alsdorf
Schorndorf
Warendorf
Mörfelden-Walldorf
Deggendorf
Burgdorf
Schwandorf
Hennigsdorf
Neuendorf
Zirndorf
Friedrichsdorf
Elsdorf
Stadtallendorf
Mühldorf
Wilnsdorf


### -au

In [7]:
def count_au(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'au'.

    The words 'Breisgau' and 'Donau' are skipped. Nothing is returned;
    matches are written to standard output.
    """
    skipped = ('Breisgau', 'Donau')
    for token in dict.fromkeys(string.split()):  # ordered de-duplication
        if token in skipped:
            continue
        if token.endswith('au'):
            print(token)
            
# Run count_au on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_au(city)

Hanau
Zwickau
Dessau-Roßlau
Passau
Gronau
Dachau
Landau
Rodgau
Bernau
Ilmenau
Gaggenau
Groß-Gerau
Lindau
Zittau
Glauchau
Rappenau
Soltau
Werdau
Nidderau
Schwartau


### -bach

In [8]:
def count_bach(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'bach'.

    Nothing is returned; matches are written to standard output.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    for hit in filter(lambda w: w.endswith('bach'), dict.fromkeys(string.split())):
        print(hit)
            
# Run count_bach on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_bach(city)

Mönchengladbach
Offenbach
Gladbach
Gummersbach
Fellbach
Ansbach
Schwabach
Dietzenbach
Rheinbach
Butzbach
Kulmbach
Mosbach
Korbach
Reichenbach


### -ach

In [9]:
def count_ach(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'ach' but not 'bach'.

    Nothing is returned; matches are written to standard output.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    for word in dict.fromkeys(string.split()):
        # Testing the 'bach' suffix directly (instead of indexing word[-4])
        # is safe for a word that is exactly 'ach', which has no fourth-last
        # character.
        if word.endswith('ach') and not word.endswith('bach'):
            print(word)
            
# Run count_ach on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_ach(city)

Kreuznach
Lörrach
Eisenach
Biberach
Andernach
Herzogenaurach
Aichach


### -feld

In [10]:
def count_feld(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'feld'.

    Nothing is returned; matches are written to standard output.
    """
    for token in dict.fromkeys(string.split()):  # ordered de-duplication
        if not token.endswith('feld'):
            continue
        print(token)
            
# Run count_feld on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_feld(city)

Bielefeld
Krefeld
Langenfeld
Coesfeld
Hersfeld
Saalfeld
Karlsfeld


### -heim

In [11]:
from collections import defaultdict, Counter
def count_heim(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'heim'.

    Nothing is returned; matches are written to standard output.
    """
    distinct_words = dict.fromkeys(string.split())  # ordered de-duplication
    hits = [candidate for candidate in distinct_words if candidate.endswith('heim')]
    for hit in hits:
        print(hit)
            
# Run count_heim on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_heim(city)

Mannheim
Mülheim
Pforzheim
Hildesheim
Rüsselsheim
Rosenheim
Bergheim
Pulheim
Heidenheim
Bornheim
Weinheim
Monheim
Kirchheim
Bensheim
Hofheim
Sinsheim
Ingelheim
Crailsheim
Viernheim
Kornwestheim
Lampertheim
Nauheim
Forchheim
Northeim
Kelkheim
Unterschleißheim
Mühlheim
Hattersheim
Griesheim
Heppenheim
Meckenheim
Mergentheim
Wertheim
Weilheim
Laupheim
Hockenheim
Flörsheim
Puchheim
Germersheim


### -hausen

In [12]:
def count_hausen(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'hausen'.

    Nothing is returned; matches are written to standard output.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    for hit in filter(lambda w: w.endswith('hausen'), dict.fromkeys(string.split())):
        print(hit)
            
# Run count_hausen on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_hausen(city)

Oberhausen
Recklinghausen
Oeynhausen
Nordhausen
Wusterhausen
Mühlhausen
Barsinghausen
Sangerhausen
Obertshausen
Lüdinghausen
Gelnhausen
Sondershausen
Wildeshausen


### -stadt

In [13]:
def count_stadt(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'stadt'.

    The word 'Lutherstadt' is skipped. Nothing is returned; matches are
    written to standard output.
    """
    for token in dict.fromkeys(string.split()):  # ordered de-duplication
        if token == 'Lutherstadt' or not token.endswith('stadt'):
            continue
        print(token)
            
# Run count_stadt on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_stadt(city)

Darmstadt
Ingolstadt
Lippstadt
Neustadt
Erftstadt
Filderstadt
Albstadt
Neustadt
Halberstadt
Arnstadt
Weinstadt
Weiterstadt
Lennestadt
Pfungstadt
Rudolstadt
Eisenhüttenstadt
Riedstadt
Freudenstadt
Seligenstadt
Groß-Umstadt
Schifferstadt
Duderstadt


### -furt

In [14]:
def count_furt(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'furt'.

    Nothing is returned; matches are written to standard output.
    """
    distinct_words = dict.fromkeys(string.split())  # ordered de-duplication
    hits = [candidate for candidate in distinct_words if candidate.endswith('furt')]
    for hit in hits:
        print(hit)
            
# Run count_furt on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_furt(city)

Frankfurt
Erfurt
Frankfurt
Schweinfurt
Steinfurt
Staßfurt


### -berg

In [15]:
def count_berg(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'berg'.

    Nothing is returned; matches are written to standard output.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    for hit in filter(lambda w: w.endswith('berg'), dict.fromkeys(string.split())):
        print(hit)
            
# Run count_berg on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_berg(city)

Nürnberg
Heidelberg
Bamberg
Arnsberg
Stolberg
Leonberg
Wittenberg
Pinneberg
Heinsberg
Amberg
Freiberg
Herrenberg
Rheinberg
Gevelsberg
Friedberg
Rietberg
Friedberg
Landsberg
Wegberg
Strausberg
Plettenberg
Schmallenberg
Markkleeberg
Ronnenberg
Übach-Palenberg
Senftenberg
Sonneberg
Starnberg
Spremberg
Schramberg
Fröndenberg
Wachtberg


### -burg

In [16]:
def count_burg(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'burg'.

    Nothing is returned; matches are written to standard output.
    """
    for token in dict.fromkeys(string.split()):  # ordered de-duplication
        if not token.endswith('burg'):
            continue
        print(token)
            
# Run count_burg on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_burg(city)

Hamburg
Duisburg
Augsburg
Magdeburg
Freiburg
Oldenburg
Regensburg
Würzburg
Wolfsburg
Ludwigsburg
Flensburg
Marburg
Lüneburg
Brandenburg
Aschaffenburg
Neubrandenburg
Offenburg
Homburg
Ravensburg
Oranienburg
Rottenburg
Homburg
Siegburg
Coburg
Neu-Isenburg
Papenburg
Limburg
Cloppenburg
Merseburg
Ahrensburg
Bernburg
Naumburg
Altenburg
Nienburg
Neuburg
Rendsburg
Henstedt-Ulzburg
Oldenburg
Quedlinburg
Bedburg
Waldkraiburg
Dillenburg
Warburg
Rotenburg
Harzburg
Günzburg


### -a

In [17]:
def count_a(string):
    """Print each distinct whitespace-separated word of ``string`` ending in 'a'.

    Nothing is returned; matches are written to standard output.
    """
    distinct_words = dict.fromkeys(string.split())  # ordered de-duplication
    hits = [candidate for candidate in distinct_words if candidate.endswith('a')]
    for hit in hits:
        print(hit)
            
# Run count_a on every entry of the 'name' column; it prints each match.
for city in cities['name']:
    count_a(city)

Jena
Gera
Fulda
Unna
Gotha
Pirna
Porta-Westfalica
Vechta
Hoyerswerda
Riesa
Grimma
Limbach-Oberfrohna
Apolda
Schlema
