# Extract data from the web and push it to the database

## 1. Web scraping city data from Wikipedia (wikipedia.org)

In [None]:
#!pip install lat_lon_parser

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lat_lon_parser import parse    # We will need latitude and longitude in a decimal format to retrieve information on weather and airports.

In [None]:
def _required_text(soup, css_class, city):
    """Return the trimmed text of the first element carrying *css_class*.

    Raises:
        ValueError: if the page has no element with that class, so the
            failure names the city and the field instead of surfacing as
            an ``AttributeError`` on ``None``.
    """
    element = soup.find(class_=css_class)
    if element is None:
        raise ValueError(
            f"No element with class '{css_class}' found on the "
            f"Wikipedia page for '{city}'"
        )
    return element.get_text().strip()


def cities_dataframe(cities, timeout=10):
    """Scrape country and coordinates for each city from Wikipedia.

    Args:
        cities: Iterable of city names; each name is inserted verbatim into
            the Wikipedia article URL.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A DataFrame with one row per city and the columns ``City``,
        ``Country``, ``Latitude`` and ``Longitude`` (coordinates in decimal
        format). An empty input yields an empty frame with the same columns.

    Raises:
        requests.HTTPError: if Wikipedia answers with an error status.
        requests.Timeout: if a response does not arrive within *timeout*.
        ValueError: if an expected element is missing from a page.
    """
    columns = ["City", "Country", "Latitude", "Longitude"]
    city_data = []

    for city in cities:
        url = f"https://www.wikipedia.org/wiki/{city}"
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        city_soup = BeautifulSoup(response.content, "html.parser")

        # extract the relevant information
        city_latitude = _required_text(city_soup, "latitude", city)
        city_longitude = _required_text(city_soup, "longitude", city)
        # NOTE(review): the first `infobox-data` cell is taken as the
        # country -- confirm this holds for every city added to the list.
        country = _required_text(city_soup, "infobox-data", city)

        # keep track of data per city
        city_data.append(
            {
                "City": city,
                "Country": country,
                "Latitude": parse(city_latitude),  # latitude in decimal format
                "Longitude": parse(city_longitude),  # longitude in decimal format
            }
        )

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(city_data, columns=columns)

### Call the function

In [None]:
# Cities to scrape; each name is passed on to cities_dataframe.
list_of_cities = [
    "Berlin",
    "Hamburg",
    "Munich",
]

cities_df = cities_dataframe(cities=list_of_cities)
cities_df  # last expression of the cell, so the notebook displays the frame

### Download cities_df as .csv file

In [None]:
# Write the city data without the DataFrame index. Otherwise the index is
# read back by pd.read_csv as an extra "Unnamed: 0" column and later pushed
# to the database along with the real columns.
cities_df.to_csv('cities.csv', index=False)

## 2. Push the data to MySQL

In [None]:
import sqlalchemy

In [None]:
# Load the city data back from the CSV file into a DataFrame.
cities_df = pd.read_csv(filepath_or_buffer="cities.csv")

In [None]:
from keys import MySQL_pass

### Establish the connection to the MySQL database

In [None]:
from urllib.parse import quote_plus

# Connection settings for the MySQL server.
schema = "gans_data"
host = "127.0.0.1"
user = "root"
password = MySQL_pass
port = 3306

# Percent-encode the password so that characters such as "@", ":" or "/"
# are not misread as delimiters of the connection URL.
connection_string = (
    f'mysql+pymysql://{user}:{quote_plus(password)}@{host}:{port}/{schema}'
)

### Push the cities_df to the empty "cities" table in the MySQL database

In [None]:
# Append the rows to the "cities" table; the DataFrame index is not
# written as a column.
cities_df.to_sql(
    name="cities",
    con=connection_string,
    if_exists="append",
    index=False,
)

## SUCCESS!