# Weather forecast update

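The ```weather_forecast_update()``` function updates the weather forecast stored in an SQL database using data from OpenWeatherMap (https://openweathermap.org). Cities that are not yet in the database are first added using information scraped from Wikipedia.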

## 1&nbsp;Weather function definition

In [None]:
def weather_forecast_update(input):
    """Refresh the stored weather forecast for the requested cities.

    The function
      1. turns ``input`` into a list of city names,
      2. adds cities that are missing from the ``city`` table, filling
         ``city_info_time`` and ``city_info_static`` with data scraped from
         the city's English Wikipedia infobox,
      3. downloads the OpenWeatherMap forecast for every requested city, and
      4. appends the forecast rows to the ``weather`` table, skipping cities
         whose current forecast issue is already stored.

    Connection settings are read from the environment variables
    ``Cloud_MySQL_HOST`` and ``Cloud_MySQL_API_KEY``; the OpenWeatherMap key
    is read from ``OpenWeatherMap_API_KEY``.

    Args:
        input: The cities to update. Accepted shapes:
            * ``str`` - a single city name,
            * ``list`` / ``tuple`` / ``set`` / ``frozenset`` / ``pandas.Series``
              of city names,
            * ``pandas.DataFrame`` with a ``'city'`` column,
            * ``dict`` - either with a ``'city'`` key holding a name or a
              collection of names, or with the city names as keys,
            * ``None``, a number, or an empty collection - update every city
              already present in the ``city`` table.

    Returns:
        None. All results are written to the database.

    Raises:
        TypeError: If ``input`` has an unsupported type.
        requests.HTTPError: If OpenWeatherMap answers with an error status.
    """
    # Validate the argument before any network or database work is started.
    city_list = _city_list_from_input(input)

    import os
    import warnings
    import pandas as pd
    import requests
    from sqlalchemy import create_engine

    # Database connection settings
    schema = 'gans'
    host = os.getenv('Cloud_MySQL_HOST')
    user = 'root'
    password = os.getenv('Cloud_MySQL_API_KEY')
    port = 3306
    engine = create_engine(f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}')

    try:
        # Cities already known to the database
        known_names = set(_read_sql(engine, 'SELECT * FROM city')['city_name'])
        if not city_list:
            city_list = list(known_names)

        # Add unknown cities. dict.fromkeys() removes duplicates while keeping
        # the order, so a city named twice in the input is inserted only once.
        new_names = dict.fromkeys(name for name in city_list if name not in known_names)
        for city_name in new_names:
            scraped = _scrape_city_info(city_name)
            if scraped is None:
                warnings.warn(f'No usable Wikipedia infobox for {city_name!r}; city skipped.')
                continue
            static_info, time_info = scraped

            pd.DataFrame({'city_name': [city_name]}).to_sql(
                'city', con=engine, if_exists='append', index=False)
            # Bound parameter instead of string formatting: the name comes
            # from user input and must not be interpreted as SQL.
            city_id = int(_read_sql(
                engine,
                'SELECT city_id FROM city WHERE city_name = :city_name',
                {'city_name': city_name},
            )['city_id'].iloc[0])

            pd.DataFrame([{'city_id': city_id, **time_info}]).to_sql(
                'city_info_time', con=engine, if_exists='append', index=False)
            pd.DataFrame([{'city_id': city_id, **static_info}]).to_sql(
                'city_info_static', con=engine, if_exists='append', index=False)

        # Re-read the tables so that freshly inserted cities are included.
        city_df = _read_sql(engine, 'SELECT * FROM city')
        city_info_static_df = _read_sql(engine, 'SELECT * FROM city_info_static')
        stored_weather_df = _read_sql(engine, 'SELECT * FROM weather')
        owm_api_key = os.getenv('OpenWeatherMap_API_KEY')

        forecast_rows = []
        for city_id in city_df.loc[city_df['city_name'].isin(city_list), 'city_id']:
            # Look the coordinates up by city_id itself; row positions are not
            # guaranteed to line up with the ids.
            geo = city_info_static_df.loc[
                city_info_static_df['city_id'] == city_id,
                ['latitude_num', 'longitude_num'],
            ]
            if geo.empty:
                warnings.warn(f'No coordinates stored for city_id {city_id}; forecast skipped.')
                continue

            response = requests.get(
                'https://api.openweathermap.org/data/2.5/forecast',
                params={
                    'lat': float(geo.iloc[0, 0]),
                    'lon': float(geo.iloc[0, 1]),
                    'units': 'metric',
                    'appid': owm_api_key,
                },
                timeout=30,
            )
            # Surface API errors (bad key, rate limit, ...) as an HTTPError
            # instead of a KeyError on the missing 'list' entry.
            response.raise_for_status()
            forecasts = response.json()['list']
            if not forecasts:
                continue

            # The first entry's time identifies this forecast issue; skip the
            # city when that issue has been stored before.
            issued_at = pd.to_datetime(forecasts[0]['dt_txt'])
            already_stored = (
                (stored_weather_df['city_id'] == city_id)
                & (stored_weather_df['forecast_timestamp'] == issued_at)
            ).any()
            if already_stored:
                continue

            forecast_rows.extend(_forecast_rows(city_id, forecasts))

        # Write all new forecast rows in a single batch.
        if forecast_rows:
            weather_new_df = pd.DataFrame(forecast_rows)
            for column in ('city_id', 'temperature', 'clouds', 'wind', 'gust',
                           'visibility', 'rain', 'snow'):
                weather_new_df[column] = pd.to_numeric(weather_new_df[column], errors='coerce')
            for column in ('forecast_for', 'forecast_timestamp'):
                weather_new_df[column] = pd.to_datetime(weather_new_df[column])
            weather_new_df.to_sql('weather', con=engine, if_exists='append', index=False)
    finally:
        # Release pooled connections even when a step above failed.
        engine.dispose()


def _city_list_from_input(input):
    """Convert any supported ``input`` shape into a plain list of city names.

    Returns an empty list for ``None`` and for numbers, which the caller
    treats as "all cities in the database". Raises ``TypeError`` for
    unsupported types.
    """
    import pandas as pd

    collections = (list, tuple, set, frozenset, pd.Series)

    if input is None or isinstance(input, (int, float)):
        return []
    if isinstance(input, str):
        return [input]
    if isinstance(input, pd.DataFrame):
        return list(input['city'])
    if isinstance(input, dict):
        if 'city' not in input:
            # Without a 'city' key the dictionary keys are the city names.
            return list(input)
        value = input['city']
        if isinstance(value, str):
            return [value]
        if isinstance(value, collections):
            return list(value)
        raise TypeError(
            f"input['city'] must be a city name or a collection of names, "
            f"not {type(value).__name__}")
    if isinstance(input, collections):
        return list(input)
    raise TypeError(f'Unsupported input type: {type(input).__name__}')


def _read_sql(engine, query, params=None):
    """Run ``query`` on a short-lived connection and return a DataFrame.

    A fresh connection per read guarantees that the connection is closed and
    that rows committed by earlier writes are visible.
    """
    import pandas as pd
    from sqlalchemy import text

    with engine.connect() as connection:
        return pd.read_sql(text(query), con=connection, params=params)


def _scrape_city_info(city_name):
    """Scrape the Wikipedia infobox of ``city_name``.

    Returns a ``(static_info, time_info)`` pair of dictionaries keyed by the
    column names of ``city_info_static`` and ``city_info_time`` (without
    ``city_id``), or ``None`` when the page has no settlement infobox or a
    required field is missing or cannot be parsed.
    """
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime

    response = requests.get('https://en.wikipedia.org/wiki/' + city_name, timeout=30)
    infobox = BeautifulSoup(response.content, 'html.parser').find(class_='infobox ib-settlement vcard')
    if infobox is None:
        return None

    try:
        # 'span.geo' holds "<latitude>; <longitude>" in decimal degrees.
        geo = infobox.select('span.geo')[0].get_text().split(";")
        static_info = {
            'country': infobox.find(class_='infobox-data').get_text(),
            'latitude': infobox.find(class_='latitude').get_text(),
            'longitude': infobox.find(class_='longitude').get_text(),
            'latitude_num': pd.to_numeric(geo[0]),
            'longitude_num': pd.to_numeric(geo[1]),
            'website': infobox.find(string="Website").find_next("td").get_text(),
        }
        time_info = {
            'area_city_km2': pd.to_numeric(
                infobox.find(string='Area').find_next(class_='infobox-data')
                .get_text().split('km2')[0][:-1].replace(',', '')),
            'population': pd.to_numeric(
                infobox.find(string='Population').find_next(class_='infobox-data')
                .get_text().replace(',', '')),
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }
    except (AttributeError, IndexError, ValueError):
        # A field is absent from the infobox or its text is not numeric.
        return None
    return static_info, time_info


def _forecast_rows(city_id, forecasts):
    """Build one ``weather`` table row (as a dict) per forecast entry."""
    issued_at = forecasts[0]['dt_txt']
    return [
        {
            'city_id': int(city_id),
            'temperature': entry['main']['feels_like'],
            'weather': entry['weather'][0]['main'],
            'weather2': entry['weather'][0]['description'],
            'clouds': entry.get('clouds', {}).get('all', None),
            'wind': entry.get('wind', {}).get('speed', None),
            'gust': entry.get('wind', {}).get('gust', None),
            'visibility': entry.get('visibility', None),
            'rain': entry.get('rain', {}).get('3h', None),
            'snow': entry.get('snow', {}).get('3h', None),
            'forecast_for': entry['dt_txt'],
            'forecast_timestamp': issued_at,
        }
        for entry in forecasts
    ]

## 2&nbsp;Input

In [None]:
# Cities whose weather forecast should be refreshed:
city_list = [
    'Berlin',
    'Aachen',
    'Hamburg',
    'Hannover',
    'Bangkok',
]

## 3&nbsp;Running the function

In [None]:
# Update the database for the cities selected in the input cell.
weather_forecast_update(input=city_list)

## 4&nbsp;Results

In [None]:
import os
import pandas as pd
from sqlalchemy import create_engine, text

# Connection settings for the MySQL database.
# NOTE(review): weather_forecast_update() reads 'Cloud_MySQL_HOST' and
# 'Cloud_MySQL_API_KEY', while this cell reads the '..._DB_...' variants -
# confirm which pair of environment variables is the intended one.
schema = 'gans'
host = os.getenv('Cloud_MySQL_DB_HOST')
user = 'root'
password = os.getenv('Cloud_MySQL_DB_API_KEY')
port = 3306

connection_string = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'
engine = create_engine(connection_string)

# Read the whole weather table; the connection is closed when the `with`
# block ends and the engine is disposed even if the query fails.
try:
    with engine.connect() as connection:
        weather = pd.read_sql(text('SELECT * FROM weather'), con=connection)
finally:
    engine.dispose()

# Drop the credentials from the notebook namespace.
del schema, host, user, password, port, connection_string

# Last expression of the cell: display the table.
weather