In [7]:
def get_weather(weather_arguments : dict) -> dict:
    """Query the OpenWeatherMap forecast endpoint and return the decoded JSON.

    Parameters
    ----------
    weather_arguments : dict
        Query parameters for the request (e.g. ``q``, ``appid``, ``units``).
        Every value must be a ``str``.

    Returns
    -------
    dict
        The JSON body of the response, decoded by ``response.json()``.

    Raises
    ------
    Exception
        If any argument value is not a string, or if the API answers with
        a status code other than 200.
    """
    import requests

    #check that only string arguments are present
    if not all(isinstance(val, str) for val in weather_arguments.values()):
        raise Exception('all arguments must be string')

    weather_api = "http://api.openweathermap.org/data/2.5/forecast"

    # Let requests build the query string: it percent-encodes keys and values,
    # so city names or keys containing quotes, spaces, '&', '=' or non-ASCII
    # characters cannot corrupt the URL (the former repr()-based string
    # surgery broke on such values).
    # The timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(weather_api, params=weather_arguments, timeout=30)
    if response.status_code != 200:
        # any non-200 answer ends up here, so report the actual status
        raise Exception(f'url not found (HTTP status {response.status_code})')
    return response.json()

import os

# Query parameters for the forecast request; get_weather() requires every value to be a string.
weather_arguments = {
    'q' : 'Berlin', #city
    # api key: read from the environment so no real key has to be written into
    # the notebook; falls back to the placeholder when the variable is unset
    'appid' : os.environ.get('OPENWEATHER_API_KEY', '-----needs key-----'),
   # 'cnt' : '3', # number of results
    'units' : 'metric'
}
weather_json = get_weather(weather_arguments)

In [9]:
# convert json to pandas DataFrame
from pandas import DataFrame
def weather_json_to_df(weather_json : dict, city : str, keep_cols : list=None) -> DataFrame:
    """Flatten the forecast entries of ``weather_json`` into a DataFrame.

    Parameters
    ----------
    weather_json : dict
        Forecast response; its ``'list'`` entry is iterated, one row per element.
    city : str
        Value written into the ``city`` column of every row.
    keep_cols : list, optional
        Column names to return. Names that are not present in the flattened
        frame are skipped. When empty or ``None`` all columns are returned.

    Returns
    -------
    DataFrame
        One row per forecast entry, with nested keys flattened by
        ``flatdict.FlatterDict`` and an added ``city`` column.
    """
    import os
    import sys

    # make the project package importable from the notebook's location
    project_root = os.path.join(os.path.dirname(''), '../..')
    sys.path.append(project_root)
    from proj3_gans_scooters.src.proj3_utils import install_pip_pkg

    # flatdict is imported right below
    # NOTE(review): presumably this helper installs the package when it is missing - confirm
    install_pip_pkg({'flatdict'})

    import pandas as pd
    from flatdict import FlatterDict

    # flatten every forecast entry, then build the frame in one go
    flat_entries = [dict(FlatterDict(entry)) for entry in weather_json['list']]
    forecast = pd.json_normalize(flat_entries)
    forecast = forecast.assign(city=[city] * len(forecast))

    # optionally narrow the result to the requested columns that exist
    if keep_cols:
        available = [col for col in keep_cols if col in forecast.columns]
        return forecast[available]
    return forecast
    
# flattened API column names to keep, in output order
keep_cols = [
    'city',
    'dt_txt',
    'main:temp',
    'main:feels_like',
    'main:humidity',
    'weather:0:description',
    'clouds:all',
    'wind:speed',
    'wind:deg',
    'wind:gust',
    'pop',
    'rain:3h',
    'snow:3h',
    'sys:pod',
]
# replacement names, matched by position with keep_cols
new_cols = [
    'city',
    'date',
    'temp_celcius',
    'temp_feels_like_celcius',
    'humidity_percent',
    'weather_description',
    'clouds_percent',
    'wind_speed_meter_sec',
    'wind_direction_degree',
    'wind_gust_meter_sec',
    'pop_percent',
    'rain_3h_mm',
    'snow_3h_mm',
    'pod',
]
# build the Berlin frame and rename its columns in one chain
weather_df = weather_json_to_df(weather_json, 'Berlin', keep_cols).rename(
    columns={old: new for old, new in zip(keep_cols, new_cols)}
)

In [10]:
# handle nan values
from pandas import DataFrame
def cleanup_weather(df : DataFrame) -> None:
    """Clean a renamed weather frame in place.

    Missing values in ``rain_3h_mm`` and ``snow_3h_mm`` (when those columns
    exist) are replaced by 0, and ``date`` is converted to datetime.

    Parameters
    ----------
    df : DataFrame
        Frame to modify; must contain a ``date`` column.

    Returns
    -------
    None
        The frame passed in is mutated.

    Raises
    ------
    KeyError
        If ``df`` has no ``date`` column.
    """
    import pandas as pd

    # the precipitation columns are optional, so only fill those that exist
    for col in ('rain_3h_mm', 'snow_3h_mm'):
        if col in df:
            df[col] = df[col].fillna(0)

    # Assign the column itself instead of ``df.loc[:, 'date'] = ...``: since
    # pandas 2.0 the .loc form writes into the existing column and keeps its
    # dtype, so the converted values would stay in an object column.
    df['date'] = pd.to_datetime(df['date'])

# clean the Berlin frame in place (cleanup_weather returns nothing),
# then list the column dtypes and non-null counts
cleanup_weather(weather_df)
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   city                     40 non-null     object        
 1   date                     40 non-null     datetime64[ns]
 2   temp_celcius             40 non-null     float64       
 3   temp_feels_like_celcius  40 non-null     float64       
 4   humidity_percent         40 non-null     int64         
 5   weather_description      40 non-null     object        
 6   clouds_percent           40 non-null     int64         
 7   wind_speed_meter_sec     40 non-null     float64       
 8   wind_direction_degree    40 non-null     int64         
 9   wind_gust_meter_sec      40 non-null     float64       
 10  pop_percent              40 non-null     float64       
 11  rain_3h_mm               40 non-null     float64       
 12  pod                      40 non-null  

In [17]:
from pandas import DataFrame
def scrape_weather(city_lst : list, openweather_key : str) -> DataFrame:
    """Collect the forecast for several cities into one DataFrame.

    For every city the forecast is requested with ``get_weather``, converted
    with ``weather_json_to_df``, renamed to the descriptive column names and
    cleaned with ``cleanup_weather``.

    Parameters
    ----------
    city_lst : list
        City names, each used as the ``q`` argument of one request.
    openweather_key : str
        API key sent as the ``appid`` argument.

    Returns
    -------
    DataFrame
        The per-city frames concatenated with a fresh index, or ``None``
        when ``city_lst`` is empty.
    """
    # flattened API column names and their replacements, matched by position
    source_cols = ['city', 'dt_txt', 'main:temp', 'main:feels_like', 'main:humidity',
                   'weather:0:description', 'clouds:all', 'wind:speed', 'wind:deg',
                   'wind:gust', 'pop', 'rain:3h', 'snow:3h', 'sys:pod']
    target_cols = ['city', 'date', 'temp_celcius', 'temp_feels_like_celcius', 'humidity_percent',
                   'weather_description', 'clouds_percent', 'wind_speed_meter_sec', 'wind_direction_degree',
                   'wind_gust_meter_sec', 'pop_percent', 'rain_3h_mm', 'snow_3h_mm', 'pod']
    renames = dict(zip(source_cols, target_cols))

    frames = []
    for city in city_lst:
        query = {
            'q' : city, #city
            'appid' : openweather_key, #api key
            'units' : 'metric',
        }
        raw_forecast = get_weather(query)
        city_df = weather_json_to_df(raw_forecast, city, source_cols).rename(columns=renames)
        cleanup_weather(city_df)
        frames.append(city_df)

    # nothing requested -> nothing to concatenate
    if not frames:
        return None
    import pandas as pd
    return pd.concat(frames, ignore_index=True)

In [18]:
# summary statistics (count, mean, std, min, quartiles, max) for weather_df
weather_df.describe()

Unnamed: 0,temp_celcius,temp_feels_like_celcius,humidity_percent,clouds_percent,wind_speed_meter_sec,wind_direction_degree,wind_gust_meter_sec,pop_percent,rain_3h_mm
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,20.01125,19.59775,58.6,65.225,2.861,269.025,4.7565,0.10775,0.0545
std,3.511447,3.494207,15.754202,31.350367,0.963159,42.010675,1.502612,0.177497,0.145125
min,13.33,12.78,30.0,1.0,0.8,141.0,1.45,0.0,0.0
25%,16.5525,16.18,46.75,50.5,2.145,250.25,3.9825,0.0,0.0
50%,20.105,19.565,56.5,71.5,2.765,278.0,4.435,0.0,0.0
75%,22.995,22.525,71.0,88.75,3.4075,296.75,5.8725,0.15,0.0
max,25.92,25.53,88.0,100.0,5.12,331.0,8.31,0.58,0.77


In [23]:
# test with umlauts, other diacritics and spaces in the city names:
cities = ['Kraków','Frankfurt am Main','Łódź','Wrocław',
          'Düsseldorf','Málaga','The Hague','Poznań','Gdańsk',
          'Palma de Mallorca','Iași','Las Palmas','Malmö','Varna',
          'Cluj-Napoca','Córdoba','Münster','Vila Nova de Gaia']
# runs without problems, but the API does not always answer with the
# requested city name (see the comparison printed below)
#test_cities = scrape_weather(cities, os.environ['OPENWEATHER_API_KEY'])

#compare city names
import os
import requests
import json
import pandas as pd

#preparing the request url
weather_api = "http://api.openweathermap.org/data/2.5/forecast?"

weather_arguments = {
    'q' : 'Berlin', #city
    # api key: read from the environment; a real key must never be stored in
    # the notebook. Raises KeyError when OPENWEATHER_API_KEY is not set.
    'appid' : os.environ['OPENWEATHER_API_KEY'],
    'cnt' : '1', # number of results
    'units' : 'metric'
}
# install flatdict; needed for weather_json_to_df()
#!pip3 install flatdict
weather_lst = []  # NOTE(review): not used in this cell - confirm no other cell relies on it
for c in cities:
    weather_arguments['q'] = c
    # let requests percent-encode the arguments (spaces, diacritics, '&', ...)
    # instead of assembling the query string by hand
    response = requests.get(weather_api, params=weather_arguments, timeout=30)
    assert response.status_code == 200, f'request for {c!r} failed with HTTP status {response.status_code}'
    # requested name vs. the name the API resolved it to
    print(c, "=", response.json()['city']['name'])

Kraków = Krakow
Frankfurt am Main = Frankfurt am Main
Łódź = Łódź Voivodeship
Wrocław = Wrocław
Düsseldorf = Düsseldorf
Málaga = Málaga
The Hague = The Hague
Poznań = Poznań
Gdańsk = Gdańsk
Palma de Mallorca = Palma de Mallorca
Iași = Iasi
Las Palmas = Las Palmas
Malmö = Malmo
Varna = Varna
Cluj-Napoca = Cluj-Napoca
Córdoba = Córdoba
Münster = Münster
Vila Nova de Gaia = Vila Nova de Gaia


In [21]:
# NOTE(review): in the visible cells test_cities is only assigned in a
# commented-out line, and this cell's execution count is lower than the
# previous cell's - confirm it is defined on a fresh Restart & Run All
test_cities

Unnamed: 0,city,date,temp_celcius,temp_feels_like_celcius,humidity_percent,weather_description,clouds_percent,wind_speed_meter_sec,wind_direction_degree,wind_gust_meter_sec,pop_percent,rain_3h_mm,pod
0,Kraków,2022-06-08 12:00:00,18.58,18.69,84,light rain,83,1.02,45,0.97,0.67,1.11,d
1,Kraków,2022-06-08 15:00:00,19.46,19.58,81,light rain,92,1.70,102,2.15,0.39,0.18,d
2,Kraków,2022-06-08 18:00:00,17.94,18.19,92,light rain,97,1.66,75,1.93,0.47,0.32,d
3,Kraków,2022-06-08 21:00:00,15.72,15.80,94,light rain,80,1.04,99,1.20,0.52,0.22,n
4,Kraków,2022-06-09 00:00:00,14.93,14.91,93,overcast clouds,89,1.03,108,1.14,0.34,0.00,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,Vila Nova de Gaia,2022-06-12 21:00:00,21.41,21.59,76,broken clouds,60,0.94,302,1.73,0.00,0.00,n
716,Vila Nova de Gaia,2022-06-13 00:00:00,20.62,20.77,78,broken clouds,70,1.09,179,1.43,0.00,0.00,n
717,Vila Nova de Gaia,2022-06-13 03:00:00,20.35,20.45,77,clear sky,1,1.13,132,1.40,0.00,0.00,n
718,Vila Nova de Gaia,2022-06-13 06:00:00,20.55,20.70,78,clear sky,1,0.79,148,1.31,0.00,0.00,d
