In [105]:
import asyncio
import re # regular-expression
import json
import datetime

from motor.motor_asyncio import AsyncIOMotorClient
from pymongo.server_api import ServerApi
import httpx
import bs4 # beautifulsoup
import pandas as pd
import tqdm.notebook

In [270]:
location = "Zweisimmen,Switzerland"

In [271]:
def parse_weather_table(date: datetime.date, table: str, location: str):
    soup = bs4.BeautifulSoup(table)
    
    # Finds all times in the Response and combines it to a datetime-object
    timestamps = [
        datetime.datetime.combine(date,datetime.time.fromisoformat(td.text))
        for td in soup.table.thead.find_all("td")
    ]
    
    tbody = soup.table.tbody
    
    data = dict(
        # for temperature and wind, the data can be found in the html-tag 'span class'
        # (with dot-notation instead of comma like in the innerHTML);
        # rain and cloud_percentage are only in the innerHTML, so they are found by the <td> around them
        temp_C = [
            float(span["data-temp"])
            for span in tbody.find("th",text="Temperatur").parent.find_all("span",class_="day_temp")
        ],
        rain_mm = [
            float(td.find("span",class_=lambda c:not c).text.replace(",","."))
            for td in tbody.find("th",text="Niederschlag").parent.find_all("td")
        ],
        wind_kmh = [
            float(span["data-wind"])
            for span in tbody.find("th",text="Wind").parent.find_all("span",class_="day_wind")
        ],
        cloud_percent = [
            float(td.text.replace(",",".").rstrip('%')) # no percentage-symbol, instead of , we need dots for decimals
            for td in tbody.find("th",text="Wolkendecke").parent.find_all("td")
        ],
    )
    
    return pd.DataFrame(index=timestamps,data=data)

In [279]:
async def collect_weather_data(location):
    
    uri = "mongodb+srv://scientificprogramming:***REMOVED***@scientificprogramming.nzfrli0.mongodb.net/test"
    DBclient = AsyncIOMotorClient(uri, server_api=ServerApi('1'))
    db = DBclient.data
    weather2_collection = db.wetter2
    
    data = {}
    failed = {}
    async with httpx.AsyncClient(
        base_url="https://www.wetter2.com/v1/",
    ) as client:
        date_range = tqdm.notebook.tqdm(pd.date_range("2000-01-01","2000-01-03",freq="D"))
        for base_date in date_range:
            day = base_date.day
            month = base_date.month
            date_range.set_description(f"{day=} {month=}")

            res = await client.post(
                url="past-weather/",
                data={
                    "place": location,
                    "day": day,
                    "month": month,
                    "city": location.split(',')[0],
                    "country": location.split(',')[1],              
                },
                headers={
                    "X-Requested-With": "XMLHttpRequest",
                    "authority": "94d9af7f3f88b1f06e944301ae4b886ccf7b12dd",
                },
            )
            res.raise_for_status()

            res_json = res.json()

            res_years = res_json['data']['years']
            if not isinstance(res_years, dict):
                print(f"Cannot parse data for {day=} {month=}: {str(res_years)[:50]}...")
                continue

            for k,v in res_years.items():
                date = datetime.date(year=int(k),day=day,month=month)
                try:
                    df = parse_weather_table(date, v["table"], location).tz_localize("UTC")        
                    df = df.set_index(pd.MultiIndex.from_arrays([location*df.shape[0], date],
                    names=["Location", "Datetime"]))
                        
              #      await insert_data_in_DB(weather2_collection, df)
                except Exception as ex:
                    print(f"{date} failed: {ex!r}")
                    failed[date] = v["table"]
                    continue
                data[date] = df
    len(data)
    return data

In [280]:
data = await collect_weather_data(location)

  0%|          | 0/3 [00:00<?, ?it/s]

2021-01-01 failed: TypeError('Input must be a list / sequence of array-likes.')
2022-01-01 failed: TypeError('Input must be a list / sequence of array-likes.')
2023-01-01 failed: TypeError('Input must be a list / sequence of array-likes.')
2021-01-02 failed: TypeError('Input must be a list / sequence of array-likes.')
2022-01-02 failed: TypeError('Input must be a list / sequence of array-likes.')
2023-01-02 failed: TypeError('Input must be a list / sequence of array-likes.')
2021-01-03 failed: TypeError('Input must be a list / sequence of array-likes.')
2022-01-03 failed: TypeError('Input must be a list / sequence of array-likes.')
2023-01-03 failed: TypeError('Input must be a list / sequence of array-likes.')


In [274]:
async def insert_data_in_DB(collection, data):
    data = data.reset_index().to_dict("records")
    for d in data:
        await collection.replace_one(
            dict(
                Location=d["location"],
                Datetime=d["datetime"],
            ),
            d,
            upsert=True,
        )

In [190]:
import bson

In [191]:
df.to_json()

'{"location":{"1672704000000":"Zweisimmen,Switzerland","1672707600000":"Zweisimmen,Switzerland","1672711200000":"Zweisimmen,Switzerland","1672714800000":"Zweisimmen,Switzerland","1672718400000":"Zweisimmen,Switzerland","1672722000000":"Zweisimmen,Switzerland","1672725600000":"Zweisimmen,Switzerland","1672729200000":"Zweisimmen,Switzerland","1672732800000":"Zweisimmen,Switzerland","1672736400000":"Zweisimmen,Switzerland","1672740000000":"Zweisimmen,Switzerland","1672743600000":"Zweisimmen,Switzerland","1672747200000":"Zweisimmen,Switzerland","1672750800000":"Zweisimmen,Switzerland","1672754400000":"Zweisimmen,Switzerland","1672758000000":"Zweisimmen,Switzerland","1672761600000":"Zweisimmen,Switzerland","1672765200000":"Zweisimmen,Switzerland","1672768800000":"Zweisimmen,Switzerland","1672772400000":"Zweisimmen,Switzerland","1672776000000":"Zweisimmen,Switzerland","1672779600000":"Zweisimmen,Switzerland","1672783200000":"Zweisimmen,Switzerland","1672786800000":"Zweisimmen,Switzerland"},"

In [192]:
all_data = pd.concat([v for k,v in sorted(data.items(),key=lambda kv: kv[0])])
all_data.head()

Unnamed: 0,data,location,temp_C,rain_mm,wind_kmh,cloud_percent
2021-01-01 00:00:00+00:00,2021-01-01,"Zweisimmen,Switzerland",-9.5,0.5,9.0,100.0
2021-01-01 01:00:00+00:00,2021-01-01,"Zweisimmen,Switzerland",-9.6,0.2,8.6,95.0
2021-01-01 02:00:00+00:00,2021-01-01,"Zweisimmen,Switzerland",-9.6,0.2,8.3,90.0
2021-01-01 03:00:00+00:00,2021-01-01,"Zweisimmen,Switzerland",-9.7,0.3,7.9,84.0
2021-01-01 04:00:00+00:00,2021-01-01,"Zweisimmen,Switzerland",-9.7,0.2,7.6,78.0


In [None]:
import matplotlib.pyplot as plt

fig,axes = plt.subplots(4,1,figsize=(16,16),sharex=True)
for ax,(k,v) in zip(axes,all_data.items()):
    v.plot(ax=ax)
    ax.set_ylabel(k)
all_data.temp_C.resample("D").mean().plot(ax=axes[0])
all_data.temp_C.resample("D").min().plot(ax=axes[0])
all_data.temp_C.resample("D").max().plot(ax=axes[0])