This script retrieves sensor data from http://archive.luftdaten.info/.
It takes a list of sensor .csv file URLs as input and saves the resulting data as .csv files.

**This script does not work on Windows PCs because of asyncio**; in that case, you can upload this notebook to Google Colab.
The asyncio library lets us run many requests concurrently (the connector below is limited to 10 simultaneous connections), which makes scraping the data much faster.

In [1]:
import datetime
import requests
import lxml.etree
from lxml import html
import pandas as pd
import asyncio
from concurrent.futures import ThreadPoolExecutor
import re
import io
import sqlalchemy
from aiohttp import ClientSession,TCPConnector
import nest_asyncio
import glob
import config
import numpy as np

# Jupyter already runs an event loop, so driving asyncio from a cell fails
# with "RuntimeError: This event loop is already running". nest_asyncio
# patches asyncio so the loop can be re-entered; background on the fix:
# https://markhneedham.com/blog/2019/05/10/jupyter-runtimeerror-this-event-loop-is-already-running/
nest_asyncio.apply()

In [2]:
## Print the interpreter version: the asyncio code in this notebook was
## written for Python 3.7.
from platform import python_version

print(python_version())

3.7.1


In [3]:
# SQLAlchemy engine for the SQLite database file located at config.DB_PATH;
# store_metadata() further down writes the parsed metadata through it.
engine = sqlalchemy.create_engine(f"sqlite:///{config.DB_PATH}")

### We set the date range to scrape (luftdaten data starts in 2017)

In [4]:
def set_dt_range(start='1/1/2017'):
    """Return every calendar day from ``start`` up to today as strings.

    Args:
        start: First day of the range, in any form ``pandas.date_range``
            accepts (the default is 1 January 2017).

    Returns:
        list[str]: One ``"YYYY-MM-DD"`` string per day, in chronological
        order, ending with the current local date.
    """
    today = datetime.date.today().isoformat()
    return pd.date_range(start, end=today).strftime("%Y-%m-%d").tolist()

In [5]:
# Matches hrefs that end in ".zip" or ".csv", i.e. the data files of a
# directory listing. Written as a raw string: "\." inside a plain string
# literal is an invalid escape sequence and triggers a warning.
regex = re.compile(r"^.*?\.(zip|csv)$")


async def fetch(dt, session):
    """Return the data-file links found on the archive page of one day.

    Args:
        dt: Day to look up; it is appended verbatim to the archive base URL
            (this notebook passes "YYYY-MM-DD" strings).
        session: Object whose ``get(url)`` is an async context manager
            yielding a response with an awaitable ``text()`` (an aiohttp
            ``ClientSession`` in this notebook).

    Returns:
        list: The hrefs of all ``<a>`` elements on the page that match
        ``regex``. This is best effort: if the request or the parsing
        raises, the error is printed and an empty list is returned.
    """
    url = f'http://archive.luftdaten.info/{dt}'

    try:
        async with session.get(url) as response:
            content = await response.text()
            tree = html.fromstring(content)
            return [href for href in tree.xpath('//a/@href') if regex.search(href)]
    except Exception as e:
        # Deliberately broad: one failing day must not abort the whole scrape.
        print(e)
        return []

async def run(lista):
    """Fetch the file listings of all days in ``lista`` concurrently.

    Args:
        lista: Iterable of day identifiers, each passed on to ``fetch``.

    Returns:
        list: One entry per day, in the order of ``lista``. Each entry is
        whatever ``fetch`` returned, or the exception object if the task
        failed, because ``gather`` runs with ``return_exceptions=True``.
    """
    # A single session is shared by every request so connections are kept
    # alive; the connector is capped at 10 simultaneous connections to stay
    # polite towards the archive server.
    polite_connector = TCPConnector(limit=10)
    async with ClientSession(connector=polite_connector) as session:
        pending = [asyncio.ensure_future(fetch(dt, session)) for dt in lista]
        # All response bodies have been processed once gather() completes.
        return await asyncio.gather(*pending, return_exceptions=True)
    
def scrap_all_metadata(dt_lista):
    """Run the asynchronous scrape for ``dt_lista`` and block until done.

    NOTE: a later cell defines a zero-argument function with this same
    name, which rebinds ``scrap_all_metadata`` once that cell has run.

    Args:
        dt_lista: Iterable of day identifiers handed to ``run``.

    Returns:
        The list produced by ``run``: one entry per day.
    """
    event_loop = asyncio.get_event_loop()
    return event_loop.run_until_complete(asyncio.ensure_future(run(dt_lista)))


In [6]:
# Every day from the start date up to today, as "YYYY-MM-DD" strings.
# NOTE(review): the markdown above says the data starts in 2017, yet the
# range begins on 10/1/2015 - confirm which start date is intended.
date_range = set_dt_range(start='10/1/2015')

# Split the days into consecutive slices of at most 100 entries so the
# scrape can be run (and saved) one batch at a time.
chunks = [
    date_range[offset:offset + 100]
    for offset in range(0, len(date_range), 100)
]

def scrap_all_metadata():
    """Scrape the archive chunk by chunk and save each chunk as a CSV file.

    For every slice of days in the notebook-level ``chunks`` the archive
    pages are fetched with ``run`` and the file names found are written to
    ``metadata_<chunk index>.csv`` in the current working directory.

    The event loop is driven directly instead of going through the earlier
    one-argument ``scrap_all_metadata(dt_lista)``: this definition rebinds
    that name, so calling it from here would re-enter this zero-argument
    function and raise ``TypeError``.
    """
    loop = asyncio.get_event_loop()
    for i, c in enumerate(chunks):
        results = loop.run_until_complete(run(c))
        # run() gathers with return_exceptions=True, so an entry may be an
        # exception object rather than a list; keep only non-empty lists.
        frames = [
            pd.DataFrame(names)
            for names in results
            if isinstance(names, list) and names
        ]
        # Concatenate once instead of growing a DataFrame inside the loop.
        df = pd.concat(frames, axis=0) if frames else pd.DataFrame()
        df.to_csv(f'metadata_{i}.csv')


In [7]:
# Paths of the scraped metadata CSV files. glob.glob() already returns a
# list, so no extra copy is needed.
files = glob.glob('../data/luftdaten_metadata/*.csv', recursive=True)

In [8]:
# Exploratory parse of the first metadata file; the same steps are applied
# to every file by store_metadata().
df = pd.read_csv(files[0], index_col=0)
df.columns = ['filename']
# Strip the suffixes as literal text. Without regex=False the '.' in '.csv'
# is treated as a regex wildcard on pandas versions that default to regex.
df['filename'] = (
    df['filename']
    .str.replace('.csv', '', regex=False)
    .str.replace('_sensor', '', regex=False)
    .str.replace('_indoor', '', regex=False)
)
# The first ten characters of the file name are parsed as the date.
df['date'] = pd.to_datetime(df['filename'].str.slice(0, 10))
# Milliseconds since the Unix epoch as floats, computed with timedelta
# arithmetic so the result does not depend on the column's storage unit.
df['timestamp_gmt'] = (df['date'] - pd.Timestamp('1970-01-01')) / pd.Timedelta(milliseconds=1)
# The last underscore-separated token is taken as the integer sensor id.
df['sensor_id'] = df['filename'].str.split('_').str[-1].astype(int)

In [11]:
def store_metadata(file_name, con=None):
    """Parse one scraped metadata CSV and append it to the sensors table.

    The parsing implies file names shaped like
    ``<YYYY-MM-DD>_<sensor type>_sensor_<id>[_indoor].csv``: the first ten
    characters are read as the date, the last underscore-separated token
    (after stripping the suffixes) as the integer sensor id, and the token
    between them as the sensor type.

    Args:
        file_name: Path of a CSV file whose first column is an index and
            whose single remaining column holds the archive file names.
        con: Database connection or engine passed to ``DataFrame.to_sql``.
            Defaults to the notebook-level ``engine``.

    Side effects:
        Appends the columns ``timestamp_gmt`` (milliseconds since the Unix
        epoch, as float), ``sensor_id`` and ``sensor_type_name`` to the
        table ``luftdaten_sensors_metadata``. Rows are appended every time
        the function is called, so loading a file twice duplicates it.
    """
    if con is None:
        con = engine

    df = pd.read_csv(file_name, index_col=0)
    df.columns = ['filename']
    # Strip the suffixes as literal text. Without regex=False the '.' in
    # '.csv' is a regex wildcard on pandas versions that default to regex.
    df['filename'] = (
        df['filename']
        .str.replace('.csv', '', regex=False)
        .str.replace('_sensor', '', regex=False)
        .str.replace('_indoor', '', regex=False)
    )
    df['date'] = pd.to_datetime(df['filename'].str.slice(0, 10))
    # Timedelta arithmetic yields milliseconds regardless of the unit the
    # datetime column is stored in (casting to int64 assumes nanoseconds).
    df['timestamp_gmt'] = (df['date'] - pd.Timestamp('1970-01-01')) / pd.Timedelta(milliseconds=1)
    df['sensor_id'] = df['filename'].str.split('_').str[-1].astype(int)
    df['sensor_type_name'] = df['filename'].str.extract(r'_\s*([^\.]*)\s*\_', expand=False)
    df[['timestamp_gmt', 'sensor_id', 'sensor_type_name']].to_sql(
        'luftdaten_sensors_metadata', con=con, if_exists='append', index=False
    )

In [12]:
# Load every scraped metadata file into the database, printing each path as
# a progress indicator. store_metadata() appends to the table, so running
# this cell twice inserts the same rows twice.
for f in files:
    print(f)
    store_metadata(f)

../data/luftdaten_metadata/metadata_6.csv
../data/luftdaten_metadata/metadata_4.csv
../data/luftdaten_metadata/metadata_3.csv
../data/luftdaten_metadata/metadata_7.csv
../data/luftdaten_metadata/metadata_12.csv
../data/luftdaten_metadata/metadata_2.csv
../data/luftdaten_metadata/metadata_0.csv
../data/luftdaten_metadata/metadata_11.csv
../data/luftdaten_metadata/metadata_9.csv
../data/luftdaten_metadata/metadata_10.csv
../data/luftdaten_metadata/metadata_8.csv
../data/luftdaten_metadata/metadata_1.csv
../data/luftdaten_metadata/metadata_5.csv
../data/luftdaten_metadata/metadata_13.csv
