In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta

In [3]:
# Station codes; each one names a per-station input file in the data folders.
cities = [
    'ABZ', 'ALE', 'AMA', 'AMM', 'ASP', 'BEI', 'BOT', 'BSL',
    'DEL', 'EGB', 'HAD', 'HEL', 'HPB', 'HYY', 'KCE', 'KPZ',
    'MAR', 'MHD', 'MLP', 'MUK', 'NAN', 'NEU', 'POV', 'SAO',
    'SCH', 'SGP', 'UAE', 'VAR', 'VIE', 'WAL', 'ZOT',
]

In [86]:
def combine_datasets(folder, stations=None, data_dir='data'):
    """Stack the per-station CSV files of one dataset folder into one frame.

    Reads ``<data_dir>/<folder>/<station>.csv`` for every station, writes the
    station code into a ``station`` column of each frame, and concatenates
    the frames in station order.

    Parameters
    ----------
    folder : str
        Sub-directory of ``data_dir`` that holds one CSV file per station.
    stations : iterable of str, optional
        Station codes to load. When omitted, the notebook-level ``cities``
        list is used, which is the original behaviour.
    data_dir : str, optional
        Root directory of the dataset folders. Defaults to ``'data'``.

    Returns
    -------
    pandas.DataFrame
        The rows of every station file, with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If a station's CSV file is missing (raised by ``pd.read_csv``).
    ValueError
        If ``stations`` is empty (``pd.concat`` rejects an empty list).
    """
    if stations is None:
        # Looked up at call time so the default follows the current `cities`.
        stations = cities

    frames = []
    for code in stations:
        frame = pd.read_csv(data_dir + '/' + folder + '/' + code + '.csv')
        frame['station'] = code
        frames.append(frame)

    # ignore_index=True renumbers the rows, replacing the separate reset_index.
    return pd.concat(frames, ignore_index=True)

In [87]:
# Stack every station's file from the 'aerosols' folder, then keep only the
# identifying columns followed by aermr01..aermr11, in this order.
aerosols_df = combine_datasets('aerosols')[[
    'station', 'date', 'latitude', 'longitude',
    'aermr01', 'aermr02', 'aermr03', 'aermr04', 'aermr05', 'aermr06',
    'aermr07', 'aermr08', 'aermr09', 'aermr10', 'aermr11',
]]

# Write the combined table without the positional row index.
aerosols_df.to_csv('data/aerosols_data.csv', index=False)

In [88]:
# Stack every station's file from the 'atmospheric' folder and keep the
# identifying columns plus d2m and t2m.
atmospheric_df = combine_datasets('atmospheric')[[
    'station', 'date', 'latitude', 'longitude',
    'd2m', 't2m',
]]

# NOTE(review): the output name is spelled 'atomospheric' (extra 'o'). It is
# kept as-is because other code may read this exact path; confirm before renaming.
atmospheric_df.to_csv('data/atomospheric_data.csv', index=False)

In [89]:
# Stack every station's file from the 'boundary_layer_height' folder and keep
# the identifying columns plus blh.
boundary_layer_height_df = combine_datasets('boundary_layer_height')[[
    'station', 'date', 'latitude', 'longitude',
    'blh',
]]

# Write the combined table without the positional row index.
boundary_layer_height_df.to_csv('data/boundary_layer_height_data.csv', index=False)

In [90]:
# Stack every station's file from the 'gases' folder and keep the identifying
# columns plus the five gas columns.
gases_df = combine_datasets('gases')[[
    'station', 'date', 'latitude', 'longitude',
    'co', 'c5h8', 'no2', 'no', 'so2',
]]

# Write the combined table without the positional row index.
gases_df.to_csv('data/gases_data.csv', index=False)

In [93]:
# Stack every station's file from the 'slow_access' folder and keep the
# identifying columns plus nh3, crwc and c10h16.
slow_access_df = combine_datasets('slow_access')[[
    'station', 'date', 'latitude', 'longitude',
    'nh3', 'crwc', 'c10h16',
]]

# Write the combined table without the positional row index.
slow_access_df.to_csv('data/slow_access_data.csv', index=False)

In [178]:
# Build one table of daily-mean n100 values from the per-station proxy files
# and write it to data/n100_data.csv.
# NOTE(review): only the first five stations (cities[:5]) are processed. This
# looks like a leftover debugging slice; confirm whether all stations are wanted.
n100_frames = []

for c in cities[:5]:
    # Whitespace-delimited file with no header row. The raw string keeps the
    # same separator value while avoiding the invalid-escape warning of '\s+'.
    df = pd.read_table('data/N100_proxy/'+c+'_N100.dat', sep=r'\s+',
                       names=['year', 'month', 'day', 'hour', 'minute', 'n100'])

    if c == 'AMA':
        # Shift AMA timestamps back 4 hours BEFORE taking the calendar day.
        # Previously the shifted column was dropped unused, so the offset had
        # no effect on the daily grouping.
        # NOTE(review): presumably a UTC -> local-time correction; confirm.
        shifted = pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute']])
        shifted -= timedelta(hours=4)
        df['date'] = shifted.dt.normalize()
    else:
        df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

    # Average n100 only; the other columns are discarded from the output anyway.
    daily = df.groupby('date', as_index=False)['n100'].mean()
    daily['date'] = daily['date'].dt.date
    daily['station'] = c
    n100_frames.append(daily)

n100_df = pd.concat(n100_frames)
n100_df = n100_df.reset_index(drop=True)
n100_df = n100_df[['station', 'date', 'n100']]
n100_df.to_csv('data/n100_data.csv', index=False)