## Imports (run all)

In [None]:
import kaleido
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import requests as rq
import json
from tqdm.auto import tqdm
import os
import ast
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import random
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
# from get_satellite_data import get_data

In [None]:

def nice_plot(fig,  x_label, y_label, title=None, height=550, width=800, legend=True, y_range=None, x_range=None):
    """Apply the notebook's shared styling to a plotly figure and return it.

    Args:
        fig: figure to style; it is modified in place through its
            ``update_layout`` / ``update_xaxes`` / ``update_yaxes`` methods.
        x_label: title of the x axis.
        y_label: title of the y axis.
        title: optional figure title, centred horizontally when given.
        height: figure height passed to the layout.
        width: figure width passed to the layout.
        legend: when False the legend is hidden.
        y_range: optional range applied to the y axis.
        x_range: optional range applied to the x axis.

    Returns:
        The same ``fig`` object that was passed in.
    """
    # white background and fixed canvas size
    fig.update_layout(plot_bgcolor='white', height=height, width=width)

    # axis titles
    fig.update_xaxes(title_text=x_label)
    fig.update_yaxes(title_text=y_label)

    if title:
        fig.update_layout(title_text=title, title_x=0.5)

    if not legend:
        fig.update_layout(showlegend=False)

    if y_range:
        fig.update_layout(yaxis_range=y_range)

    # x_range used to be accepted but silently ignored
    if x_range:
        fig.update_layout(xaxis_range=x_range)

    fig.update_layout(
        margin=dict(l=0, r=0, b=0, t=0),
    )

    # black frame around the plot: axis lines mirrored on all four sides
    axis_style = dict(
        showline=True,
        linecolor='black',
        linewidth=2.4,
        mirror='allticks',
        tickwidth=2.4,
        tickcolor='black',
    )
    fig.update_yaxes(**axis_style)
    fig.update_xaxes(showticklabels=True, **axis_style)
    return fig


## Traffic data (not needed for now)

In [None]:

traffic = pd.read_feather('london_traffic.feather')

In [None]:
# change to ndarray to list
traffic.lat_long = traffic.lat_long.apply(lambda x: x.tolist())

In [None]:
traffic['sum'] = traffic.lat_long.apply(lambda x: sum(x))
sums = list(traffic['sum'].unique())

In [None]:
unique_sums = traffic['sum'].unique()
lon_lat_longs = traffic[traffic['sum'].isin(unique_sums)].lat_long.to_list()

In [None]:
# Store the stringified coordinates as a real column: attribute-style
# assignment to a new name does not create a DataFrame column.
traffic['strings'] = traffic.lat_long.astype(str)
unique_strings = traffic['strings'].unique()
# Parse every distinct string back into a Python object, giving one entry per unique point.
lat_longs = [ast.literal_eval(x) for x in unique_strings]


In [None]:

lats = [x[0] for x in lat_longs]
longs = [x[1] for x in lat_longs]

In [None]:
lon_lat_longs = traffic.lat_long.unique()

In [None]:
lon_lat_longs_1 = lon_lat_longs[::2]
lon_lat_longs_2 = lon_lat_longs[1::2]
lon_lat_longs = list(zip(lon_lat_longs_1, lon_lat_longs_2)) # all the unique UK long and lats

In [None]:
# plot lon_lat_longs on map
fig = px.scatter_mapbox(lat=lats, lon=longs, mapbox_style='carto-positron')
fig.update_layout(width=500, height=500)
fig.show()

In [None]:
# Drop bookkeeping columns that are not used below.
traffic.drop(columns=['level_0', 'index', 'id'], inplace=True)
# Turn every coordinate pair into a hashable tuple of plain Python numbers.
# np.asarray makes this work whether the cell still holds ndarrays or has
# already been converted to lists by an earlier cell (lists have no .tolist()).
traffic['lat_long'] = traffic.lat_long.apply(lambda x: tuple(np.asarray(x).tolist()))

In [None]:
import math

def haversine(lat_long1, lat_long2):
    """Great-circle distance in kilometres between two (lat, lon) pairs in degrees.

    Uses the haversine formula with an Earth radius of 6371 km.
    """
    earth_radius_km = 6371
    (lat_a, lon_a), (lat_b, lon_b) = lat_long1, lat_long2

    # half-differences of latitude and longitude, in radians
    half_dlat = math.radians(lat_b - lat_a) / 2
    half_dlon = math.radians(lon_b - lon_a) / 2

    # haversine of the central angle
    hav = math.sin(half_dlat)**2 + math.cos(math.radians(lat_a)) * math.cos(math.radians(lat_b)) * math.sin(half_dlon)**2
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))
    return earth_radius_km * central_angle


In [None]:
# Scratch cell: haversine distance from the last ground row's location to
# every traffic point recorded at one timestamp.
# NOTE(review): `t` is assigned but never used; the filter below hard-codes a
# different timestamp - confirm which one is intended.
t = '2020-05-05 19:00:00'
# NOTE(review): `ground` is only loaded in the Bristol cells further down, so
# those have to be run before this cell.
lat_long = ground.lat_long.iloc[-1]
dists = traffic[traffic['date_time'] == '2021-05-05 07:00:00']['lat_long'].apply(lambda x: haversine(lat_long, x))


In [None]:
def distance_between_points(point1, point2):
    """Return the Euclidean (straight-line) distance between two 2-D points.

    Args:
        point1: pair ``(x1, y1)``.
        point2: pair ``(x2, y2)``.

    Returns:
        Non-negative float distance in the same units as the inputs.
    """
    x1, y1 = point1
    x2, y2 = point2
    # Both differences are squared; the y term previously was not, which made
    # the result negative or meaningless whenever y2 < y1.
    return ((x2 - x1) ** 2 + (y2 - y1) ** 2) ** 0.5


In [None]:
min(traffic[traffic['date_time'] == '2021-05-05 07:00:00']['lat_long'].apply(lambda x: abs(distance_between_points(lat_long, x))))

In [None]:
traffic['strings'] = traffic.lat_long.astype(str)
unique_strings = traffic.strings.unique()
traffic_points = [ast.literal_eval(x) for x in unique_strings]

In [None]:
traffic

In [None]:
hours.sum()

In [None]:
traffic['date'] = traffic.date_time.dt.date
hours = traffic.groupby('date')['count'].count()
px.scatter(hours)

In [None]:
# Coordinates of the traffic counters ...
lat1 = [x[0] for x in traffic_points]
lon1 = [x[1] for x in traffic_points]

# ... and of the ground air-quality stations.
lat2 = [x[0] for x in ground_locs]
lon2 = [x[1] for x in ground_locs]

# Colour by a categorical label so that color_discrete_sequence applies;
# numeric colour values are treated as a continuous scale instead.
fig = px.scatter_mapbox(lat=lat1 + lat2, lon=lon1 + lon2,
                        mapbox_style='carto-positron',
                        color=['traffic'] * len(lat1) + ['ground'] * len(lat2),
                        color_discrete_sequence=['red', 'blue'])


In [None]:
fig.show()

In [None]:
def weight(distance, cutoff=1):
  """Linear distance weighting that falls from 1 at distance 0 to 0 at `cutoff`.

  Args:
    distance: distance to the point being weighted.
    cutoff: distance at and beyond which the weight is 0 (default 1, which
      reproduces the previous hard-coded ``1 - distance`` rule).

  Returns:
    ``1 - distance / cutoff`` when ``distance < cutoff``, otherwise 0.
  """
  if cutoff > 0 and distance < cutoff:
    return 1 - distance / cutoff
  return 0

def get_traffic(traffic_df, lat_long, time=None, cutoff=1):
  """Weight every unique traffic location by its distance to a sensor.

  Args:
    traffic_df: DataFrame with a ``lat_long`` column of (lat, lon) pairs.
    lat_long: (lat, lon) of the sensor the weights are computed for.
    time: currently unused.
    cutoff: distance passed to ``weight`` as its cutoff.

  Returns:
    Dict mapping each unique traffic (lat, lon) tuple to its weight.
  """
  # get traffic locations; the input frame is left unmodified
  unique_strings = traffic_df.lat_long.astype(str).unique()
  # tuples so the points can be used as dict keys even if they were stored as lists
  traffic_points = [tuple(ast.literal_eval(x)) for x in unique_strings]

  # calculate the distance between the sensor and all lat_long points
  distances = {x: haversine(lat_long, x) for x in traffic_points}
  weights = {k: weight(v, cutoff) for k, v in distances.items()}

  return weights

#


## Bristol ground data preprocessing

### Preprocessing steps

In [None]:
# load ground-based data
from pandas.api.types import CategoricalDtype
ground = pd.read_feather('air_quality_data.feather')

# convert ground date time to utc
ground['time'] = pd.to_datetime(ground['Date Time'], utc=True)

# create lat_long column for ground data by parsing the "lat,lon" strings
ground['lat_long'] = ground.geo_point_2d.apply(
    lambda x: x.split(',')).apply(lambda x: (float(x[0]), float(x[1])))

# round lat_long column to 3 dp
ground['lat_long'] = ground.lat_long.apply(
    lambda x: (round(x[0], 3), round(x[1], 3)))

# get unique points from ground data
points = ground.lat_long.unique().tolist()

# ordered categorical so that weekdays sort Monday..Sunday rather than alphabetically
days = ['Monday', 'Tuesday', 'Wednesday',
        'Thursday', 'Friday', 'Saturday', 'Sunday']
cat_type = CategoricalDtype(categories=days, ordered=True)
ground['day_of_week'] = ground.time.dt.day_name().astype(cat_type)

ground['day'] = ground['time'].dt.day
# Series.dt.week was removed in pandas 2.0; isocalendar().week is the
# replacement (note it returns a nullable UInt32 column).
ground['week'] = ground['time'].dt.isocalendar().week

In [None]:
ground = ground[ground['Date Time'] > start]

In [None]:
# combine satellite with ground: for every satellite row take the ground row
# at the same lat_long whose timestamp is nearest (rows without a ground time
# are dropped first; both sides are sorted on the merge key).
# NOTE(review): `satellite` is not created anywhere in the visible notebook -
# confirm where it comes from before running this cell.
bris_combined = pd.merge_asof(satellite.sort_values('time'), ground.sort_values('time').dropna(subset=['time']), on='time', by='lat_long', direction='nearest')

In [None]:
# change type to string
bris_combined['Location'] = bris_combined['Location'].astype(str)

In [None]:
bris_combined.to_feather('bristol_ground+sat.feather')

### Create bristol ground, satellite and combined dataframes (preprocessed)

In [None]:
from get_satellite_data import get_data

In [None]:
# Date window used for the satellite requests.
start, end = '2018-07-11', '2022-10-31'

ground = pd.read_feather('bristol_ground_cleaned.feather')

# Unique station coordinates as [lat, lon] lists, plus a reversed copy
# (each pair flipped) that is handed to get_data.
unique_station_points = ground.lat_long.apply(tuple).unique()
no2_lat_longs = [list(pt) for pt in unique_station_points]
reverse_no2_lat_longs = [list(reversed(pt)) for pt in no2_lat_longs]
no2 = get_data(type='nitrogen dioxide', location=reverse_no2_lat_longs, start_date=start, end_date=end, scale=1)

# Station locations parsed from the raw geo_point_2d strings.
ground_locs = list(map(ast.literal_eval, ground.geo_point_2d.unique()))
bris_combined = pd.read_feather('bristol_ground_sat_weather.feather')

In [None]:
# NOTE(review): `aerosol` is only created by the commented-out get_data call
# below, so this cell raises NameError unless that line has been run in the
# current session.
# aerosol = get_data(type='aerosol', location=reverse_no2_lat_longs, start_date=start, end_date=end, scale=1)
# bris_combined.lat_long = bris_combined.lat_long.apply(lambda x: tuple(x))
# Attach to every aerosol row the nearest-in-time combined row at the same lat_long.
bristol = pd.merge_asof(aerosol.sort_values('time'), bris_combined.sort_values('time').dropna(subset=['time']), on='time', by='lat_long', direction='nearest')

In [None]:
bristol.to_feather('bristol_ground_sat_weather_new.feather')

### Add in weather to create model_df dataframe

In [None]:
# Preferred column order of the final frame.
model_columns = ['longitude', 'latitude', 'time',
       'tropospheric_NO2_column_number_density', 'lat_long', 'Date Time',
       'NOx', 'NO2', 'NO', 'SiteID', 'PM10', 'NVPM10', 'VPM10', 'NVPM2.5',
       'PM2.5', 'VPM2.5', 'CO', 'O3', 'SO2', 'Temperature', 'RH',
       'Air Pressure', 'Location', 'geo_point_2d', 'DateStart', 'DateEnd',
       'Current', 'Instrument Type', 'day_of_week', 'day', 'week',
       'temperature_2m', 'relativehumidity_2m', 'windspeed_10m']

# One merged frame per station; they are concatenated once after the loop
# instead of growing model_df on every iteration.
station_frames = []
for lat, long in bris_combined.lat_long.unique():
    print(lat, long)
    weather_url = f"https://archive-api.open-meteo.com/v1/era5?latitude={lat}&longitude={long}&start_date={start}&end_date={end}&hourly=temperature_2m,relativehumidity_2m,windspeed_10m"
    response = rq.get(weather_url, timeout=120)
    # fail loudly on an HTTP error instead of a confusing KeyError on 'hourly'
    response.raise_for_status()
    weather_json = response.json()
    weather = pd.DataFrame.from_dict(weather_json['hourly'])
    weather['time'] = pd.to_datetime(weather['time'], utc=True)

    # attach the hourly weather reading nearest in time (at most 30 min away)
    station = bris_combined[bris_combined.lat_long == (lat, long)].sort_values('time')
    merged = pd.merge_asof(station, weather.sort_values('time'), on='time', direction='nearest', tolerance=pd.Timedelta('30min'))
    # rows without a ground NO2 reading are of no use as training targets
    station_frames.append(merged.dropna(subset=['NO2']))

if station_frames:
    model_df = pd.concat(station_frames, ignore_index=True)
else:
    model_df = pd.DataFrame(columns=model_columns)

# keep the documented column order, followed by any additional columns
extra_columns = [c for c in model_df.columns if c not in model_columns]
model_df = model_df.reindex(columns=model_columns + extra_columns)

In [None]:
model_df.to_feather('bristol_ground_sat_weather.feather')

## Shortcut: load the preprocessed Bristol ground, satellite, time and weather dataframe (model_df)

In [None]:
# pickle is not part of the notebook's import cell, so import it here
import pickle

# NOTE(review): both frames are read from the same file - confirm that
# `bristol` is not meant to come from a different feather file.
model_df = pd.read_feather('bristol_ground_sat_weather.feather')
bristol = pd.read_feather('bristol_ground_sat_weather.feather')

ground = pd.read_feather('bristol_ground_cleaned.feather')
# tuples are hashable, so the coordinates can be used as dict keys / in unique()
ground.lat_long = ground.lat_long.apply(lambda x: tuple(x))

# get lat_long to location dict
# NOTE(security): unpickling can execute arbitrary code - only load a pickle
# file that was produced locally.
with open('lat_long_to_location.pkl', 'rb') as f:
    lat_long_to_location = pickle.load(f)


## London ground data preprocessing

Read in London ground data

In [None]:
london_ground = pd.read_feather('all_london_data.feather')

Get locations of London ground stations

In [None]:
url = "http://api.erg.kcl.ac.uk/AirQuality/Information/MonitoringSites/GroupName=London/Json"
response = rq.get(url, timeout=60)
# stop on an HTTP error rather than failing later while parsing the body
response.raise_for_status()
data = response.json()

locs = pd.DataFrame.from_dict(data['Sites'], orient='columns')

# expand the nested dictionary
locs = pd.concat([locs.drop(['Site'], axis=1),
                 locs['Site'].apply(pd.Series)], axis=1)

# remove @ sign from column names
locs.columns = locs.columns.str.replace('@', '', regex=False)

# get rid of closed stations; DateClosed is not needed afterwards
locs = locs[locs.DateClosed == ''].reset_index(drop=True)
locs = locs.drop(columns=['DateClosed'])

# stations without coordinates cannot be matched to satellite data;
# copy so that adding a column below does not write into a slice
locs = locs[locs.Latitude != ''].copy()

# (lat, lon) rounded to 3 dp
locs['lat_long'] = [
    (round(float(lat), 3), round(float(lon), 3))
    for lat, lon in zip(locs.Latitude, locs.Longitude)
]

london_lat_longs = locs.lat_long.unique()
# each pair flipped, as passed to get_data
reversed_points = [point[::-1] for point in london_lat_longs]

In [None]:
code_to_lat_long = dict(zip(locs.SiteCode, locs.lat_long))

Add in satellite data

In [None]:
# The locations are requested in the same three batches as before.
location_batches = [reversed_points[:35], reversed_points[35:70], reversed_points[70:]]

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
sat_aerosol = pd.concat([
    get_data(type='aerosol', location=batch, start_date=start, end_date=end, scale=1)
    for batch in location_batches
])

sat = pd.concat([
    get_data(type='nitrogen dioxide', location=batch, start_date=start, end_date=end, scale=1)
    for batch in location_batches
])

In [None]:
# Pair each aerosol reading with the NO2 reading nearest in time at the same
# lat_long, keeping only the columns used later.
kept_columns = ['lat_long', 'time', 'absorbing_aerosol_index', 'tropospheric_NO2_column_number_density']
aerosol_sorted = sat_aerosol.sort_values('time')
no2_sorted = sat.sort_values('time')
london_sat = pd.merge_asof(aerosol_sorted, no2_sorted, on='time', by='lat_long', direction='nearest')[kept_columns]

In [None]:
london_ground['time'] = london_ground['MeasurementDateGMT']

In [None]:
# Prepare the merge keys: site codes as strings so they can be mapped to
# coordinates, and timestamps as timezone-aware (UTC) datetimes.

london_ground['site_code'] = london_ground['site_code'].astype(str)
london_ground['lat_long'] = london_ground['site_code'].map(code_to_lat_long)
london_ground['time'] = pd.to_datetime(london_ground['time'], utc=True)


In [None]:
# merge satellite and ground data
london = pd.merge_asof(london_ground.sort_values('time'), london_sat.sort_values('time'), on='time', by='lat_long',
                direction='nearest')

In [None]:
# Remove the pollutant columns that are not modelled, the site code and the
# original timestamp column in one pass.
unused_columns = [
    'Nitric Oxide (ug/m3)',
    'Oxides of Nitrogen (ug/m3)',
    'Sulphur Dioxide (ug/m3)',
    'site_code',
    'PM10 Particulate (ug/m3)',
    'PM2.5 Particulate (ug/m3)',
    'Carbon Monoxide (mg/m3)',
    'Ozone (ug/m3)',
    'MeasurementDateGMT',
]
london.drop(columns=unused_columns, inplace=True)


Add in weather to london ground and satellite data

In [None]:
# Preferred column order of the final frame.
london_columns = ['Nitrogen Dioxide (ug/m3)', 'time', 'lat_long',
       'absorbing_aerosol_index', 'tropospheric_NO2_column_number_density',
       'temperature_2m', 'relativehumidity_2m', 'windspeed_10m']

# One merged frame per site; concatenated once after the loop instead of
# re-copying london_df on every iteration.
site_frames = []
for i, (lat, long) in enumerate(london.lat_long.dropna().unique(), start=1):
    print(lat, long, i)
    weather_url = f"https://archive-api.open-meteo.com/v1/era5?latitude={lat}&longitude={long}&start_date={start}&end_date={end}&hourly=temperature_2m,relativehumidity_2m,windspeed_10m"
    response = rq.get(weather_url, timeout=120)
    # fail loudly on an HTTP error instead of a confusing KeyError on 'hourly'
    response.raise_for_status()
    weather_json = response.json()
    weather = pd.DataFrame.from_dict(weather_json['hourly'])
    weather['time'] = pd.to_datetime(weather['time'], utc=True)

    # attach the hourly weather reading nearest in time (at most 30 min away)
    site = london[london.lat_long == (lat, long)].sort_values('time')
    merged = pd.merge_asof(site, weather.sort_values('time'), on='time', direction='nearest', tolerance=pd.Timedelta('30min'))
    # rows without a ground NO2 reading cannot be used as targets
    site_frames.append(merged.dropna(subset=['Nitrogen Dioxide (ug/m3)']))

if site_frames:
    london_df = pd.concat(site_frames)
else:
    london_df = pd.DataFrame(columns=london_columns)

# keep the documented column order, followed by any additional columns
extra_columns = [c for c in london_df.columns if c not in london_columns]
london_df = london_df.reindex(columns=london_columns + extra_columns)


In [None]:

# Replace the repeated per-site index with a fresh 0..n-1 index (the old one
# is discarded) and write the result to disk.
london_df.reset_index(drop=True, inplace=True)
london_df.to_feather(path='london_ground_sat_weather.feather')


## Shortcut: load the preprocessed London ground, satellite, time and weather dataframe (london)

In [None]:
london = pd.read_feather(path='london_ground_sat_weather.feather')

## Model experimentation

Add in extra features

In [None]:
# Coordinates as hashable tuples, plus hour/minute taken from the timestamp.
model_df['lat_long'] = model_df.lat_long.apply(tuple)
timestamps = model_df.time
model_df['hour'] = timestamps.dt.hour
model_df['minute'] = timestamps.dt.minute

# One-hot encode the day of the week (one indicator column per day name).
day_indicators = pd.get_dummies(model_df['day_of_week'])
model_df = pd.concat([model_df, day_indicators], axis=1)


Create train and test sets 

In [None]:
# create a list of the lat_longs, leaving out one fixed station;
# filtering (rather than list.remove) does not fail if that station is absent
excluded_lat_long = (51.459, -2.595)
lat_longs = [p for p in model_df.lat_long.unique() if p != excluded_lat_long]

# create a list of 5 random lat_longs to train on
# (fewer if the data contains fewer than 5 locations)
train_lat_longs = random.sample(lat_longs, min(5, len(lat_longs)))

# create a list of the remaining lat_longs to test on
test_lat_longs = [x for x in lat_longs if x not in train_lat_longs]


### Leave-one-location-out evaluation: test on each location in turn and plot the errors on a map

In [None]:
# BaggingRegressor is otherwise only imported in a later cell.
from sklearn.ensemble import BaggingRegressor

# The feature and target lists do not change between iterations.
features = ['hour', 'minute',
            'tropospheric_NO2_column_number_density',
            'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
            'day', 'week',
            'temperature_2m', 'relativehumidity_2m', 'windspeed_10m'
            ]
target = ['NO2']

mses = {}
# train on all but one lat_long, test on the one left out
# (note: this rebinds the global train_lat_longs / test_lat_longs)
for loc in tqdm(lat_longs):
    location = lat_long_to_location[loc]
    test_lat_longs = [loc]
    train_lat_longs = [x for x in lat_longs if x not in test_lat_longs]

    train_df = model_df[model_df.lat_long.isin(train_lat_longs)]
    # reset_index so that row labels line up with positions in `predictions`
    test_df = model_df[model_df.lat_long.isin(test_lat_longs)].reset_index()

    # bootstrap model; the base estimator is passed positionally because its
    # keyword was renamed from `base_estimator` to `estimator` in scikit-learn
    # 1.2 and the old name has since been removed
    model = BaggingRegressor(RandomForestRegressor(
        n_estimators=10, max_depth=6, random_state=42), n_estimators=10, random_state=42)

    # fit the model to the training data
    model.fit(train_df[features], train_df[target].to_numpy().flatten())

    # predict the test data
    predictions = model.predict(test_df[features])

    # error on the rows of the held-out location
    site_rows = test_df[test_df.Location == location]
    mse = mean_squared_error(site_rows.NO2, predictions[site_rows.index])

    mses[loc] = mse

In [None]:

# create new_mses dict without the highest mse one
# (the maximum is computed once instead of once per entry)
worst_mse = max(mses.values(), default=None)
new_mses = {k: v for k, v in mses.items() if v != worst_mse}

In [None]:
# One marker per location, coloured by its error (green = low, red = high).
# The plotted values are mean squared errors, so the title says MSE.
fig = nice_plot(px.scatter_mapbox(lat=[x[0] for x in new_mses.keys()],
                        lon=[x[1] for x in new_mses.keys()],
                        color=list(new_mses.values()),
                        color_continuous_scale='RdYlGn_r',
                        mapbox_style='carto-positron',
                        zoom=11.5,
), x_label=' ', y_label=' ', title='NO2 MSE by Location')
# change marker size
fig.update_traces(marker=dict(size=30))
# fig.write_image(file='MSE_for_each_location_Bristol_RF.pdf', format='pdf')
fig.show()

### normal code

In [None]:

# Target column and model inputs (time of day, satellite NO2 column,
# one-hot weekday, calendar position and weather).
target = ['NO2']
features = [
    'hour', 'minute',
    'tropospheric_NO2_column_number_density',
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
    'day', 'week',
    'temperature_2m', 'relativehumidity_2m', 'windspeed_10m',
]

# Split rows by location; the test frame gets a fresh positional index.
in_train = model_df.lat_long.isin(train_lat_longs)
in_test = model_df.lat_long.isin(test_lat_longs)
train_df = model_df[in_train]
test_df = model_df[in_test].reset_index()




MODEL

In [None]:
# import mlp regressor
from sklearn.neural_network import MLPRegressor

# import functions for convolutional neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout

# import bagging regressor
from sklearn.ensemble import BaggingRegressor

In [None]:
# # random forest regression model
# model = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=42)

# # neural network regression model
# model = MLPRegressor(hidden_layer_sizes=(100, 100, 100), activation='relu', solver='adam', max_iter=1000, random_state=42)

# # neural network model
# model = Sequential()
# model.add(Dense(100, activation='relu', input_shape=(len(features),)))
# model.add(Dense(100, activation='relu'))
# model.add(Dense(100, activation='relu'))
# model.add(Dense(1))
# model.compile(optimizer='adam', loss='mse')
 

# # convolutional neural network model
# model = Sequential()
# model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(len(features), 1)))
# model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
# model.add(MaxPooling1D(pool_size=2))
# model.add(Flatten())
# model.add(Dense(50, activation='relu'))
# model.add(Dense(1))
# model.compile(optimizer='adam', loss='mse')
 
# bootstrap model
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name has
# since been removed, while the positional form works with both.
model = BaggingRegressor(RandomForestRegressor(n_estimators=10, max_depth=6, random_state=42), n_estimators=10, random_state=42)

# fit the model to the training data (target flattened to 1-D)
model.fit(train_df[features], train_df[target].to_numpy().flatten())

# predict the test data
predictions = model.predict(test_df[features])

# calculate the mean squared error
mse = mean_squared_error(test_df[target].to_numpy().flatten(), predictions)

In [None]:
# Reduce (n, 1)-shaped model output to one value per row. Predictions that
# are already 1-D are left unchanged instead of failing on x[0].
predictions = np.asarray(predictions)
if predictions.ndim > 1:
    predictions = predictions[:, 0]

In [None]:
# Draw predicted and measured NO2 as two lines per test location and print
# each location's mean squared error.
fig = nice_plot(go.Figure(), 'time', 'NO2')
for location in test_df.Location.unique():
    site_rows = test_df[test_df.Location == location]
    # test_df has a fresh index, so its labels select the matching predictions
    site_predictions = predictions[site_rows.index]
    fig.add_trace(go.Scatter(x=site_rows.time, y=site_predictions, mode='lines', name=f'{location} Predicted'))
    fig.add_trace(go.Scatter(x=site_rows.time, y=site_rows.NO2, mode='lines', name=f'{location} Actual'))
    site_mse = mean_squared_error(site_rows.NO2, site_predictions)
    print(f'{location} mse: {site_mse}')
fig.update_layout(title='NO2 Measurements', xaxis_title='time', yaxis_title='NO2')
fig.show()

### Per-location loop: build a dataframe of satellite, weather, time and date variables and train a model on it

- TODO: work out how the data should be split (by location).

In [None]:
# R^2 score of a per-location linear regression, one entry per location.
values = []
for lat, long in bris_combined.lat_long.unique():
    # print(lat, long)
    weather_url = f"https://archive-api.open-meteo.com/v1/era5?latitude={lat}&longitude={long}&start_date={start}&end_date={end}&hourly=temperature_2m,relativehumidity_2m,windspeed_10m"
    response = rq.get(weather_url, timeout=120)
    # fail loudly on an HTTP error instead of a confusing KeyError on 'hourly'
    response.raise_for_status()
    weather_json = response.json()
    weather = pd.DataFrame.from_dict(weather_json['hourly'])
    weather['time'] = pd.to_datetime(weather['time'], utc=True)

    # A loop-local frame is used so that the notebook-wide `model_df` is not
    # overwritten with the data of the last location.
    station = bris_combined[bris_combined.lat_long == (lat, long)]
    station_df = pd.merge_asof(station.sort_values('time'), weather.sort_values('time'), on='time', direction='nearest', tolerance=pd.Timedelta('30min')).dropna(subset=['NO2'])

    # NOTE(review): 'day_number' and 'hour' are not created in the visible
    # preprocessing cells - confirm they exist in bris_combined.
    X = station_df[[
        'tropospheric_NO2_column_number_density', # satellite
        'day', 'week', 'day_number', # date
        'hour', # time
        'relativehumidity_2m', 'temperature_2m', 'windspeed_10m', # weather
        ]]
    y = station_df['NO2']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    reg = LinearRegression().fit(X_train, y_train)

    # score once and reuse it for the summary and the printout
    score = reg.score(X_test, y_test)
    values.append(score)
    # location | R^2 | regression coefficients
    print(f"{station.Location.iloc[0]} | {round(score, 2)} | {[round(x, 2) for x in reg.coef_]}")

print(f"mean: {np.mean(values)}")


## Extra: code used for plotting the Bristol data and for the initial download of the London data

In [None]:
fig = nice_plot(px.scatter(bris_combined[bris_combined.Location != 'Marlborough Street'], x='NO2', y='tropospheric_NO2_column_number_density',
                trendline='ols', opacity=0.2, color='Location'), 'Ground NO2 (μg/m³)', 'Satellite NO2 (mol/m²)')

# fig.update_layout(legend=legend_dict2)

In [None]:
# Pearson correlation between ground NO2 and the satellite NO2 column for
# every location, stored as a "<location>: <r>" label keyed by lat_long.
legend_dict2 = {}
for lat_long in bris_combined.lat_long.unique():
    location = bris_combined[bris_combined.lat_long == lat_long].Location.iloc[0]
    no2_pair = bris_combined.loc[bris_combined.Location == location, ['NO2', 'tropospheric_NO2_column_number_density']]
    corr = no2_pair.corr()
    # off-diagonal entry of the 2x2 correlation matrix
    rounded_r = round(corr.iloc[0, 1], 3)
    legend_dict2[lat_long] = f"{location}: {rounded_r}"
    print(rounded_r, location)

In [None]:
sat_aerosol['lat_long'] = sat_aerosol.lat_long.apply(lambda x: tuple(round(i, 3) for i in x))
sat['lat_long'] = sat.lat_long.apply(lambda x: tuple(round(i, 3) for i in x))


In [None]:
bristol_points = list(ground.lat_long.unique())

In [None]:
reverse_bristol_points = [point[::-1] for point in bristol_points]

In [None]:
ground['time2020'] = ground['time'].apply(lambda x: x.replace(year=2020))

In [None]:
ground['date2020'] = ground['time2020'].apply(lambda x: x.date())

In [None]:
# Daily mean per location with all years folded onto 2020.
# numeric_only=True keeps the mean from failing on text columns
# (required since pandas 2.0).
daily_means = ground.sort_values('time2020').groupby(['Location', 'date2020']).mean(numeric_only=True).reset_index()
fig = px.line(daily_means, x='date2020', y='NO2', color='Location', title='NO2 levels in Bristol')


In [None]:
ground['year'] = ground['time'].apply(lambda x: x.year)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=1, cols=2)


In [None]:
# rows with an NO2 reading, shared by both plots
no2_rows = ground.dropna(subset=['NO2'])

# plot average NO2 for each location over each day of the week
# (numeric_only=True keeps the mean from failing on text columns)
by_weekday = no2_rows.groupby(['Location', 'day_of_week']).mean(numeric_only=True).reset_index()
fig1 = nice_plot(px.line(by_weekday, x='day_of_week', y='NO2', color='Location'), x_label='Day of week', y_label='', width=1000, y_range=[0, 140])

# plot average NO2 for each location over each hour of the day
# NOTE(review): an 'hour' column is not added to `ground` in the visible
# cells - confirm it exists before running.
by_hour = no2_rows.groupby(['Location', 'hour']).mean(numeric_only=True).reset_index()
fig2 = nice_plot(px.line(by_hour, x='hour', y='NO2', color='Location'), x_label='Hour of day', y_label='NO2 (μg/m³)', legend=False, y_range=[0, 140])


In [None]:
# # save figure
# fig.write_image('plots/NO2_by_day_of_week.pdf')
# fig.write_image('plots/NO2_by_hour.pdf')
# fig.write_image('plots/NO2_over_year.pdf')


In [None]:
# Build a (lat, lon) tuple per site, each coordinate rounded to 3 dp.
rounded_lat = locs.Latitude.apply(lambda x: round(float(x), 3)).astype(str)
rounded_lon = locs.Longitude.apply(lambda x: round(float(x), 3)).astype(str)
lat_lon_text = rounded_lat + ',' + rounded_lon
locs['lat_long'] = lat_lon_text.apply(lambda x: tuple(map(float, x.split(','))))

In [None]:
ground.dropna(subset=['time'], inplace=True)
ground.sort_values(by='time', inplace=True)

In [None]:
# Parameters for a single-site trial request.
# NOTE(review): `site_codes` is assigned in a later cell, which has to be run first.
site_code = site_codes[0]
start = '2018-07-11'
end = '2022-10-31'
# The second assignment wins: the trial request covers only two days.
end = '2018-07-13'


In [None]:
test = pd.DataFrame.from_dict(response.json(), orient='index')
data = test.Data.iloc[0]



In [None]:
# parse data from list of dictionaries
data = pd.DataFrame.from_dict(data, orient='columns')
# remove @ sign from column names
data.columns = data.columns.str.replace('@', '')
# pivot data so each species is a column
data = data.pivot(index='MeasurementDateGMT', columns='SpeciesCode', values='Value').reset_index()
data.index.name = 'index'
data['Site_Code'] = site_code


In [None]:
[x['@ColumnName'].split(': ')[-1] for x in response.json()['AirQualityData']['Columns']['Column']]


In [None]:
site_codes = locs.SiteCode.unique().tolist()

In [None]:
locs.reset_index().to_feather('locations.feather')

In [None]:
### THIS ONE ###
# time.ctime() is used for the log lines; time is not in the notebook's import cell
import time

# start = '2018-07-11'
start = '2020-07-11'
end = '2022-10-11'
# end = '2018-07-12'

# Frames are collected in a fresh list, so the result holds only this batch
# of sites and does not depend on a `df` left over from an earlier run.
site_frames = []
for site_code in tqdm(site_codes[-37:]):
    url = f"http://api.erg.ic.ac.uk/AirQuality/Data/Wide/Site/SiteCode={site_code}/StartDate={start}/EndDate={end}/Json"
    try:
        response = rq.get(url, timeout=120)
    except rq.RequestException:
        print(f'error,  {site_code}, {time.ctime()}')
        continue
    if response.status_code != 200:
        continue
    try:
        # parse the body once and reuse it
        payload = response.json()['AirQualityData']
        data = pd.DataFrame.from_dict(payload['RawAQData']['Data'])
        columns = payload['Columns']['Column']
        # a single column is delivered as one dict instead of a list of dicts
        if isinstance(columns, dict):
            columns = [columns]
        cols = [x['@ColumnName'].split(': ')[-1] for x in columns]
        cols.insert(0, 'MeasurementDateGMT')
        data.columns = cols
        data['site_code'] = site_code
        data['MeasurementDateGMT'] = pd.to_datetime(data['MeasurementDateGMT'])
    except (KeyError, TypeError, ValueError):
        # malformed or unexpected response for this site: log it and move on
        print(f'error,  {site_code}, {time.ctime()}')
        continue
    site_frames.append(data)
    print(site_code)

if site_frames:
    df = pd.concat(site_frames)
    df.reset_index().to_feather('london_data3.feather')
else:
    print('no site data downloaded, nothing written')
print(f'Finished at {time.ctime()}')


In [None]:
df = pd.concat([pd.read_feather('london_data.feather'), pd.read_feather('london_data2.feather'), pd.read_feather('london_data3.feather')])

In [None]:
df.reset_index().to_feather('all_london_data.feather')

In [None]:
# Convert every pollutant column to the smallest float dtype that fits, and
# store the site code as a category.
pollutant_columns = [
    'Nitric Oxide (ug/m3)',
    'Nitrogen Dioxide (ug/m3)',
    'Ozone (ug/m3)',
    'PM10 Particulate (ug/m3)',
    'PM2.5 Particulate (ug/m3)',
    'Sulphur Dioxide (ug/m3)',
    'Oxides of Nitrogen (ug/m3)',
    'Benzene (ug/m3)',
    'Carbon Monoxide (mg/m3)',
]
for column in pollutant_columns:
    df[column] = pd.to_numeric(df[column], downcast='float')
df['site_code'] = df['site_code'].astype('category')


In [None]:
df.drop(columns='Benzene (ug/m3)', inplace=True)

In [None]:
df['hour'] = df['MeasurementDateGMT'].dt.hour
df['day'] = df['MeasurementDateGMT'].dt.day
df['month'] = df['MeasurementDateGMT'].dt.month
days = ['Monday', 'Tuesday', 'Wednesday',
        'Thursday', 'Friday', 'Saturday', 'Sunday']
cat_type = CategoricalDtype(categories=days, ordered=True)
df['day_of_week'] = df['MeasurementDateGMT'].dt.day_name().astype(cat_type)


In [None]:
import plotly.graph_objects as go

In [None]:
# mean NO2 per weekday over all London sites, drawn in red
# (numeric_only=True keeps the mean from failing on non-numeric columns)
london_by_weekday = df.groupby('day_of_week').mean(numeric_only=True).reset_index()
fig1 = px.line(london_by_weekday, x='day_of_week', y='Nitrogen Dioxide (ug/m3)', title='Nitrogen Dioxide Levels')
# change the colour of the line
fig1.update_traces(line_color='red')
# add ground data (Bristol) in blue
bristol_by_weekday = ground.groupby('day_of_week').mean(numeric_only=True).reset_index()
fig2 = px.line(bristol_by_weekday, x='day_of_week', y='NO2')
fig2.update_traces(line_color='blue')
# nice_plot takes (fig, x_label, y_label, title): the weekday is on the x axis
fig = nice_plot(go.Figure(data=fig1.data + fig2.data), 'Day of Week', 'NO2', 'Nitrogen Dioxide Levels')

In [None]:
# Mean NO2 per weekday for Bristol and London on one figure.
fig = go.Figure()

# each groupby is computed once and reused for x and y
bristol_by_weekday = ground.groupby('day_of_week').mean(numeric_only=True).reset_index()
london_by_weekday = df.groupby('day_of_week').mean(numeric_only=True).reset_index()

fig.add_trace(go.Scatter(x=bristol_by_weekday['day_of_week'], y=bristol_by_weekday['NO2'], name='Bristol'))
fig.add_trace(go.Scatter(x=london_by_weekday['day_of_week'], y=london_by_weekday['Nitrogen Dioxide (ug/m3)'], name='London'))
# nice_plot takes (fig, x_label, y_label, title): the weekday is on the x axis
fig = nice_plot(fig, 'Day of Week', 'NO2', 'Nitrogen Dioxide Levels')


In [None]:
# plot hourly average nitrogen dioxide levels for london and bristol
fig = go.Figure()
# each groupby is computed once and reused for x and y
bristol_by_hour = ground.groupby('hour').mean(numeric_only=True).reset_index()
london_by_hour = df.groupby('hour').mean(numeric_only=True).reset_index()
fig.add_trace(go.Scatter(x=bristol_by_hour['hour'], y=bristol_by_hour['NO2'], name='Bristol'))
fig.add_trace(go.Scatter(x=london_by_hour['hour'], y=london_by_hour['Nitrogen Dioxide (ug/m3)'], name='London'))
# nice_plot takes (fig, x_label, y_label, title): the hour is on the x axis
fig = nice_plot(fig, 'Hour of the day', 'NO2', 'Nitrogen Dioxide Levels')
fig.show()


In [None]:

# pm25_lat_longs = list(ground[ground.Location.isin(pm25_locations)].lat_long.unique())


In [None]:
# Plot the NO2 station coordinates and the PM2.5 station coordinates as two
# marker layers on a map centred on Bristol.
# NOTE(review): `pm25_lat_longs` is only assigned in a commented-out line of
# the previous cell - define it before running this cell.
fig = go.Figure()
fig.add_trace(go.Scattermapbox(lat=[x[0] for x in no2_lat_longs], lon=[x[1] for x in no2_lat_longs], mode='markers', marker=go.scattermapbox.Marker(size=17), text=['NO2']*len(no2_lat_longs), name='NO2'))
fig.add_trace(go.Scattermapbox(lat=[x[0] for x in pm25_lat_longs], lon=[x[1] for x in pm25_lat_longs], mode='markers', marker=go.scattermapbox.Marker(size=17), text=['PM2.5']*len(pm25_lat_longs), name='NO2 & PM2.5'))
fig.update_layout(mapbox_style="carto-positron", mapbox_zoom=11.7,
                  mapbox_center={"lat": 51.454, "lon": -2.594})
# change size of map and remove the margins
fig.update_layout(height=600, margin={"r":0,"t":0,"l":0,"b":0}, width=1000)


In [None]:
fig.write_image('plots/lon_bris_lat_long.pdf')

In [None]:
df.reset_index(inplace=True)

In [None]:
# df.to_feather('all_london_data.feather')

In [None]:
df.sort_values(by=['site_code', 'MeasurementDateGMT'], inplace=True)

In [None]:
from get_satellite_data import get_data

## plotting

In [None]:
bristol.lat_long = bristol.lat_long.apply(lambda x: (x[0], x[1]))

In [None]:
london.lat_long = london.lat_long.apply(lambda x: (x[0], x[1]))
london.lat_long.nunique()

In [None]:
# set time to midnight, change year to 2000
bristol['yearly_avg'] = bristol['time'].apply(lambda x: x.replace(year=2000, hour=0, minute=0, second=0, microsecond=0))

In [None]:
london['yearly_avg'] = london['time'].apply(lambda x: x.replace(year=2000, hour=0, minute=0, second=0, microsecond=0))

In [None]:
# ISO week number of the year-normalised timestamp.
# Series.dt.week was removed in pandas 2.0; isocalendar().week replaces it.
bristol['week'] = bristol['yearly_avg'].dt.isocalendar().week
london['week'] = london['yearly_avg'].dt.isocalendar().week

In [None]:
# plot weekly average nitrogen dioxide levels for the bristol lat_longs in red
# and up to 8 randomly chosen london lat_longs in blue
fig = go.Figure()
for point in bristol.lat_long.unique():
    # weekly means are computed once and reused for x and y
    weekly = bristol[bristol.lat_long == point].groupby('week').mean(numeric_only=True).reset_index()
    # fall back to the coordinates when a point has no entry in the name dict
    fig.add_trace(go.Scatter(x=weekly['week'], y=weekly['NO2'],
                             name=lat_long_to_location.get(point, str(point)), line=dict(color='red')))

london_points = london.lat_long.unique()
# sample positions without replacement so that no site is drawn twice
sample_size = min(8, len(london_points))
for position in np.random.choice(len(london_points), size=sample_size, replace=False):
    point = london_points[position]
    weekly = london[london.lat_long == point].groupby('week').mean(numeric_only=True).reset_index()
    # NOTE(review): lat_long_to_location is loaded from the Bristol pickle and
    # may not contain London sites; missing names fall back to the coordinates.
    fig.add_trace(go.Scatter(x=weekly['week'], y=weekly['Nitrogen Dioxide (ug/m3)'],
                             name=lat_long_to_location.get(point, str(point)), line=dict(color='blue')))
fig = nice_plot(fig, 'Week of the year', 'NO2', width= 1600, height=400)
fig.show()


In [None]:
fig.write_image('plots/lon_bris_year.pdf')

In [None]:
# Map each London lat_long to the name of the first site found there.
# Keys and values come from the same rows; de-duplicating the two columns
# independently could pair a coordinate with another site's name.
first_site_per_point = locs.drop_duplicates(subset='lat_long')
london_lat_long_to_location = dict(zip(first_site_per_point.lat_long, first_site_per_point.SiteName))