In [39]:
import os

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1 import make_axes_locatable
from shapely.geometry import Point

In [40]:
# Fetch the OpenAQ monitoring locations registered for Uganda (country code UG).
# The API key is read from the OPENAQ_API_KEY environment variable so that no
# credential has to be stored in the notebook; when the variable is unset the
# request is sent with the same empty key as before.
OPENAQ_LOCATIONS_URL = "https://api.openaq.org/v2/locations"
res = requests.get(
    OPENAQ_LOCATIONS_URL,
    params={"country": "UG"},
    headers={"X-API-Key": os.environ.get("OPENAQ_API_KEY", "")},
    timeout=30,  # do not let a stalled connection hang the kernel
)
# Fail loudly on 4xx/5xx instead of silently continuing with an empty result list.
res.raise_for_status()
json_data = res.json()
# 'results' holds one dict per monitoring location; default to an empty list if absent.
results = json_data.get('results', [])

In [41]:
# Turn the list of location records returned by the API into a table
# (one row per location, one column per top-level key).
data = pd.DataFrame(data=results)
# Preview at most the first 47 rows.
data.iloc[:47]

Unnamed: 0,id,city,name,entity,country,sources,isMobile,isAnalysis,parameters,sensorType,coordinates,lastUpdated,firstUpdated,measurements,bounds,manufacturers
0,551971,,IPA Busia,,UG,,False,,"[{'id': 128, 'unit': 'f', 'count': 76397, 'ave...",,"{'latitude': 0.455464, 'longitude': 34.12366}",2023-11-29T14:29:01+00:00,2023-02-01T07:13:34+00:00,1080960,"[34.12366, 0.455464, 34.12366, 0.455464]","[{'modelName': 'PurpleAir Sensor', 'manufactur..."
1,367233,,Rakai Hospital,,UG,,False,,"[{'id': 133, 'unit': 'particles/cm³', 'count':...",,"{'latitude': -0.71297, 'longitude': 31.4033}",2023-11-29T14:28:43+00:00,2022-10-20T14:24:48+00:00,873852,"[31.4033, -0.71297, 31.4033, -0.71297]","[{'modelName': 'PurpleAir Sensor', 'manufactur..."
2,367229,,Kasensero HC2,,UG,,False,,"[{'id': 2, 'unit': 'µg/m³', 'count': 140244, '...",,"{'latitude': -0.91343, 'longitude': 31.763}",2023-11-29T14:28:32+00:00,2022-10-20T12:24:52+00:00,1278882,"[31.763, -0.91343, 31.763, -0.91343]","[{'modelName': 'PurpleAir Sensor', 'manufactur..."
3,747107,,Lyantonde RHSP,,UG,,False,,"[{'id': 133, 'unit': 'particles/cm³', 'count':...",,"{'latitude': -0.407055, 'longitude': 31.15865}",2023-11-29T14:28:31+00:00,2023-03-22T16:00:01+00:00,992916,"[31.15865, -0.407055, 31.15865, -0.407055]","[{'modelName': 'PurpleAir Sensor', 'manufactur..."
4,368591,,Mpugwe HC3,,UG,,False,,"[{'id': 128, 'unit': 'f', 'count': 65579, 'ave...",,"{'latitude': -0.26994, 'longitude': 31.81011}",2023-11-29T14:28:26+00:00,2022-10-25T00:01:30+00:00,1105308,"[31.81011, -0.26994, 31.81011, -0.26994]","[{'modelName': 'PurpleAir Sensor', 'manufactur..."
5,370297,,UroCare,,UG,,False,,"[{'id': 129, 'unit': 'particles/cm³', 'count':...",,"{'latitude': 0.35329, 'longitude': 32.53511}",2023-11-29T14:28:15+00:00,2022-11-07T16:25:09+00:00,930816,"[32.53511, 0.35329, 32.53511, 0.35329]","[{'modelName': 'PurpleAir Sensor', 'manufactur..."
6,367238,,Byakabanda HC3,,UG,,False,,"[{'id': 134, 'unit': '%', 'count': 75022, 'ave...",,"{'latitude': -0.75919, 'longitude': 31.385141}",2023-11-29T14:28:15+00:00,2022-10-20T14:31:28+00:00,1270296,"[31.385141, -0.75919, 31.385141, -0.75919]","[{'modelName': 'PurpleAir Sensor', 'manufactur..."
7,368621,,Nyendo HC3,,UG,,False,,"[{'id': 128, 'unit': 'f', 'count': 95, 'averag...",,"{'latitude': -0.32349, 'longitude': 31.76217}",2023-11-29T14:28:14+00:00,2022-10-25T00:25:08+00:00,1296991,"[31.76217, -0.32349, 31.76217, -0.32349]","[{'modelName': 'PurpleAir Sensor', 'manufactur..."
8,370296,,UVRI,,UG,,False,,"[{'id': 19, 'unit': 'µg/m³', 'count': 78634, '...",,"{'latitude': 0.07483, 'longitude': 32.45825}",2023-11-29T14:28:09+00:00,2022-10-30T14:24:05+00:00,789966,"[32.45825, 0.07483, 32.45825, 0.07483]","[{'modelName': 'PurpleAir Sensor', 'manufactur..."
9,1443366,,Mutukula HC3,,UG,,False,,"[{'id': 1, 'unit': 'µg/m³', 'count': 9915, 'av...",,"{'latitude': -0.99005, 'longitude': 31.41993}",2023-11-29T14:28:06+00:00,2023-08-16T18:48:54+00:00,117336,"[31.41993, -0.99005, 31.41993, -0.99005]","[{'modelName': 'N/A', 'manufacturerName': 'Ope..."


In [None]:
# data[['name']]

In [None]:
# General info: column names, non-null counts, dtypes and memory usage of `data`.
data.info()

In [None]:
# Dimensions of `data` as a (number of rows, number of columns) tuple.
data.shape

In [None]:
# dtype pandas inferred for each column of `data`.
data.dtypes

In [None]:
# Number of features/variables = number of columns, i.e. the second entry of
# the (rows, columns) shape tuple.
_, num_variables = data.shape
print(f"Number of features/variables: {num_variables}")

Type of Each Feature:

| **Categorical Features** | **Numerical Features**      | **Temporal Features**       |
|--------------------------|-----------------------------|-----------------------------|
| city                     | id                          | lastUpdated                 |
| name                     | isMobile                    | firstUpdated                |
| entity                   | isAnalysis                  |                             |
| country                  | parameters                  |                             |
| sources                  | coordinates                 |                             |
| sensorType               | measurements                |                             |
| manufacturers            | bounds                      |                             |

In [None]:
# Boolean mask with the same shape as `data`: True wherever a value is missing.
data.isna()

In [None]:
# Per-column total of missing values (sum of the True entries in the missing-value mask).
data.isna().sum(axis=0)


In [None]:
# Columns retained for the analysis.
selected_columns = [
    'id',
    'name',
    'country',
    'isMobile',
    'parameters',
    'coordinates',
    'lastUpdated',
    'firstUpdated',
    'measurements',
    'bounds',
    'manufacturers',
]

# Keep every row, restricted to the retained columns.
selected_df = data.loc[:, selected_columns]

selected_df.head()

In [None]:
# Summary statistics of the selected columns (by default pandas restricts
# describe() to the numeric columns when a frame has mixed dtypes).
selected_df.describe()

In [None]:
# Compare air quality measurements over time to identify trends and patterns.
# Are there specific months, seasons, or days of the week when air quality is consistently better or worse?

# Build a time-indexed series WITHOUT modifying `data`. Moving 'lastUpdated'
# into the index in place removes the column, so this cell could not be re-run
# and later cells reading data['lastUpdated'] raised a KeyError. The fallback
# to data.index covers a kernel in which that has already happened.
timestamps = data['lastUpdated'] if 'lastUpdated' in data.columns else data.index
measurements_by_update = pd.Series(
    data['measurements'].to_numpy(),
    index=pd.DatetimeIndex(pd.to_datetime(timestamps), name='lastUpdated'),
)

# NOTE(review): in the previewed rows `measurements` is a large integer per
# location and looks like a count of recorded readings rather than a pollutant
# concentration - confirm against the OpenAQ schema before interpreting the
# y-axis as "air quality".

# Resample data to daily or weekly averages for smoother trends
resampled_data = measurements_by_update.resample('D').mean()  # 'D' for daily, 'W' for weekly

# Plotting the temporal trends (days without any location produce NaN gaps in the line)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(resampled_data.index, resampled_data, marker='o', linestyle='-', color='b')
ax.set_title('Temporal Trends in Air Quality')
ax.set_xlabel('Date')
ax.set_ylabel('Average Air Quality Measurement')
ax.grid(True)
plt.show()

The line plot shows the average air quality measurement for each day (grouped by each location's `lastUpdated` date) from July 2021 to Nov 2023. The air quality measurement is on a scale of 0 to 1, with 0 being the best air quality and 1 being the worst.

The overall trend in air quality is positive, with the average air quality measurement decreasing over time. This suggests that air quality is improving overall. However, there are some fluctuations in air quality from month to month.

In [None]:
# univariate visualization: 01
# geographical distribution of air quality measurements using the latitude and longitude information.

# Build the geometries on a separate GeoDataFrame so `data` itself is not modified.
# Rows without coordinates cannot be placed on the map and are skipped.
located = data[data['coordinates'].notna()]
# shapely points are (x, y) = (longitude, latitude)
location_points = [Point(coord['longitude'], coord['latitude']) for coord in located['coordinates']]
# NOTE(review): CRS assumed to be WGS84 lon/lat (EPSG:4326) - confirm with the data source.
gdf = gpd.GeoDataFrame(located.copy(), geometry=location_points, crs="EPSG:4326")

fig, ax = plt.subplots(figsize=(12, 8))

# Dedicated axes for the colorbar, to the right of the map.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.1)

# A single log norm shared by the points and the colorbar, so the colorbar
# really describes the plotted colors. A log scale needs strictly positive limits.
positive_values = gdf.loc[gdf['measurements'] > 0, 'measurements']
color_norm = LogNorm(vmin=positive_values.min(), vmax=positive_values.max())

# Color-coding based on air quality measurements: `column=` is the geopandas way
# to color geometries by a value, and with legend=True it draws the colorbar on `cax`.
gdf.plot(
    column='measurements',
    ax=ax,
    cax=cax,
    markersize=(gdf['measurements'] / 5000).to_numpy(),
    cmap='viridis',
    norm=color_norm,
    legend=True,
    legend_kwds={'label': 'Air Quality Measurements'},
    alpha=0.7,
)

# Raise the title slightly above the axes.
ax.set_title('Spatial Distribution of Air Quality Measurements', y=1.05)

# Label the map axes explicitly: once the colorbar axes exist they are the
# "current" pyplot axes, so plt.xlabel/plt.ylabel would label the colorbar instead.
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
plt.show()



The air quality measurements from the scatter plot show the highest concentrations of air pollution in the southwest corner of the image, where the latitude and longitude values are lowest. The air pollution then decreases in concentration towards the northeast corner of the image, where the latitude and longitude values are highest. This suggests that there may be a source of air pollution in the southwest corner of the image, such as a factory or power plant.

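The colors in the scatter plot represent different levels of air pollution on a logarithmic scale. Because the plot uses the `viridis` colormap, the brightest (yellow) colors represent the highest values and the darkest (purple) colors the lowest.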

In [None]:
# univariate visualization: 02
# Histogram of Air Quality Measurements

# Explicit figure/axes pair; 30 equal-width bins over the `measurements` column.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data['measurements'], bins=30, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Air Quality Measurements')
ax.set_xlabel('Air Quality Measurements')
ax.set_ylabel('Frequency')
# Horizontal guide lines only, kept faint so the bars stay readable.
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


The x-axis shows the air quality measurements; the `1e6` printed on the axis is matplotlib's scale factor (tick values are in millions), not parts per million.
The y-axis shows the frequency, in number of measurements.
The highest point on the histogram is at 0.3, which means that there were more measurements at this level than any other.
The histogram is roughly symmetrical, meaning that there are about the same number of measurements above and below the mean.
The histogram has a long tail at the higher end, which means that there are a few measurements with very high air quality levels.

In [None]:
# multivariate visualization: 01

# Id of the parameter whose 'average' is compared across locations.
# parameters[0] is NOT the same quantity for every location (the previewed rows
# start with units 'f', '%', 'µg/m³' and 'particles/cm³'), so taking the first
# entry mixed unrelated quantities on one axis.
# NOTE(review): id 133 is reported with unit 'particles/cm³' in the previewed
# rows; which particle-size bin it denotes is not visible here - confirm.
PARTICLE_PARAMETER_ID = 133


def _parameter_average(parameters, parameter_id):
    """Return the 'average' of the entry whose 'id' equals parameter_id, or NaN if absent."""
    if not isinstance(parameters, (list, tuple)):
        return float('nan')
    for entry in parameters:
        if entry.get('id') == parameter_id:
            return entry.get('average', float('nan'))
    return float('nan')


# Extract latitude and average particles data
latitude = data['coordinates'].apply(lambda x: x['latitude'])
average_particles = data['parameters'].apply(_parameter_average, parameter_id=PARTICLE_PARAMETER_ID)
# Previously undefined (NameError); the legend is titled 'Sensor Name', so use the location name.
sensor_names = data['name']

# Tidy frame for plotting; locations that do not report the parameter are dropped.
particle_frame = pd.DataFrame({
    'latitude': latitude.to_numpy(),
    'average_particles': average_particles.to_numpy(),
    'sensor_name': sensor_names.to_numpy(),
}).dropna(subset=['average_particles'])

# Create a scatter plot
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=particle_frame,
    x='latitude',
    y='average_particles',
    hue='sensor_name',
    palette='viridis',
    ax=ax,
)
ax.set_title('Relationship between Latitude and Average Particles per Cubic Meter')
ax.set_xlabel('Latitude')
ax.set_ylabel('Average Particles per Cubic Meter')
# Legend outside the axes so it does not cover the points.
ax.legend(title='Sensor Name', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

The graph shows a general trend of decreasing average particle concentration with increasing latitude. This is likely due to a number of factors, including temperature, population density, and vegetation cover.

The average particle concentration ranges from about 200 to 800 particles per cubic meter. The highest average particle concentrations are found in urban areas, such as Kampala and Jinja. The lowest average particle concentrations are found in rural areas, such as Buyamba HC3 and Mutukula HC3.

In [None]:
# multivariate visualization: 02

# If an earlier cell moved 'lastUpdated' into the index, bring it back as a
# regular column; otherwise data['lastUpdated'] below raises a KeyError.
if 'lastUpdated' not in data.columns and data.index.name == 'lastUpdated':
    data = data.reset_index()

# Assuming 'coordinates' column contains latitude and longitude information
data['latitude'] = data['coordinates'].apply(lambda x: x['latitude'])
data['longitude'] = data['coordinates'].apply(lambda x: x['longitude'])

# Convert 'lastUpdated' to datetime (a no-op if it already is)
data['lastUpdated'] = pd.to_datetime(data['lastUpdated'])

# Create a 3D scatter plot: time on x, position on y/z, `measurements` drives
# both the color and the marker size.
fig = px.scatter_3d(data, x='lastUpdated', y='latitude', z='longitude',
                    color='measurements', size='measurements', size_max=30,
                    opacity=0.7, title='Time Series 3D Scatter Plot')

# Show the interactive plot
fig.show()

The graph shows a sharp increase in the number of measurements in late 2023. This could be due to a number of factors, such as increased public awareness of air quality issues, or a new government initiative to monitor air quality more closely.
The graph shows a slight dip in the number of measurements in late 2023. This could be due to a number of factors, such as a decrease in funding for air quality monitoring programs, or a change in the priorities of government agencies.
The graph shows a significant seasonal pattern, with more measurements taken in the summer months than in the winter months. This is likely due to a number of factors, such as weather conditions, human activities, and plant growth.