In [1]:
import os
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from handling_date_formats import try_parse_dates

In [2]:
# Locate the challenge data folder (relative to this notebook) and read the test split.
base_path = os.path.join(
    '..',
    'data',
    'geoai-ground-level-no2-estimation-challenge20240710-12938-q3sk51',
)
test_path = os.path.join(base_path, 'Test.csv')
test_df = pd.read_csv(test_path)

# Candidate date patterns handed to the project helper `try_parse_dates`.
# NOTE(review): the recorded run reports that both '%d/%m/%y' and '%m/%d/%y'
# were needed; those two patterns are ambiguous for any date whose day is <= 12,
# so confirm how the helper decides between them.
date_formats = [
    '%Y-%m-%d',
    '%d-%m-%Y',
    '%d/%m/%y',
    '%d-%m-%y',
    '%m/%d/%y',
]

# Replace the raw 'Date' column with whatever the helper returns for it.
raw_dates = test_df['Date']
test_df['Date'] = try_parse_dates(raw_dates, date_formats)

# Sanity check: show the first few converted values.
parsed_preview = test_df['Date'].head()
print(parsed_preview)

Multiple date formats needed: ['%d/%m/%y', '%m/%d/%y']
All dates were successfully converted.
0   2019-01-01
1   2019-01-01
2   2019-01-01
3   2019-01-01
4   2019-01-01
Name: Date, dtype: datetime64[ns]


In [3]:
# Split the parsed 'Date' column into its calendar components.
date_parts = test_df['Date'].dt
test_df['Year'] = date_parts.year
test_df['Month'] = date_parts.month
# `.dt.day` is the day of the month, so 'Day' repeats across months and years;
# it does not identify a distinct calendar date.
test_df['Day'] = date_parts.day

# Number of distinct values seen for each component.
unique_years, unique_months, unique_days = (
    test_df[component].nunique() for component in ('Year', 'Month', 'Day')
)

print(f"Number of unique years: {unique_years}")
print(f"Number of unique months: {unique_months}")
# The figure printed below counts distinct day-of-month values, not distinct dates.
print(f"Number of single days: {unique_days}")

Number of unique years: 3
Number of unique months: 12
Number of single days: 31


In [4]:
# Row count of the test split for each calendar year.
rows_by_year = test_df.groupby('Year')
entries_per_year = rows_by_year.size()
# A single print call with a newline separator emits the heading and the
# Series on separate lines.
print("Entries per year:", entries_per_year, sep="\n")

Entries per year:
Year
2019    2190
2020    2196
2021    2190
dtype: int64


In [5]:
# Row count for each calendar month, pooled over every year in the data.
rows_by_month = test_df.groupby('Month')
entries_per_month = rows_by_month.size()
# A single print call with a newline separator emits the heading and the
# Series on separate lines.
print("Entries per month:", entries_per_month, sep="\n")

Entries per month:
Month
1     558
2     510
3     558
4     540
5     558
6     540
7     558
8     558
9     540
10    558
11    540
12    558
dtype: int64


In [6]:
# Count rows for every (Year, Month) pair, then pivot the inner level (Month)
# into columns so each row is one year; pairs with no rows are shown as 0.
year_month_counts = test_df.groupby(['Year', 'Month']).size()
entries_per_month_per_year = year_month_counts.unstack(fill_value=0)
print("Entries per month per year:", entries_per_month_per_year, sep="\n")

Entries per month per year:
Month   1    2    3    4    5    6    7    8    9    10   11   12
Year                                                             
2019   186  168  186  180  186  180  186  186  180  186  180  186
2020   186  174  186  180  186  180  186  186  180  186  180  186
2021   186  168  186  180  186  180  186  186  180  186  180  186


In [9]:
def _no2_total_box(frame, period, period_label=None):
    """Build a box plot of ``NO2_total`` grouped by one calendar component.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing a ``NO2_total`` column and the ``period`` column.
    period : str
        Name of the column placed on the x-axis (for example ``'Year'``).
    period_label : str, optional
        Wording used for the period in the figure title. Falls back to
        ``period`` when omitted.

    Returns
    -------
    The figure object returned by ``px.box``.
    """
    shown_as = period if period_label is None else period_label
    # No `labels=` override is passed: plotly express already uses the column
    # names for the axis titles, so mapping each name to itself added nothing.
    return px.box(
        frame,
        x=period,
        y='NO2_total',
        title=f'NO2_total Distribution by {shown_as}',
    )


# One figure per calendar component, all titled with the same wording pattern.
# The three figure names are kept so later cells can still refer to them.

# Distribution of NO2_total by Year
fig_year = _no2_total_box(test_df, 'Year')
fig_year.show()

# Distribution of NO2_total by Month
fig_month = _no2_total_box(test_df, 'Month')
fig_month.show()

# Distribution of NO2_total by Day of the Month
fig_day = _no2_total_box(test_df, 'Day', period_label='Day of the Month')
fig_day.show()
