In [1]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import glob
import os
import metpy
import metpy.calc as mpcalc
from metpy.units import units
import numpy as np
from datetime import datetime
import xarray as xr
from pint import UnitRegistry
ureg = UnitRegistry()
import seaborn as sns
from scipy import stats
from scipy.stats import f_oneway
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from scipy.stats import t
from scipy.stats import sem

In [None]:
# Read the header of a data file and pull out the row count and start date
def extract_epoch_from_header(file):
    """Scan a text file for its "startdate:" line.

    The first whitespace-separated token of the first line is read as an
    integer count; the function returns that count minus one together with
    the start date reformatted as "YYYY-MM-DD HH:MM:SS".

    Parameters
    ----------
    file : path to the text file to scan.

    Returns
    -------
    tuple (int, str) when a line containing "startdate:" (case-insensitive)
    is found after the first line, otherwise None.
    """
    with open(file, 'r') as handle:
        header_count = handle.readline().strip().split()[0]
        for raw in handle:
            if "startdate:" not in raw.lower():
                continue
            # The date is the second token on the line, packed as YYYYMMDDhhmmss
            stamp = raw.strip().split()[1]
            date_part = "-".join((stamp[0:4], stamp[4:6], stamp[6:8]))
            time_part = ":".join((stamp[8:10], stamp[10:12], stamp[12:14]))
            return int(header_count) - 1, date_part + " " + time_part
    return None  # no start date line in the file

# Function to load one data file and attach an absolute start time to each row
def load_and_adjust_time(file):
    """Load a whitespace-separated data file and add a 'starttime_dt' column.

    The number of header rows to skip and the epoch (start date) come from
    extract_epoch_from_header. The 'starttime' column is interpreted as a
    number of days and added to the epoch.

    Parameters
    ----------
    file : path to the data file.

    Returns
    -------
    pandas.DataFrame with the file's columns plus 'starttime_dt'.

    Raises
    ------
    ValueError
        If no epoch could be extracted from the file header.
    """
    header = extract_epoch_from_header(file)

    # The helper returns a bare None when no start date is found; check before
    # unpacking, otherwise the unpacking itself fails with a TypeError and the
    # informative error below is never reached.
    if header is None or header[1] is None:
        raise ValueError(f"Could not find epoch for file: {file}")
    nskiprows, epoch = header

    # Load the actual data, skipping the header rows
    df = pd.read_csv(file, skiprows=nskiprows, sep="\\s+")

    # 'starttime' is an offset in days from the epoch found in the header
    df['starttime_dt'] = pd.to_timedelta(df['starttime'], unit='D') + pd.Timestamp(epoch)

    return df

In [None]:
# Base working directory that contains the NMVOC folder
working_dir = "C:\\Year1\\"
# Find all .nas data files (change the path if needed)
prop_pattern = working_dir+"NMVOC\\Data\\data_EBAS_propane\\*.nas"
prop_files = sorted(glob.glob(prop_pattern))

# Fail early with a clear message: pd.concat on an empty list only reports
# "No objects to concatenate", which hides the real cause (wrong path).
if not prop_files:
    raise FileNotFoundError(f"No data files match: {prop_pattern}")

# Read every file and compute its absolute start times
dfs = [load_and_adjust_time(file) for file in prop_files]

# Concatenate all DataFrames and order the rows by time
prop_df = pd.concat(dfs).sort_values('starttime_dt').reset_index(drop=True)

# Show combined DataFrame
print(prop_df)

In [None]:
# Flag values whose rows are discarded
rejected_flags = [0.999, 0.899, 0.456, 0.260, 0.259, 0.256]

df = prop_df
df = df[~df.flag.isin(rejected_flags)]
# 99999.99 in the C3H8 column is also discarded
df = df[df.C3H8 != 99999.99]

# Index the rows by their absolute start time
df.index = df["starttime_dt"]
# Remove columns that are not used in the analysis
df = df.drop(columns=["flag", "starttime", "endtime", "C3H8.1", "C3H8.2"])
df

In [None]:
# Helper columns derived from the start time: hour, month, year, date,
# day of year and a 'MM-DD' label
start_dt = df['starttime_dt'].dt  # datetime accessor shared by all derived columns
df['hour'] = start_dt.hour
df['month'] = start_dt.month
df['year'] = start_dt.year
df['date'] = start_dt.date
df['day'] = start_dt.dayofyear
df["month-day"] = start_dt.strftime("%m-%d")  # Format as 'MM-DD'

In [None]:
# Round every start time to the nearest hour (vectorised)
df["starttime_dt"] = df["starttime_dt"].dt.round("H")
df

In [None]:
# Convert date column to datetime
# (the column was filled from `.dt.date` above; later cells compare it
# against date strings and Timestamps)
df['date'] = pd.to_datetime(df['date'])

# Map a date to the name of its meteorological season
def get_season(date):
    """Return 'Winter', 'Spring', 'Summer' or 'Fall' for an object with a `.month` attribute.

    December-February is Winter, March-May is Spring, June-August is Summer;
    any other month value is reported as Fall.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    return season_by_month.get(date.month, 'Fall')

In [None]:
# Keep only records whose month lies between January and June (inclusive)
start_month = 1  # January
end_month = 6   # June
df_filtered = df[(df['month'] >= start_month) & (df['month'] <= end_month)]

# Years to plot, one subplot (row) per year
years = [2019, 2020, 2021]
fig, axes = plt.subplots(len(years), 1, figsize=(12, 15))

for i, year in enumerate(years):
    # Data of the current year only
    df_year = df_filtered[df_filtered['year'] == year]

    # Plain scatter of C3H8 against date (no hue: the flag column was dropped earlier)
    sns.scatterplot(x='date', y='C3H8', data=df_year, s=100, ax=axes[i])

    # The title no longer claims colouring by flags, since none is applied
    axes[i].set_title(f'Values between January and June {year}')
    axes[i].set_ylabel('C3H8 [ppt]')

    # Same calendar window on every subplot: January 1 to June 30
    axes[i].set_xlim([pd.Timestamp(f'{year}-01-01'), pd.Timestamp(f'{year}-06-30')])

    # Rotate the date labels on every subplot; plt.xticks would only act on
    # the current (last) axes
    axes[i].tick_params(axis='x', rotation=45)

# Set the common x-axis label
axes[-1].set_xlabel('Date')

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Descriptive statistics per time range
# Time ranges as (start, end) date strings, both ends inclusive
time_ranges = [('2011-01-01', '2019-12-31'), ('2020-01-01', '2020-12-31'), ('2021-01-01', '2021-12-31'), ('2022-01-01', '2023-12-31'), ]

columns_of_interest = ['C3H8']

# Select a date window and summarise the requested column(s)
def calculate_stats(df, start_date, end_date, column):
    """Return `describe()` of `column` for rows with start_date <= df['date'] <= end_date.

    Parameters
    ----------
    df : DataFrame with a 'date' column.
    start_date, end_date : inclusive bounds compared against df['date'].
    column : column label or list of labels to summarise.

    Returns
    -------
    The result of pandas `describe()` (count, mean, std, min, quartiles, max).
    """
    in_window = df['date'].between(start_date, end_date)
    return df.loc[in_window, column].describe()

# Collect the per-range tables in a list and concatenate once at the end.
# The loop variable is deliberately NOT called `stats`: that name is the
# scipy.stats module imported at the top, and rebinding it to a DataFrame
# breaks every later `stats.f_oneway(...)` call.
stats_per_range = []
for start_date, end_date in time_ranges:
    print(f"Descriptive statistics for {start_date} to {end_date}:")
    range_stats = calculate_stats(df, start_date, end_date, columns_of_interest )
    # Tag each table with its time range and make the tag part of the index
    range_stats['Time Range'] = f"{start_date} to {end_date}"
    stats_per_range.append(range_stats.set_index(['Time Range'], append=True))

# Combine all ranges into one table (kept at 2 decimals, printed at 1)
all_stats = pd.concat(stats_per_range).round(2)
print(all_stats.round(1))

In [None]:
#ANOVA
# One-way ANOVA of C3H8 across four groups of years.
# Time ranges as (start, end) date strings, both ends inclusive
range_1 = ('2011-01-01', '2019-12-31')
range_2 = ('2020-01-01', '2020-12-31')
range_3 = ('2021-01-01', '2021-12-31')
range_4 = ('2022-01-01', '2023-12-31')

# Missing values are dropped: a single NaN makes the F-statistic NaN.
# Group 1: data between 2011-01-01 and 2019-12-31
group_1 = df[(df['date'] >= range_1[0]) & (df['date'] <= range_1[1])]['C3H8'].dropna()
# Group 2: data between 2020-01-01 and 2020-12-31
group_2 = df[(df['date'] >= range_2[0]) & (df['date'] <= range_2[1])]['C3H8'].dropna()
# Group 3: data between 2021-01-01 and 2021-12-31
group_3 = df[(df['date'] >= range_3[0]) & (df['date'] <= range_3[1])]['C3H8'].dropna()
# Group 4: data between 2022-01-01 and 2023-12-31
group_4 = df[(df['date'] >= range_4[0]) & (df['date'] <= range_4[1])]['C3H8'].dropna()

# Perform one-way ANOVA. `f_oneway` is imported by name at the top of the
# notebook, so this call works whether or not the name `stats` still refers
# to scipy.stats (notebook cells may rebind it).
f_stat, p_value = f_oneway(group_1, group_2, group_3, group_4)

# Display the results
print(f'F-statistic: {f_stat}, p-value: {p_value}')

# Set significance level
alpha = 0.05

# A large p-value does not prove the null hypothesis; it only fails to reject it
if p_value < alpha:
    print(f"Reject the null hypothesis (p-value: {p_value:.4f}). There is a significant difference between the group means.")
else:
    print(f"Fail to reject the null hypothesis (p-value: {p_value:.4f}). There is no significant difference between the group means.")

In [None]:
# Display the working DataFrame in its current state
df

In [None]:

# Map a date to the label of the year group it belongs to
def categorize_year(date):
    """Return the year-group label for an object with a `.year` attribute.

    Labels are '2011-2019', '2020', '2021' and '2022-2023'; any other year
    gives 'Other'.
    """
    year_groups = (
        (range(2011, 2020), '2011-2019'),
        (range(2020, 2021), '2020'),
        (range(2021, 2022), '2021'),
        (range(2022, 2024), '2022-2023'),
    )
    for span, label in year_groups:
        if date.year in span:
            return label
    return 'Other'

# Label every row with its year group
df['year_group'] = df['date'].map(categorize_year)

# Boxplot of C3H8 per year group: unfilled black boxes, no outliers,
# mean drawn as a red diamond
plt.figure(figsize=(10, 6))
mean_marker = {"marker":"D", "markerfacecolor":"red", "markeredgecolor":"black"}
ax_years = sns.boxplot(
    x='year_group', y='C3H8', data=df,
    showfliers=False, showmeans=True, fill=False,
    meanprops=mean_marker,
    boxprops={'color':'black'},
    medianprops={'color':'black'},
    whiskerprops={'color':'black'},
    capprops={'color':'black'},
)

# Axis labels
ax_years.set_xlabel('Time')
ax_years.set_ylabel('$C_3H_8$ [ppt]')

# Write the figure to disk (it is not shown interactively here)
plt.savefig("C:\\YEAR1\\NMVOC\\code\\figures\\c3h8_CMN_years_boxplot.png", dpi=300)


In [None]:
# Descriptive statistics for the three 2020-2021 sub-periods
# Time ranges as (start, end) date strings, both ends inclusive
time_ranges = [('2020-03-09', '2020-05-04'), ('2020-05-05', '2020-10-22'), ('2020-10-23', '2021-12-29')]

columns_of_interest = ['C3H8']

# Select a date window and summarise the requested column(s)
def calculate_stats(df, start_date, end_date, column):
    """Return `describe()` of `column` for rows with start_date <= df['date'] <= end_date.

    `column` may be a single label or a list of labels; both bounds are
    inclusive and are compared against df['date'].
    """
    dates = df['date']
    keep = dates.between(start_date, end_date)
    selection = df.loc[keep, column]
    return selection.describe()

# Collect the per-range tables in a list and concatenate once at the end.
# The loop variable is deliberately NOT called `stats`: that name is the
# scipy.stats module imported at the top, and rebinding it to a DataFrame
# breaks every later `stats.f_oneway(...)` call.
stats_per_range = []
for start_date, end_date in time_ranges:
    print(f"Descriptive statistics for {start_date} to {end_date}:")
    range_stats = calculate_stats(df, start_date, end_date, columns_of_interest )
    # Tag each table with its time range and make the tag part of the index
    range_stats['Time Range'] = f"{start_date} to {end_date}"
    stats_per_range.append(range_stats.set_index(['Time Range'], append=True))

# Combine all ranges into one table, rounded to 3 decimals
all_stats = pd.concat(stats_per_range).round(3)
print(all_stats.round(3))
# Optional export of the combined table:
#all_stats.to_csv("figures/all_time_ranges_stats.csv",sep="\t")

In [None]:
# Reference statistics for the first-lockdown window (9 March - 4 May),
# pooled over the years 2011-2019
start_date = '03-09'  # 'MM-DD', inclusive
end_date = '05-04'    # 'MM-DD', inclusive
years = list(range(2011, 2020))  # 2011 ... 2019

# 'MM-DD' labels of the index; zero-padded, so string comparison follows calendar order
month_day = df.index.strftime('%m-%d')
mask = df.index.year.isin(years) & (month_day >= start_date) & (month_day <= end_date)

# Rows inside the window for the selected years
df_filtered_LD = df[mask]

# Per-year grouping of the selection
grouped_by_year = df_filtered_LD.groupby(df_filtered_LD.index.year)

# Descriptive statistics of C3H8 over the pooled selection, rounded to 2 decimals
descriptive_stats_LD = df_filtered_LD[['C3H8']].describe()
descriptive_stats_rounded_LD = descriptive_stats_LD.round(2)

print(descriptive_stats_rounded_LD)

In [None]:
# Define the time range (Soft (S): 5th May to 22nd October) for the years between 2011 and 2019
# The bounds are compared as strings against zero-padded '%m-%d' labels, so
# they must be zero-padded too: with '05-5' every May date ('05-05' ... '05-31')
# sorts before the start bound and is silently excluded.
start_date = '05-05'  # Month-Day format for start date
end_date = '10-22'    # Month-Day format for end date
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]   # Years of interest

# Create a mask to filter rows based on the time period for the selected years
mask = (df.index.year.isin(years)) & \
       (df.index.strftime('%m-%d') >= start_date) & \
       (df.index.strftime('%m-%d') <= end_date)

# Filter the data for the specific date range and years
df_filtered_S = df[mask]

# Group the data by year
grouped_by_year = df_filtered_S.groupby(df_filtered_S.index.year)


# Compute descriptive statistics of C3H8 over the pooled selection
descriptive_stats_S = df_filtered_S[['C3H8']].describe()

# Round the results to 2 decimal places
descriptive_stats_rounded_S = descriptive_stats_S.round(2)

# Print the results
print(descriptive_stats_rounded_S)

In [None]:
# Reference statistics for the second-lockdown window (23 October - 29 December),
# pooled over the years 2011-2019
start_date = '10-23'  # 'MM-DD', inclusive
end_date = '12-29'    # 'MM-DD', inclusive
years = list(range(2011, 2020))  # 2011 ... 2019

# 'MM-DD' labels of the index; zero-padded, so string comparison follows calendar order
month_day = df.index.strftime('%m-%d')
mask = df.index.year.isin(years) & (month_day >= start_date) & (month_day <= end_date)

# Rows inside the window for the selected years
df_filtered_IILD = df[mask]

# Per-year grouping of the selection
grouped_by_year = df_filtered_IILD.groupby(df_filtered_IILD.index.year)

# Descriptive statistics of C3H8 over the pooled selection, rounded to 2 decimals
descriptive_stats_IILD = df_filtered_IILD[['C3H8']].describe()
descriptive_stats_rounded_IILD = descriptive_stats_IILD.round(2)

print(descriptive_stats_rounded_IILD)

In [None]:
# Print describe() of every column for each (start, end) pair in time_ranges.
# NOTE(review): time_ranges is expected to hold full date strings here
# (presumably the list from the descriptive-statistics cell above) - confirm
# when running cells out of order.
for start_date, end_date in time_ranges:
    # Label-based slice on the DataFrame index (set to starttime_dt earlier)
    filtered_df = df.loc[start_date:end_date]
    print(f"Statistics for {start_date} to {end_date}:")
    print(filtered_df.describe(), "\n")

In [None]:
# Display the working DataFrame in its current state
df

In [None]:
# Add a zero-padded 'MM-DD' label so calendar windows can be selected in any year
df['date'] = pd.to_datetime(df['date'])
df['MonthDay'] = df['date'].dt.strftime('%m-%d')

# The three calendar windows as ('MM-DD', 'MM-DD'), both ends inclusive
time_range_1 = ('03-09', '05-04')
time_range_2 = ('05-05', '10-22')
time_range_3 = ('10-23', '12-29')

# Rows of every year that fall inside each window
month_day = df['MonthDay']
range_1_data = df[month_day.between(*time_range_1)]
range_2_data = df[month_day.between(*time_range_2)]
range_3_data = df[month_day.between(*time_range_3)]

In [None]:
# Boxplots of the three calendar windows, comparing 2020 with 2011-2019, 2021 and 2022-2023

# Split a frame into the four year groups used for the comparison
def get_year_split(data):
    """Split `data` by its 'year' column.

    Returns a 4-tuple of frames, in this order:
    (year == 2020, year == 2021, 2022 <= year <= 2023, 2011 <= year <= 2019).
    """
    years = data['year']
    return (
        data[years == 2020],
        data[years == 2021],
        data[years.between(2022, 2023)],
        data[years.between(2011, 2019)],
    )

# Split each calendar-window selection into its year groups.
# Unpacking order follows get_year_split's return order:
# 2020, 2021, 2022-2023, 2011-2019.
range_1_2020, range_1_2021, range_1_2022_2023, range_1_2011_2019  = get_year_split(range_1_data)
range_2_2020, range_2_2021, range_2_2022_2023, range_2_2011_2019 = get_year_split(range_2_data)
range_3_2020, range_3_2021, range_3_2022_2023,range_3_2011_2019 = get_year_split(range_3_data)


In [None]:

# Add a 'Period' column to distinguish among 2011-2019, 2020, 2021 and 2022-2023.
# `.assign` returns a new frame, so no column is written into a filtered
# slice of df (which triggers pandas' SettingWithCopyWarning).
range_1_2011_2019 = range_1_2011_2019.assign(Period='2011-2019')
range_1_2020 = range_1_2020.assign(Period='2020')
range_1_2021 = range_1_2021.assign(Period='2021')
range_1_2022_2023 = range_1_2022_2023.assign(Period='2022-2023')
range_2_2011_2019 = range_2_2011_2019.assign(Period='2011-2019')
range_2_2020 = range_2_2020.assign(Period='2020')
range_2_2021 = range_2_2021.assign(Period='2021')
range_2_2022_2023 = range_2_2022_2023.assign(Period='2022-2023')
range_3_2011_2019 = range_3_2011_2019.assign(Period='2011-2019')
range_3_2020 = range_3_2020.assign(Period='2020')
range_3_2021 = range_3_2021.assign(Period='2021')
range_3_2022_2023 = range_3_2022_2023.assign(Period='2022-2023')


# Combine the data for each time range (chronological order of the periods)
range_1_combined = pd.concat([range_1_2011_2019,range_1_2020,range_1_2021,range_1_2022_2023])
range_2_combined = pd.concat([range_2_2011_2019,range_2_2020, range_2_2021, range_2_2022_2023])
range_3_combined = pd.concat([range_3_2011_2019, range_3_2020, range_3_2021,range_3_2022_2023 ])

In [None]:
# Display the combined data of the first calendar window
range_1_combined

In [None]:

# Plot the boxplots for each range
plt.figure(figsize=(14, 8))


# Function to plot boxplots with means
def plot_boxplot_with_mean(ax, data, title):
    """Draw a C3H8 boxplot per 'Period' on `ax`.

    Boxes are unfilled and black, outliers are hidden, whiskers span the
    5th-95th percentiles and the mean is drawn as a red diamond.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : DataFrame with 'Period' and 'C3H8' columns.
    title : subplot title.

    The function only uses its own arguments. It previously read the
    notebook globals ax1/ax2/ax3 and i, which failed on the first call
    because ax2 did not exist yet.
    """
    time_order = ['2011-2019', '2020', '2021', '2022-2023']

    # `order` pins each box to the tick label written below, even when a
    # period is missing from `data`. The former `positions=range(4)` only
    # restated the default box positions and is dropped.
    sns.boxplot(x='Period', y='C3H8', data=data, ax=ax, order=time_order,
                showfliers=False,  # Hide outliers
                whis=[5, 95],      # Whiskers representing 5th-95th percentiles
                showmeans=True, fill=False,
                meanprops={"marker":"D", "markerfacecolor":"red", "markeredgecolor":"black"},
                boxprops={'color':'black'}, medianprops={'color':'black'},
                whiskerprops={'color':'black'}, capprops={'color':'black'})
    ax.set_title(title)

    # Tick labels and sizes
    ax.set_xticks(range(0, len(time_order)))
    ax.set_xticklabels(time_order, fontsize=14)
    ax.yaxis.set_tick_params(labelsize=14)

    # Only the left-most subplot carries the y-axis label
    if ax.get_subplotspec().is_first_col():
        ax.set_ylabel('$C_3H_8$ [ppt]', fontsize=14)

    # No legend is drawn: nothing in the plot carries a label
    ax.set_ylim(0,100)
# One subplot per calendar window, side by side (1 row x 3 columns)
# Plot for Time Range 1
ax1 = plt.subplot(1, 3, 1)
plot_boxplot_with_mean(ax1, range_1_combined, 'Time Range 1')

# Plot for Time Range 2
ax2 = plt.subplot(1, 3, 2)
plot_boxplot_with_mean(ax2, range_2_combined, 'Time Range 2')

# Plot for Time Range 3
ax3 = plt.subplot(1, 3, 3)
plot_boxplot_with_mean(ax3, range_3_combined, 'Time Range 3')
# Tighten the layout, save the figure to disk, then display it
plt.tight_layout()
plt.savefig("C:\\YEAR1\\NMVOC\\code\\figures\\c3h8_CMN_COVID.png", dpi=300)
plt.show()


In [None]:
# Boxplot comparison among 3 time ranges (I lockdown, mild measures, II lockdown)
# for the '2011-2019', '2020', '2021', '2022-2023' year groups

# Define the plotting function
def plot_boxplot_with_mean(ax, data, title):
    """Draw a C3H8 boxplot per 'Period' on `ax`.

    Boxes are unfilled and black, outliers are hidden, whiskers span the
    5th-95th percentiles and the mean is drawn as a red diamond.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : DataFrame with 'Period' and 'C3H8' columns.
    title : subplot title.
    """
    time_order = ['2011-2019', '2020', '2021', '2022-2023']

    # `order` pins each box to the tick label written below, even when a
    # period is missing from `data`
    sns.boxplot(
        x='Period', y='C3H8', data=data, ax=ax, order=time_order,
        showfliers=False,  # Hide outliers
        whis=[5, 95],      # Whiskers representing 5th-95th percentiles
        showmeans=True, fill=False,
        meanprops={"marker": "D", "markerfacecolor": "red", "markeredgecolor": "black"},
        boxprops={'color': 'black'}, medianprops={'color': 'black'},
        whiskerprops={'color': 'black'}, capprops={'color': 'black'}
    )

    # Set the title
    ax.set_title(title, fontsize=14)

    # Set x-axis labels
    ax.set_xticks(range(len(time_order)))
    ax.set_xticklabels(time_order, fontsize=12)

    # Only the left-most subplot carries the y-axis label. This is decided
    # from the axes itself rather than by comparing with the global `ax1`.
    if ax.get_subplotspec().is_first_col():
        ax.set_ylabel('C3H8 [ppt]', fontsize=14)

    # Set y-axis tick size
    ax.tick_params(axis='y', labelsize=12)

    # Set y-axis limits
    ax.set_ylim(0, 1000)

# Create the figure
plt.figure(figsize=(14, 8))

# One subplot per calendar window, side by side (1 row x 3 columns)
# Plot for Time Range 1
ax1 = plt.subplot(1, 3, 1)
plot_boxplot_with_mean(ax1, range_1_combined, 'Time Range 1')

# Plot for Time Range 2
ax2 = plt.subplot(1, 3, 2)
plot_boxplot_with_mean(ax2, range_2_combined, 'Time Range 2')

# Plot for Time Range 3
ax3 = plt.subplot(1, 3, 3)
plot_boxplot_with_mean(ax3, range_3_combined, 'Time Range 3')

# Adjust layout to avoid overlapping
plt.tight_layout()

# Show the plot (this figure is not saved to disk)
plt.show()


In [None]:
# Boxplots of the three calendar windows, comparing 2020 with 2011-2019 and 2021-2023

# Split a frame into the three year groups used for the comparison
def get_year_split(data):
    """Split `data` by its 'year' column.

    Returns a 3-tuple of frames, in this order:
    (year == 2020, 2021 <= year <= 2023, 2011 <= year <= 2019).
    """
    years = data['year']
    only_2020 = data[years == 2020]
    after_2020 = data[years.between(2021, 2023)]
    before_2020 = data[years.between(2011, 2019)]
    return only_2020, after_2020, before_2020

# Split each calendar-window selection into its year groups.
# Unpacking order follows get_year_split's return order: 2020, 2021-2023, 2011-2019.
range_1_2020, range_1_2021_2023, range_1_2011_2019  = get_year_split(range_1_data)
range_2_2020, range_2_2021_2023, range_2_2011_2019 = get_year_split(range_2_data)
range_3_2020, range_3_2021_2023, range_3_2011_2019 = get_year_split(range_3_data)

In [None]:
# Add a 'Period' column to distinguish among 2011-2019, 2020 and 2021-2023.
# `.assign` returns a new frame, so no column is written into a filtered
# slice of df (which triggers pandas' SettingWithCopyWarning).
range_1_2011_2019 = range_1_2011_2019.assign(Period='2011-2019')
range_1_2020 = range_1_2020.assign(Period='2020')
range_1_2021_2023 = range_1_2021_2023.assign(Period='2021-2023')
range_2_2011_2019 = range_2_2011_2019.assign(Period='2011-2019')
range_2_2020 = range_2_2020.assign(Period='2020')
range_2_2021_2023 = range_2_2021_2023.assign(Period='2021-2023')
range_3_2011_2019 = range_3_2011_2019.assign(Period='2011-2019')
range_3_2020 = range_3_2020.assign(Period='2020')
range_3_2021_2023 = range_3_2021_2023.assign(Period='2021-2023')


# Combine the data for each time range (chronological order of the periods)
range_1_combined = pd.concat([range_1_2011_2019,range_1_2020,range_1_2021_2023])
range_2_combined = pd.concat([range_2_2011_2019,range_2_2020, range_2_2021_2023])
range_3_combined = pd.concat([range_3_2011_2019, range_3_2020,range_3_2021_2023 ])

In [None]:
# Boxplot comparison among 3 time ranges (I lockdown, mild measures, II lockdown)
# for the '2011-2019', '2020', '2021-2023' year groups

# Define the plotting function
def plot_boxplot_with_mean(ax, data, title):
    """Draw a C3H8 boxplot per 'Period' on `ax` and print each mean next to its marker.

    Boxes are unfilled and black, outliers are hidden, whiskers span the
    5th-95th percentiles and the mean is drawn as a red diamond.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : DataFrame with 'Period' and 'C3H8' columns.
    title : subplot title.
    """
    time_order = ['2011-2019', '2020', '2021-2023']

    # `order` pins each box to the tick label written below, even when a
    # period is missing from `data`
    sns.boxplot(
        x='Period', y='C3H8', data=data, ax=ax, order=time_order,
        showfliers=False,  # Hide outliers
        whis=[5, 95],      # Whiskers representing 5th-95th percentiles
        showmeans=True, fill=False,
        meanprops={"marker": "D", "markerfacecolor": "red", "markeredgecolor": "black"},
        boxprops={'color': 'black'}, medianprops={'color': 'black'},
        whiskerprops={'color': 'black'}, capprops={'color': 'black'}
    )

    # Set the title
    ax.set_title(title, fontsize=14)

    # Set x-axis labels
    ax.set_xticks(range(len(time_order)))
    ax.set_xticklabels(time_order, fontsize=12)

    ax.set_ylabel('C3H8 [ppt]', fontsize=14)
    ax.set_xlabel('Time', fontsize=14)

    # Set y-axis tick size
    ax.tick_params(axis='y', labelsize=12)

    # Set y-axis limits
    ax.set_ylim(0, 1000)

    # Mean per period, re-ordered to the plotting order: groupby sorts its
    # keys alphabetically, which is not guaranteed to match the box positions
    means = data.groupby('Period')['C3H8'].mean().reindex(time_order)
    for i, mean_value in enumerate(means):
        if pd.isna(mean_value):
            continue  # period absent from `data`: nothing to annotate
        # Add text next to the diamond markers
        ax.text(i, mean_value + 6, f'{mean_value:.1f}', color='black', ha='center', fontsize=12)

# Create the figure
plt.figure(figsize=(14, 8))

# One subplot per calendar window, side by side (1 row x 3 columns)
# Plot for Time Range 1
ax1 = plt.subplot(1, 3, 1)
plot_boxplot_with_mean(ax1, range_1_combined, 'I lockdown')

# Plot for Time Range 2
ax2 = plt.subplot(1, 3, 2)
plot_boxplot_with_mean(ax2, range_2_combined, 'Mild restrictions')

# Plot for Time Range 3
ax3 = plt.subplot(1, 3, 3)
plot_boxplot_with_mean(ax3, range_3_combined, 'II lockdown')

# Adjust layout to avoid overlapping
plt.tight_layout()

# Save the figure to disk, then display it
plt.savefig("C:\\YEAR1\\NMVOC\\code\\figures\\c3h8_CMN_COVID_3_timeRanges.png", dpi=300)
plt.show()


In [None]:
# The three calendar windows as ('MM-DD', 'MM-DD'), both ends inclusive
time_range_1 = ('03-09', '05-04')
time_range_2 = ('05-05', '10-22')
time_range_3 = ('10-23', '12-29')

# Select the rows of one year that fall inside a calendar window
def filter_by_time_range(df, year, time_range):
    """Return rows of `df` whose 'date' lies inside `time_range` of `year`.

    Parameters
    ----------
    df : DataFrame with a datetime 'date' column.
    year : calendar year the window is placed in.
    time_range : ('MM-DD', 'MM-DD') pair; both ends are inclusive.
    """
    first_day, last_day = (pd.to_datetime(f"{year}-{edge}") for edge in time_range[:2])
    return df[df['date'].between(first_day, last_day)]

# One-way ANOVA comparing the three calendar windows, pooling the C3H8 values
# of every year from 2011 to 2023.
# Flat lists of values, one per calendar window
data_range_1, data_range_2, data_range_3 = [], [], []

# Year groups to pool, as (first year, last year), both inclusive
years_groups = [(2011, 2019), (2020, 2020), (2021, 2023)]

for start_year, end_year in years_groups:
    for year in range(start_year, end_year + 1):
        # Append this year's values directly to the flat lists. Missing
        # values are dropped: a single NaN makes the F-statistic NaN.
        data_range_1.extend(filter_by_time_range(df, year, time_range_1)['C3H8'].dropna().values)
        data_range_2.extend(filter_by_time_range(df, year, time_range_2)['C3H8'].dropna().values)
        data_range_3.extend(filter_by_time_range(df, year, time_range_3)['C3H8'].dropna().values)

# Perform one-way ANOVA. `f_oneway` is imported by name at the top of the
# notebook, so this call works whether or not the name `stats` still refers
# to scipy.stats (notebook cells may rebind it).
f_stat, p_value = f_oneway(data_range_1, data_range_2, data_range_3)

# Print the results
print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpretation of p-value at the 5% significance level
if p_value < 0.05:
    print("The mean values across the time periods are significantly different.")
else:
    print("No significant difference between the means of the time periods.")

In [None]:
#ANOVA
# One-way ANOVA of C3H8 across three groups of years.
# Time ranges as (start, end) date strings, both ends inclusive
range_1 = ('2011-01-01', '2019-12-31')
range_2 = ('2020-01-01', '2020-12-31')
range_3 = ('2021-01-01', '2023-12-31')

# Missing values are dropped: a single NaN makes the F-statistic NaN.
# Group 1: data between 2011-01-01 and 2019-12-31
group_1 = df[(df['date'] >= range_1[0]) & (df['date'] <= range_1[1])]['C3H8'].dropna()
# Group 2: data between 2020-01-01 and 2020-12-31
group_2 = df[(df['date'] >= range_2[0]) & (df['date'] <= range_2[1])]['C3H8'].dropna()
# Group 3: data between 2021-01-01 and 2023-12-31
group_3 = df[(df['date'] >= range_3[0]) & (df['date'] <= range_3[1])]['C3H8'].dropna()

# Perform one-way ANOVA. `f_oneway` is imported by name at the top of the
# notebook, so this call works whether or not the name `stats` still refers
# to scipy.stats (notebook cells may rebind it).
f_stat, p_value = f_oneway(group_1, group_2, group_3)

# Display the results
print(f'F-statistic: {f_stat}, p-value: {p_value}')

# Set significance level
alpha = 0.05

# A large p-value does not prove the null hypothesis; it only fails to reject it
if p_value < alpha:
    print(f"Reject the null hypothesis (p-value: {p_value:.4f}). There is a significant difference between the group means.")
else:
    print(f"Fail to reject the null hypothesis (p-value: {p_value:.4f}). There is no significant difference between the group means.")

In [None]:
# Whole-dataset comparison of 2020 with 2011-2019 and 2021-2023

# Split a frame into the three year groups used for the comparison
def get_year_split(data):
    """Split `data` by its 'year' column.

    Returns a 3-tuple of frames, in this order:
    (year == 2020, 2021 <= year <= 2023, 2011 <= year <= 2019).
    """
    year_col = data['year']
    is_2020 = year_col == 2020
    is_2021_2023 = year_col.between(2021, 2023)
    is_2011_2019 = year_col.between(2011, 2019)
    return data[is_2020], data[is_2021_2023], data[is_2011_2019]

# Split the whole dataset; unpacking order follows get_year_split's return
# order: 2020, 2021-2023, 2011-2019.
range_2020, range_2021_2023, range_2011_2019  = get_year_split(df)


In [None]:
# Boxplot comparison among 3 year groups: '2011-2019', '2020', '2021-2023'
# Map a timestamp to the label of the period that contains it
def assign_period(date):
    """Return '2011-2019', '2020' or '2021-2023' for `date`, or None outside those periods.

    Each period runs from 1 January of its first year to 31 December
    (00:00) of its last year, both bounds inclusive.
    """
    periods = (
        ('2011-01-01', '2019-12-31', '2011-2019'),
        ('2020-01-01', '2020-12-31', '2020'),
        ('2021-01-01', '2023-12-31', '2021-2023'),
    )
    for first_day, last_day, label in periods:
        if pd.Timestamp(first_day) <= date <= pd.Timestamp(last_day):
            return label
    return None

# Apply this to your dataframe to create a 'Period' column
df['Period'] = pd.to_datetime(df['date']).apply(assign_period)

# Function to plot the boxplot and display the mean values
def plot_boxplot_with_mean(data, ax):
    """Draw a C3H8 boxplot per 'Period' on `ax` and print each mean above its marker.

    Boxes are unfilled and black, outliers are hidden, whiskers span the
    5th-95th percentiles and the mean is drawn as a red diamond.

    Parameters
    ----------
    data : DataFrame with 'Period' and 'C3H8' columns.
    ax : matplotlib Axes to draw on.
    """
    period_order = ['2011-2019', '2020', '2021-2023']

    # `order` fixes the box positions so they match the mean labels below
    sns.boxplot(
        x='Period', y='C3H8', data=data, ax=ax, order=period_order,
        showfliers=False,  # Hide outliers
        whis=[5, 95],      # Whiskers representing 5th-95th percentiles
        showmeans=True,
        fill=False,
        meanprops={"marker": "D", "markerfacecolor": "red", "markeredgecolor": "black"},
        boxprops={'color': 'black'}, medianprops={'color': 'black'},
        whiskerprops={'color': 'black'}, capprops={'color': 'black'}
    )

    # Mean per period, re-ordered to the plotting order: groupby sorts its
    # keys alphabetically, which is not guaranteed to match the box positions
    means = data.groupby('Period')['C3H8'].mean().reindex(period_order)

    # Add the mean values above the diamond markers
    for i, (period, mean_value) in enumerate(means.items()):
        if pd.isna(mean_value):
            continue  # period absent from `data`: nothing to annotate
        ax.text(i, mean_value + 5, f'{mean_value:.2f}', color='black', ha='center', fontsize=10)

    # Set labels and limits
    ax.set_ylabel('$C_3H_8$ [ppt]', fontsize=14)
    ax.set_xlabel('Time', fontsize=14)
    ax.set_ylim(0,  1000)  # Fixed y-axis range
    ax.tick_params(axis='y', labelsize=12)

# Single-panel figure holding the three period boxes
fig, ax = plt.subplots(figsize=(8, 6))
plot_boxplot_with_mean(df, ax)

# Tighten the layout
plt.tight_layout()

# Write the figure to disk (it is not shown interactively here)
plt.savefig("C:\\YEAR1\\NMVOC\\code\\figures\\c3h8_CMN_COVID_yearRanges.png", dpi=300)


In [None]:
#ANOVA
# One-way ANOVA of C3H8 across three groups of years.
# Time ranges as (start, end) date strings, both ends inclusive
range_1 = ('2011-01-01', '2019-12-31')
range_2 = ('2020-01-01', '2020-12-31')
range_3 = ('2021-01-01', '2023-12-31')

# Missing values are dropped: a single NaN makes the F-statistic NaN.
# Group 1: data between 2011-01-01 and 2019-12-31
group_1 = df[(df['date'] >= range_1[0]) & (df['date'] <= range_1[1])]['C3H8'].dropna()
# Group 2: data between 2020-01-01 and 2020-12-31
group_2 = df[(df['date'] >= range_2[0]) & (df['date'] <= range_2[1])]['C3H8'].dropna()
# Group 3: data between 2021-01-01 and 2023-12-31
group_3 = df[(df['date'] >= range_3[0]) & (df['date'] <= range_3[1])]['C3H8'].dropna()

# Perform one-way ANOVA. `f_oneway` is imported by name at the top of the
# notebook, so this call works whether or not the name `stats` still refers
# to scipy.stats (notebook cells may rebind it).
f_stat, p_value = f_oneway(group_1, group_2, group_3)

# Display the results
print(f'F-statistic: {f_stat}, p-value: {p_value}')

# Set significance level
alpha = 0.05

# A large p-value does not prove the null hypothesis; it only fails to reject it
if p_value < alpha:
    print(f"Reject the null hypothesis (p-value: {p_value:.4f}). There is a significant difference between the group means.")
else:
    print(f"Fail to reject the null hypothesis (p-value: {p_value:.4f}). There is no significant difference between the group means.")

In [None]:
# Median (0.5 quantile) of C3H8 within each 'Period' group
median = df.groupby('Period')['C3H8'].quantile(0.5)
median

In [None]:
# Diel (hour-of-day) variation compared among 2011-2019, 2020 and 2021-2023
# Year groups as label -> (first year, last year), both inclusive
time_ranges = {
    "2011-2019": (2011, 2019),
    "2020": (2020, 2020),
    "2021-2023": (2021, 2023)
}

# Hourly mean of C3H8 with a t-based confidence band
def get_hourly_stats(data):
    """Return (mean, lower, upper) of C3H8 per 'hour' with a 95% confidence interval.

    The interval is mean +/- t * std / sqrt(n), with t the two-sided Student
    t critical value for n - 1 degrees of freedom, computed per hour.

    Parameters
    ----------
    data : DataFrame with 'hour' and 'C3H8' columns.

    Returns
    -------
    Three Series indexed by hour: means, lower bounds, upper bounds.
    """
    confidence = 0.95
    by_hour = data.groupby("hour")["C3H8"]
    means = by_hour.mean()
    n_obs = by_hour.count()

    # Standard error of the mean for every hour
    std_error = by_hour.std() / np.sqrt(n_obs)
    # Two-sided critical value, one per hour
    critical = t.ppf((1 + confidence) / 2, df=n_obs - 1)
    half_width = critical * std_error

    return means, means - half_width, means + half_width

# One figure with a mean line and a 95% CI band per year group
fig, ax = plt.subplots(figsize=(12, 10))

colors = ["blue", "green", "orange"]
for i, (label, (start_year, end_year)) in enumerate(time_ranges.items()):
    # Rows whose year falls inside the current group
    range_data = df[df["year"].between(start_year, end_year)]

    # Hourly mean and confidence bounds
    means, lower, upper = get_hourly_stats(range_data)
    hours = means.index

    ax.plot(hours, means, label=f"{label} Mean", color=colors[i], marker='o')
    ax.fill_between(hours, lower, upper, color=colors[i], alpha=0.2, label=f"{label} 95% CI")

# Titles, labels, one tick per hour, legend and grid
ax.set_title("Hourly Means with 95% Confidence Interval (2011-2019, 2020, 2021-2023)")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("$C_3H_8$ [ppt]")
ax.set_xticks(range(24))
ax.legend()
ax.grid()
plt.tight_layout()

plt.show()

In [None]:



# Year groups as label -> (first year, last year), both inclusive
time_periods = {
    "2011-2019": (2011, 2019),
    "2020": (2020, 2020),
    "2021-2023": (2021, 2023),
}

# Calendar windows (one subplot each) as title -> ('MM-DD', 'MM-DD'), both inclusive
date_ranges = {
    "Time Range 1 (03-09 to 05-04)": ("03-09", "05-04"),
    "Time Range 2 (05-05 to 10-22)": ("05-05", "10-22"),
    "Time Range 3 (10-23 to 12-29)": ("10-23", "12-29"),
}

# Hourly mean of C3H8 with a t-based confidence band, restricted to a calendar window
def get_hourly_stats(data, start_date, end_date):
    """Return (mean, lower, upper) of C3H8 per 'hour' inside a 'MM-DD' window.

    Rows are kept when start_date <= data['month-day'] <= end_date. The 95%
    interval is mean +/- t * std / sqrt(n), with t the two-sided Student t
    critical value for n - 1 degrees of freedom, computed per hour.

    Parameters
    ----------
    data : DataFrame with 'month-day', 'hour' and 'C3H8' columns.
    start_date, end_date : inclusive 'MM-DD' bounds.

    Returns
    -------
    Three Series indexed by hour: means, lower bounds, upper bounds.
    """
    confidence = 0.95
    in_window = data["month-day"].between(start_date, end_date)
    by_hour = data[in_window].groupby("hour")["C3H8"]

    means = by_hour.mean()
    n_obs = by_hour.count()

    # Standard error of the mean for every hour
    std_error = by_hour.std() / np.sqrt(n_obs)
    # Two-sided critical value, one per hour
    critical = t.ppf((1 + confidence) / 2, df=n_obs - 1)
    half_width = critical * std_error

    return means, means - half_width, means + half_width

# One subplot per calendar window, sharing the y-axis
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

colors = ["blue", "green", "orange"]

for ax, (title, (start_date, end_date)) in zip(axes, date_ranges.items()):
    # One mean line and one 95% CI band per year group
    for i, (label, (start_year, end_year)) in enumerate(time_periods.items()):
        range_data = df[df["year"].between(start_year, end_year)]
        means, lower, upper = get_hourly_stats(range_data, start_date, end_date)
        hours = means.index

        ax.plot(hours, means, label=f"{label} Mean", color=colors[i], marker='o')
        ax.fill_between(hours, lower, upper, color=colors[i], alpha=0.2, label=f"{label} 95% CI")

    # Title, x label, one tick per hour, legend and grid on every subplot
    ax.set(title=title, xlabel="Hour of Day")
    ax.set_xticks(range(24))
    ax.legend()
    ax.grid()

# Only the left-most subplot carries the y-axis label
axes[0].set_ylabel("$C_3H_8$ [ppt]")

# Tighten the layout, save the figure to disk, then display it
plt.tight_layout()
plt.savefig("C:\\YEAR1\\NMVOC\\code\\figures\\c3h8_diel_Covid.png", dpi=300)
plt.show()
