In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns

In [20]:
df = pd.read_csv('NYC_Weather_2016_2022.csv')

In [None]:
df.columns

Index(['time', 'temperature_2m (°C)', 'precipitation (mm)', 'rain (mm)',
       'cloudcover (%)', 'cloudcover_low (%)', 'cloudcover_mid (%)',
       'cloudcover_high (%)', 'windspeed_10m (km/h)', 'winddirection_10m (°)'],
      dtype='object')

In [22]:
df_new = df.copy()
df_new = df_new.dropna(subset=['temperature_2m (°C)', 'precipitation (mm)', 'rain (mm)',
       'cloudcover (%)', 'cloudcover_low (%)', 'cloudcover_mid (%)',
       'cloudcover_high (%)', 'windspeed_10m (km/h)', 'winddirection_10m (°)'])

In [23]:
df_new['time']

0        2016-01-01T00:00
1        2016-01-01T01:00
2        2016-01-01T02:00
3        2016-01-01T03:00
4        2016-01-01T04:00
               ...       
59611    2022-10-19T19:00
59612    2022-10-19T20:00
59613    2022-10-19T21:00
59614    2022-10-19T22:00
59615    2022-10-19T23:00
Name: time, Length: 59587, dtype: object

In [24]:
# Split the 'time' column into 'date' and 'time' columns
df_new['date'] = df_new['time'].str.split('T').str[0]
df_new['time_only'] = df_new['time'].str.split('T').str[1]

# Display the first few rows to verify the changes
df_new[['date', 'time_only']]

# Drop the original 'time' column if you no longer need it
df_new = df_new.drop('time', axis=1)

# Convert to datetime objects if needed
df_new['date'] = pd.to_datetime(df_new['date'])
df_new['time_only'] = pd.to_datetime(df_new['time_only'], format='%H:%M').dt.time

In [25]:
# First, get all column names
all_columns = df_new.columns.tolist()

# Remove 'date' and 'time_only' from the list (if they exist)
remaining_columns = [col for col in all_columns if col not in ['date', 'time_only']]

# Create new column order with date and time first, followed by all other columns
new_column_order = ['date', 'time_only'] + remaining_columns

# Reorder the DataFrame columns
df_new = df_new[new_column_order]

# Display the DataFrame to verify the changes
df_new

Unnamed: 0,date,time_only,temperature_2m (°C),precipitation (mm),rain (mm),cloudcover (%),cloudcover_low (%),cloudcover_mid (%),cloudcover_high (%),windspeed_10m (km/h),winddirection_10m (°)
0,2016-01-01,00:00:00,7.6,0.0,0.0,69.0,53.0,0.0,72.0,10.0,296.0
1,2016-01-01,01:00:00,7.5,0.0,0.0,20.0,4.0,0.0,56.0,9.8,287.0
2,2016-01-01,02:00:00,7.1,0.0,0.0,32.0,3.0,0.0,99.0,9.7,285.0
3,2016-01-01,03:00:00,6.6,0.0,0.0,35.0,5.0,0.0,100.0,9.2,281.0
4,2016-01-01,04:00:00,6.3,0.0,0.0,34.0,4.0,0.0,100.0,9.1,279.0
...,...,...,...,...,...,...,...,...,...,...,...
59611,2022-10-19,19:00:00,12.2,0.0,0.0,68.0,9.0,100.0,0.0,21.2,212.0
59612,2022-10-19,20:00:00,12.2,0.0,0.0,61.0,8.0,90.0,0.0,20.3,207.0
59613,2022-10-19,21:00:00,12.2,0.0,0.0,62.0,21.0,72.0,0.0,16.6,207.0
59614,2022-10-19,22:00:00,11.1,0.0,0.0,4.0,0.0,6.0,0.0,10.8,206.0


In [26]:
# Define aggregation methods for different columns
aggregation_dict = {
    'temperature_2m (°C)': 'mean',
    'precipitation (mm)': 'sum',  # Sum for precipitation
    'rain (mm)': 'sum',  # Sum for rain
    'cloudcover (%)': 'mean',
    'cloudcover_low (%)': 'mean',
    'cloudcover_mid (%)': 'mean',
    'cloudcover_high (%)': 'mean',
    'windspeed_10m (km/h)': 'mean',
    'winddirection_10m (°)': 'mean'
}

# Group by date and apply the appropriate aggregation method for each column
daily_means = df_new.groupby('date').agg(aggregation_dict).round(2)

# Reset the index to make 'date' a column instead of an index
daily_means = daily_means.reset_index()

# Display the result
daily_means

Unnamed: 0,date,temperature_2m (°C),precipitation (mm),rain (mm),cloudcover (%),cloudcover_low (%),cloudcover_mid (%),cloudcover_high (%),windspeed_10m (km/h),winddirection_10m (°)
0,2016-01-01,5.41,0.0,0.0,57.29,34.50,9.71,77.75,12.48,273.33
1,2016-01-02,2.39,0.0,0.0,10.25,9.92,1.42,1.50,13.38,266.42
2,2016-01-03,3.01,0.0,0.0,10.29,11.29,0.08,0.00,12.75,244.38
3,2016-01-04,0.10,0.0,0.0,14.75,11.83,6.75,0.00,16.85,313.54
4,2016-01-05,-6.78,0.0,0.0,0.67,0.75,0.00,0.00,17.65,304.25
...,...,...,...,...,...,...,...,...,...,...
2478,2022-10-14,15.46,12.6,12.6,49.33,32.50,54.46,25.71,11.47,160.83
2479,2022-10-15,14.20,0.0,0.0,9.88,10.12,0.08,2.25,8.34,201.12
2480,2022-10-17,14.69,0.9,0.9,66.08,24.96,39.04,81.58,8.21,249.21
2481,2022-10-18,11.94,1.5,1.5,39.75,3.58,51.08,19.00,10.68,175.42


In [27]:
# # Group by date and calculate the mean for each numerical column
# daily_means = df_new.groupby('date')[['temperature_2m (°C)', 'precipitation (mm)', 
#                                      'rain (mm)', 'cloudcover (%)', 'cloudcover_low (%)', 
#                                      'cloudcover_mid (%)', 'cloudcover_high (%)', 'windspeed_10m (km/h)', 
#                                      'winddirection_10m (°)']].mean().round(2)

# # Reset the index to make 'date' a column instead of an index
# daily_means = daily_means.reset_index()

# # Display the result
# daily_means

In [28]:
daily_means = daily_means[daily_means['date'].dt.year!=2022]
daily_means

Unnamed: 0,date,temperature_2m (°C),precipitation (mm),rain (mm),cloudcover (%),cloudcover_low (%),cloudcover_mid (%),cloudcover_high (%),windspeed_10m (km/h),winddirection_10m (°)
0,2016-01-01,5.41,0.0,0.0,57.29,34.50,9.71,77.75,12.48,273.33
1,2016-01-02,2.39,0.0,0.0,10.25,9.92,1.42,1.50,13.38,266.42
2,2016-01-03,3.01,0.0,0.0,10.29,11.29,0.08,0.00,12.75,244.38
3,2016-01-04,0.10,0.0,0.0,14.75,11.83,6.75,0.00,16.85,313.54
4,2016-01-05,-6.78,0.0,0.0,0.67,0.75,0.00,0.00,17.65,304.25
...,...,...,...,...,...,...,...,...,...,...
2187,2021-12-27,2.78,0.0,0.0,45.79,12.67,45.54,40.12,7.55,218.88
2188,2021-12-28,4.30,2.2,2.2,76.17,46.79,64.38,54.25,7.89,220.96
2189,2021-12-29,5.85,5.1,5.1,94.33,72.29,51.29,59.00,6.28,64.38
2190,2021-12-30,6.95,2.4,2.4,100.00,92.67,86.88,50.92,4.31,156.29


In [29]:
daily_means.to_csv('weather_means_2016_2022.csv')