In [1]:
import os
import pandas as pd
from datetime import datetime

def starts_with_20(filename):
    """Check if the filename starts with '20'."""
    return filename.startswith("20")

def get_csv_files(folder_path):
    """Return a list of CSV files in the folder that start with '20'."""
    all_files = os.listdir(folder_path)
    csv_files = [file for file in all_files if file.endswith(".csv") and starts_with_20(file)]
    return csv_files

def read_and_combine_csv_files(folder_path, csv_files):
    """Read and concatenate the CSV files into a single DataFrame."""
    combined_dataframe = pd.concat([pd.read_csv(os.path.join(folder_path, file)) for file in csv_files])
    return combined_dataframe

In [2]:
folder_path = "data/"

# Get a list of CSV files in the folder that start with "20"
csv_files = get_csv_files(folder_path)

# Read and concatenate the CSV files into a single DataFrame
combined_dataframe = read_and_combine_csv_files(folder_path, csv_files)

# Convert unix timestamp to datetime and consider the timezone, Brazil is UTC-3
combined_dataframe.dt = pd.to_datetime(combined_dataframe.dt, unit='s') - pd.Timedelta(days=1)
combined_dataframe.sunrise = pd.to_datetime(combined_dataframe.sunrise, unit='s') - pd.Timedelta(hours=3)
combined_dataframe.sunset = pd.to_datetime(combined_dataframe.sunset, unit='s') - pd.Timedelta(hours=3)

# Get only the time for sunrise and sunset
combined_dataframe.sunrise = combined_dataframe.sunrise.dt.time
combined_dataframe.sunset = combined_dataframe.sunset.dt.time


# Convert the weather column to dicts
combined_dataframe.weather = combined_dataframe.weather.apply(pd.Series)\
    .apply(lambda x: eval(x.to_list()[0])[0], axis=1)

# Drop duplicates
combined_dataframe.drop_duplicates(subset=['dt'], keep='first', inplace=True)

# Print the combined DataFrame
combined_dataframe

Unnamed: 0,dt,sunrise,sunset,temp,feels_like,pressure,humidity,dew_point,clouds,visibility,wind_speed,wind_deg,weather,rain,wind_gust,uvi
0,2007-12-31,05:22:25,18:56:12,23.73,24.20,1015,78,19.67,40,10000.0,4.60,170,"{'id': 701, 'main': 'Mist', 'description': 'mi...",,,
1,2008-01-01,05:23:03,18:56:30,22.01,22.36,1017,80,18.40,75,6000.0,3.10,170,"{'id': 803, 'main': 'Clouds', 'description': '...",,,
2,2008-01-02,05:23:41,18:56:47,21.38,21.79,1016,85,18.76,75,5000.0,2.60,130,"{'id': 701, 'main': 'Mist', 'description': 'mi...",{'1h': 0.15},,
3,2008-01-03,05:24:20,18:57:03,21.99,22.23,1018,76,17.57,0,8000.0,1.00,140,"{'id': 800, 'main': 'Clear', 'description': 'c...",,,
4,2008-01-04,05:24:59,18:57:17,22.98,23.06,1018,66,16.29,75,10000.0,1.50,300,"{'id': 500, 'main': 'Rain', 'description': 'li...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,2023-03-26,06:12:19,18:12:07,23.27,23.92,1019,87,20.98,40,10000.0,8.49,182,"{'id': 501, 'main': 'Rain', 'description': 'mo...",{'1h': 3.65},10.28,0.0
86,2023-03-27,06:12:42,18:11:07,19.46,19.73,1021,87,17.24,75,10000.0,4.92,91,"{'id': 803, 'main': 'Clouds', 'description': '...",,6.26,0.0
87,2023-03-28,06:13:04,18:10:08,20.75,21.20,1020,89,18.87,40,10000.0,4.12,110,"{'id': 802, 'main': 'Clouds', 'description': '...",,,0.0
88,2023-03-29,06:13:26,18:09:08,22.08,22.35,1020,77,17.86,0,10000.0,1.54,320,"{'id': 800, 'main': 'Clear', 'description': 'c...",,,0.0


In [3]:
def find_complete_intervals(df, date_column):
    # Calculate the differences between consecutive dates
    df['Diff'] = df[date_column].diff().dt.days

    # Create a new column to group continuous intervals
    df['Group'] = (df['Diff'] > 1).cumsum()

    # Calculate start and end dates for each group
    intervals = df.groupby('Group')[date_column].agg(['min', 'max']).reset_index(drop=True)

    # Remove single date intervals
    intervals = intervals[intervals['max'] - intervals['min'] > pd.Timedelta(days=0)]

    # Remove auxiliar columns
    df.drop(['Diff', 'Group'], axis=1, inplace=True)

    return intervals


find_complete_intervals(combined_dataframe, 'dt')

Unnamed: 0,min,max
0,2007-12-31,2023-03-30


In [4]:
combined_dataframe

Unnamed: 0,dt,sunrise,sunset,temp,feels_like,pressure,humidity,dew_point,clouds,visibility,wind_speed,wind_deg,weather,rain,wind_gust,uvi
0,2007-12-31,05:22:25,18:56:12,23.73,24.20,1015,78,19.67,40,10000.0,4.60,170,"{'id': 701, 'main': 'Mist', 'description': 'mi...",,,
1,2008-01-01,05:23:03,18:56:30,22.01,22.36,1017,80,18.40,75,6000.0,3.10,170,"{'id': 803, 'main': 'Clouds', 'description': '...",,,
2,2008-01-02,05:23:41,18:56:47,21.38,21.79,1016,85,18.76,75,5000.0,2.60,130,"{'id': 701, 'main': 'Mist', 'description': 'mi...",{'1h': 0.15},,
3,2008-01-03,05:24:20,18:57:03,21.99,22.23,1018,76,17.57,0,8000.0,1.00,140,"{'id': 800, 'main': 'Clear', 'description': 'c...",,,
4,2008-01-04,05:24:59,18:57:17,22.98,23.06,1018,66,16.29,75,10000.0,1.50,300,"{'id': 500, 'main': 'Rain', 'description': 'li...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,2023-03-26,06:12:19,18:12:07,23.27,23.92,1019,87,20.98,40,10000.0,8.49,182,"{'id': 501, 'main': 'Rain', 'description': 'mo...",{'1h': 3.65},10.28,0.0
86,2023-03-27,06:12:42,18:11:07,19.46,19.73,1021,87,17.24,75,10000.0,4.92,91,"{'id': 803, 'main': 'Clouds', 'description': '...",,6.26,0.0
87,2023-03-28,06:13:04,18:10:08,20.75,21.20,1020,89,18.87,40,10000.0,4.12,110,"{'id': 802, 'main': 'Clouds', 'description': '...",,,0.0
88,2023-03-29,06:13:26,18:09:08,22.08,22.35,1020,77,17.86,0,10000.0,1.54,320,"{'id': 800, 'main': 'Clear', 'description': 'c...",,,0.0


In [5]:
# Extract all diferent weather conditions
pd.DataFrame(combined_dataframe.weather.drop_duplicates().to_list())

Unnamed: 0,id,main,description,icon
0,701,Mist,mist,50n
1,803,Clouds,broken clouds,04n
2,800,Clear,clear sky,01n
3,500,Rain,light rain,10n
4,802,Clouds,scattered clouds,03n
5,721,Haze,haze,50n
6,211,Thunderstorm,thunderstorm,11n
7,804,Clouds,overcast clouds,04n
8,520,Rain,light intensity shower rain,09n
9,300,Drizzle,light intensity drizzle,09n


In [6]:
combined_dataframe["weather_id"] = combined_dataframe.weather.apply(lambda x: x['id'])
combined_dataframe

Unnamed: 0,dt,sunrise,sunset,temp,feels_like,pressure,humidity,dew_point,clouds,visibility,wind_speed,wind_deg,weather,rain,wind_gust,uvi,weather_id
0,2007-12-31,05:22:25,18:56:12,23.73,24.20,1015,78,19.67,40,10000.0,4.60,170,"{'id': 701, 'main': 'Mist', 'description': 'mi...",,,,701
1,2008-01-01,05:23:03,18:56:30,22.01,22.36,1017,80,18.40,75,6000.0,3.10,170,"{'id': 803, 'main': 'Clouds', 'description': '...",,,,803
2,2008-01-02,05:23:41,18:56:47,21.38,21.79,1016,85,18.76,75,5000.0,2.60,130,"{'id': 701, 'main': 'Mist', 'description': 'mi...",{'1h': 0.15},,,701
3,2008-01-03,05:24:20,18:57:03,21.99,22.23,1018,76,17.57,0,8000.0,1.00,140,"{'id': 800, 'main': 'Clear', 'description': 'c...",,,,800
4,2008-01-04,05:24:59,18:57:17,22.98,23.06,1018,66,16.29,75,10000.0,1.50,300,"{'id': 500, 'main': 'Rain', 'description': 'li...",,,,500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,2023-03-26,06:12:19,18:12:07,23.27,23.92,1019,87,20.98,40,10000.0,8.49,182,"{'id': 501, 'main': 'Rain', 'description': 'mo...",{'1h': 3.65},10.28,0.0,501
86,2023-03-27,06:12:42,18:11:07,19.46,19.73,1021,87,17.24,75,10000.0,4.92,91,"{'id': 803, 'main': 'Clouds', 'description': '...",,6.26,0.0,803
87,2023-03-28,06:13:04,18:10:08,20.75,21.20,1020,89,18.87,40,10000.0,4.12,110,"{'id': 802, 'main': 'Clouds', 'description': '...",,,0.0,802
88,2023-03-29,06:13:26,18:09:08,22.08,22.35,1020,77,17.86,0,10000.0,1.54,320,"{'id': 800, 'main': 'Clear', 'description': 'c...",,,0.0,800


In [7]:
combined_dataframe = combined_dataframe.assign(
    dia=combined_dataframe['dt'].dt.day,
    mes=combined_dataframe['dt'].dt.month,
    ano=combined_dataframe['dt'].dt.year,
)

new_order = ['dt', 'dia', 'mes', 'ano', 'sunrise', 'sunset', 'temp', 'feels_like', 'pressure', 'humidity', 'dew_point', 'clouds', 'visibility', 'wind_speed', 'wind_deg', 'weather', 'rain', 'wind_gust', 'uvi']
combined_dataframe = combined_dataframe.reindex(columns=new_order)


In [8]:
def get_season(month):
    if month in [1, 2, 3]:
        return 'Verão'
    elif month in [4, 5, 6]:
        return 'Outono'
    elif month in [7, 8, 9]:
        return 'Inverno'
    else:
        return 'Primavera'

# aplicar a função à coluna 'mes' para criar uma nova coluna 'estacao'
combined_dataframe['estacao'] = combined_dataframe['mes'].apply(lambda x: get_season(x))

In [9]:
combined_dataframe

Unnamed: 0,dt,dia,mes,ano,sunrise,sunset,temp,feels_like,pressure,humidity,dew_point,clouds,visibility,wind_speed,wind_deg,weather,rain,wind_gust,uvi,estacao
0,2007-12-31,31,12,2007,05:22:25,18:56:12,23.73,24.20,1015,78,19.67,40,10000.0,4.60,170,"{'id': 701, 'main': 'Mist', 'description': 'mi...",,,,Primavera
1,2008-01-01,1,1,2008,05:23:03,18:56:30,22.01,22.36,1017,80,18.40,75,6000.0,3.10,170,"{'id': 803, 'main': 'Clouds', 'description': '...",,,,Verão
2,2008-01-02,2,1,2008,05:23:41,18:56:47,21.38,21.79,1016,85,18.76,75,5000.0,2.60,130,"{'id': 701, 'main': 'Mist', 'description': 'mi...",{'1h': 0.15},,,Verão
3,2008-01-03,3,1,2008,05:24:20,18:57:03,21.99,22.23,1018,76,17.57,0,8000.0,1.00,140,"{'id': 800, 'main': 'Clear', 'description': 'c...",,,,Verão
4,2008-01-04,4,1,2008,05:24:59,18:57:17,22.98,23.06,1018,66,16.29,75,10000.0,1.50,300,"{'id': 500, 'main': 'Rain', 'description': 'li...",,,,Verão
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,2023-03-26,26,3,2023,06:12:19,18:12:07,23.27,23.92,1019,87,20.98,40,10000.0,8.49,182,"{'id': 501, 'main': 'Rain', 'description': 'mo...",{'1h': 3.65},10.28,0.0,Verão
86,2023-03-27,27,3,2023,06:12:42,18:11:07,19.46,19.73,1021,87,17.24,75,10000.0,4.92,91,"{'id': 803, 'main': 'Clouds', 'description': '...",,6.26,0.0,Verão
87,2023-03-28,28,3,2023,06:13:04,18:10:08,20.75,21.20,1020,89,18.87,40,10000.0,4.12,110,"{'id': 802, 'main': 'Clouds', 'description': '...",,,0.0,Verão
88,2023-03-29,29,3,2023,06:13:26,18:09:08,22.08,22.35,1020,77,17.86,0,10000.0,1.54,320,"{'id': 800, 'main': 'Clear', 'description': 'c...",,,0.0,Verão
