In [63]:
import os
import pandas as pd
from datetime import datetime

def starts_with_20(filename):
    """Tell whether ``filename`` begins with the two characters '20'.

    Returns True when the name starts with '20', otherwise False.
    """
    wanted_prefix = "20"
    return filename.startswith(wanted_prefix)

def get_csv_files(folder_path):
    """Return the names of the CSV files in ``folder_path`` that start with '20'.

    Only the directory itself is scanned (no recursion).

    Args:
        folder_path: Directory to scan.

    Returns:
        list[str]: Bare entry names (without the directory part) that end
        with '.csv' and start with '20', sorted lexicographically.
    """
    # os.listdir() yields entries in arbitrary order. Sorting makes the result
    # reproducible, so anything that depends on file order (e.g. concatenating
    # the files and keeping the first of duplicated rows) behaves the same on
    # every run and every machine.
    return sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith(".csv") and starts_with_20(name)
    )

def read_and_combine_csv_files(folder_path, csv_files):
    """Load each named CSV from ``folder_path`` and stack them into one DataFrame.

    The frames are concatenated in the order given by ``csv_files``. Each
    file keeps its own row labels, so index labels can repeat in the result.
    """
    frames = []
    for name in csv_files:
        csv_path = os.path.join(folder_path, name)
        frames.append(pd.read_csv(csv_path))
    return pd.concat(frames)

In [64]:
folder_path = "data/"

# Brazil is UTC-3: subtracting this from a UTC timestamp gives local wall-clock time.
UTC_OFFSET = pd.Timedelta(hours=3)

# Get a list of CSV files in the folder that start with "20"
csv_files = get_csv_files(folder_path)

# Read and concatenate the CSV files into a single DataFrame
combined_dataframe = read_and_combine_csv_files(folder_path, csv_files)

# Convert the unix timestamps (seconds) to datetimes.
# Columns are addressed with brackets rather than attribute access, which
# pandas discourages for assignment (it can silently create a plain attribute).
#
# NOTE(review): 'dt' is shifted back by a whole day, not by the 3-hour offset
# applied to sunrise/sunset below. Presumably this labels each sample with the
# same calendar day as its sunrise/sunset -- confirm this is intended.
combined_dataframe["dt"] = pd.to_datetime(combined_dataframe["dt"], unit="s") - pd.Timedelta(days=1)
combined_dataframe["sunrise"] = pd.to_datetime(combined_dataframe["sunrise"], unit="s") - UTC_OFFSET
combined_dataframe["sunset"] = pd.to_datetime(combined_dataframe["sunset"], unit="s") - UTC_OFFSET

# Keep only the first row seen for each 'dt' value. The row labels are left
# untouched, so they still repeat across source files.
combined_dataframe.drop_duplicates(subset=["dt"], keep="first", inplace=True)

# Show the combined DataFrame
combined_dataframe

Unnamed: 0,dt,sunrise,sunset,temp,feels_like,pressure,humidity,dew_point,clouds,visibility,wind_speed,wind_deg,weather,rain,wind_gust
0,2013-12-31,2013-12-31 05:22:42,2013-12-31 18:56:21,26.70,27.77,1014,61,18.56,40,10000.0,4.12,330,"[{'id': 802, 'main': 'Clouds', 'description': ...",,
1,2014-01-01,2014-01-01 05:23:20,2014-01-01 18:56:38,24.70,25.16,1015,74,19.76,75,10000.0,3.60,310,"[{'id': 500, 'main': 'Rain', 'description': 'l...",,
2,2014-01-02,2014-01-02 05:23:59,2014-01-02 18:56:54,28.70,29.75,1013,54,18.48,0,10000.0,5.14,330,"[{'id': 800, 'main': 'Clear', 'description': '...",,
3,2014-01-03,2014-01-03 05:24:38,2014-01-03 18:57:09,26.70,28.36,1016,70,20.78,0,10000.0,3.10,150,"[{'id': 800, 'main': 'Clear', 'description': '...",,
4,2014-01-04,2014-01-04 05:25:18,2014-01-04 18:57:23,20.70,21.12,1021,88,18.64,100,,7.72,180,"[{'id': 500, 'main': 'Rain', 'description': 'l...",{'1h': 0.31},
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,2020-12-26,2020-12-26 05:19:51,2020-12-26 18:54:40,19.83,20.01,1016,82,16.67,75,6000.0,3.60,140,"[{'id': 701, 'main': 'Mist', 'description': 'm...",,
172,2020-12-27,2020-12-27 05:20:25,2020-12-27 18:55:03,20.83,21.19,1017,85,18.22,20,8000.0,1.50,70,"[{'id': 701, 'main': 'Mist', 'description': 'm...",,
173,2020-12-28,2020-12-28 05:21:00,2020-12-28 18:55:25,20.50,20.93,1018,89,18.63,75,3000.0,2.10,20,"[{'id': 502, 'main': 'Rain', 'description': 'h...",{'1h': 11.94},
174,2020-12-29,2020-12-29 05:21:36,2020-12-29 18:55:46,20.46,20.91,1016,90,18.76,75,10000.0,1.00,100,"[{'id': 701, 'main': 'Mist', 'description': 'm...",,


In [65]:
def find_complete_intervals(df, date_column):
    """Find the runs of consecutive days present in ``df[date_column]``.

    Dates are examined in chronological order regardless of the row order of
    ``df``. A new run starts whenever the gap to the chronologically previous
    date is more than one whole day.

    Side effect: two helper columns are written onto ``df`` itself:
      * 'Diff'  - whole days since the chronologically previous date
                  (NaN for the earliest date).
      * 'Group' - 0-based id of the run the row belongs to.

    Args:
        df: DataFrame with a datetime column; modified in place (see above).
        date_column: Name of the datetime column to analyse.

    Returns:
        DataFrame with columns 'min' and 'max' holding the first and last
        date of each run. Runs consisting of a single date are dropped.
    """
    # Work on positional labels (0..n-1): the frame's own index may contain
    # repeated labels (e.g. after concatenating several files), which would
    # make label-based alignment ambiguous.
    dates = df[date_column].reset_index(drop=True)

    # Gaps must be measured between chronological neighbours, not between
    # neighbouring rows, otherwise unsorted input yields wrong run boundaries.
    # mergesort is stable, so rows with equal dates keep a deterministic order.
    chronological = dates.sort_values(kind="mergesort")
    day_gaps = chronological.diff().dt.days

    # Every gap larger than one day opens a new run.
    run_ids = (day_gaps > 1).cumsum()

    # sort_index() restores the original row order; assigning plain arrays
    # writes the values back by position.
    df['Diff'] = day_gaps.sort_index().to_numpy()
    df['Group'] = run_ids.sort_index().to_numpy()

    # Calculate start and end dates for each run
    intervals = df.groupby('Group')[date_column].agg(['min', 'max']).reset_index(drop=True)

    # Remove single date intervals
    intervals = intervals[intervals['max'] - intervals['min'] > pd.Timedelta(days=0)]

    return intervals


# Show the runs of consecutive dates covered by the combined data.
# Note: the call also writes 'Diff' and 'Group' columns onto combined_dataframe.
find_complete_intervals(combined_dataframe, 'dt')

Unnamed: 0,min,max
0,2013-12-31,2014-09-12
1,2014-12-30,2020-12-30


In [66]:
# Re-display the frame: it now includes the 'Diff' and 'Group' columns
# added by find_complete_intervals in the previous cell.
combined_dataframe

Unnamed: 0,dt,sunrise,sunset,temp,feels_like,pressure,humidity,dew_point,clouds,visibility,wind_speed,wind_deg,weather,rain,wind_gust,Diff,Group
0,2013-12-31,2013-12-31 05:22:42,2013-12-31 18:56:21,26.70,27.77,1014,61,18.56,40,10000.0,4.12,330,"[{'id': 802, 'main': 'Clouds', 'description': ...",,,,0
1,2014-01-01,2014-01-01 05:23:20,2014-01-01 18:56:38,24.70,25.16,1015,74,19.76,75,10000.0,3.60,310,"[{'id': 500, 'main': 'Rain', 'description': 'l...",,,1.0,0
2,2014-01-02,2014-01-02 05:23:59,2014-01-02 18:56:54,28.70,29.75,1013,54,18.48,0,10000.0,5.14,330,"[{'id': 800, 'main': 'Clear', 'description': '...",,,1.0,0
3,2014-01-03,2014-01-03 05:24:38,2014-01-03 18:57:09,26.70,28.36,1016,70,20.78,0,10000.0,3.10,150,"[{'id': 800, 'main': 'Clear', 'description': '...",,,1.0,0
4,2014-01-04,2014-01-04 05:25:18,2014-01-04 18:57:23,20.70,21.12,1021,88,18.64,100,,7.72,180,"[{'id': 500, 'main': 'Rain', 'description': 'l...",{'1h': 0.31},,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,2020-12-26,2020-12-26 05:19:51,2020-12-26 18:54:40,19.83,20.01,1016,82,16.67,75,6000.0,3.60,140,"[{'id': 701, 'main': 'Mist', 'description': 'm...",,,1.0,1
172,2020-12-27,2020-12-27 05:20:25,2020-12-27 18:55:03,20.83,21.19,1017,85,18.22,20,8000.0,1.50,70,"[{'id': 701, 'main': 'Mist', 'description': 'm...",,,1.0,1
173,2020-12-28,2020-12-28 05:21:00,2020-12-28 18:55:25,20.50,20.93,1018,89,18.63,75,3000.0,2.10,20,"[{'id': 502, 'main': 'Rain', 'description': 'h...",{'1h': 11.94},,1.0,1
174,2020-12-29,2020-12-29 05:21:36,2020-12-29 18:55:46,20.46,20.91,1016,90,18.76,75,10000.0,1.00,100,"[{'id': 701, 'main': 'Mist', 'description': 'm...",,,1.0,1
