In [3]:
import pandas as pd
from datetime import datetime

In [None]:

df = pd.read_csv("data/SG-ES.csv", encoding="latin-1", sep=",")

# Discarding non recovery/death outcomes
df = df[df["Evolucao"].isin(["Cura", "Óbito pelo COVID-19"])]

# Translating column names
df["NotificationDate"] = df["DataNotificacao"]
df["DeathDate"] = df["DataObito"]
df["RecoveryDate"] = df.apply(lambda row: row["DataEncerramento"] if row["Evolucao"] == "Cura" else None, axis=1)

# Converting to datetime
df["NotificationDate"] = pd.to_datetime(df["NotificationDate"], format='%Y-%m-%d')
df["DeathDate"] = pd.to_datetime(df["DeathDate"], format='%Y-%m-%d')
df["RecoveryDate"] = pd.to_datetime(df["RecoveryDate"], format='%Y-%m-%d')

# Discarding inconsistent entries
df = df[df["RecoveryDate"] >= df["NotificationDate"]]


In [5]:

def filter_csv_by_time_period(input_file, output_file, time_column, start_time, end_time, chunksize=100000):
    """
    Filters a massive CSV file for rows within a specified time period.
    
    Parameters:
        input_file (str): Path to the input CSV file
        output_file (str): Path to save the filtered CSV
        time_column (str): Name of the column containing timestamps
        start_time (str): Start time in format matching your CSV (e.g., '2023-01-01')
        end_time (str): End time in format matching your CSV (e.g., '2023-01-31')
        chunksize (int): Number of rows to process at a time (adjust based on memory)
    """
    
    # Convert string times to datetime objects for comparison
    start_dt = pd.to_datetime(start_time)
    end_dt = pd.to_datetime(end_time)
    
    # Initialize a list to hold filtered chunks
    filtered_chunks = []
    
    # Read the CSV in chunks
    for chunk in pd.read_csv(input_file, parse_dates=[time_column], chunksize=chunksize, encoding="latin-1", sep=";"):
        # Filter the current chunk
        mask = (chunk[time_column] >= start_dt) & (chunk[time_column] <= end_dt)
        filtered_chunk = chunk.loc[mask]
        
        # Append to our list of filtered chunks
        if not filtered_chunk.empty:
            filtered_chunks.append(filtered_chunk)
    
    # Combine all filtered chunks and save to CSV
    if filtered_chunks:
        pd.concat(filtered_chunks).to_csv(output_file, index=False)
        print(f"Filtered data saved to {output_file}")
    else:
        print("No data found within the specified time period.")

# Example usage
if __name__ == "__main__":
    input_csv = "MICRODADOS.csv"          # Replace with your input file
    output_csv = "covid-es.csv"      # Replace with your output file
    time_col = "DataNotificacao"                # Replace with your time column name
    start = "2020-01-01 00:00:00"        # Replace with your start time
    end = "2021-01-31 23:59:59"          # Replace with your end time
    
    filter_csv_by_time_period(input_csv, output_csv, time_col, start, end)

  for chunk in pd.read_csv(input_file, parse_dates=[time_column], chunksize=chunksize, encoding="latin-1", sep=";"):
  for chunk in pd.read_csv(input_file, parse_dates=[time_column], chunksize=chunksize, encoding="latin-1", sep=";"):
  for chunk in pd.read_csv(input_file, parse_dates=[time_column], chunksize=chunksize, encoding="latin-1", sep=";"):
  for chunk in pd.read_csv(input_file, parse_dates=[time_column], chunksize=chunksize, encoding="latin-1", sep=";"):
  for chunk in pd.read_csv(input_file, parse_dates=[time_column], chunksize=chunksize, encoding="latin-1", sep=";"):
  for chunk in pd.read_csv(input_file, parse_dates=[time_column], chunksize=chunksize, encoding="latin-1", sep=";"):
  for chunk in pd.read_csv(input_file, parse_dates=[time_column], chunksize=chunksize, encoding="latin-1", sep=";"):
  for chunk in pd.read_csv(input_file, parse_dates=[time_column], chunksize=chunksize, encoding="latin-1", sep=";"):
  for chunk in pd.read_csv(input_file, parse_dates=[time_column]

Filtered data saved to covid-es.csv
