In [3]:
import csv
import random
import string
import time
import re
from datetime import datetime, timedelta
import os

In [9]:
# Make sure the directory that will hold the generated CSV files exists.
# exist_ok=True replaces the separate exists() check, so there is no window
# between "check" and "create" and the cell can be re-run safely.
os.makedirs("output", exist_ok=True)

In [13]:
def generate_data(file_number, duration=1.0):
    """Write random events to ``output/<file_number>`` for ``duration`` seconds.

    Every row contains the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS``, a random integer between 1 and 10 (inclusive)
    and a random uppercase ASCII letter. At least one row is always written,
    because the elapsed-time check happens after the write.

    Args:
        file_number: Name of the file to create inside the ``output``
            directory (callers in this notebook pass names like ``"1.csv"``).
        duration: How long to keep generating rows, in seconds. Defaults to
            1 second, which is the original fixed behavior.

    Returns:
        int: Number of data rows written (the header row is not counted).
    """
    # Create the target directory here as well so the function does not
    # depend on a previous cell having been executed.
    os.makedirs("output", exist_ok=True)
    filename = f"output/{file_number}"

    with open(filename, 'w', newline='') as csvfile:
        fieldnames = ['t', 'integer', 'char']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Use a monotonic clock for the run length: unlike time.time() it
        # cannot jump backwards or forwards when the system clock is adjusted.
        deadline = time.monotonic() + duration
        event_count = 0

        while True:
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            integer = random.randint(1, 10)
            char = random.choice(string.ascii_uppercase)
            writer.writerow({'t': timestamp, 'integer': integer, 'char': char})

            event_count += 1

            # Stop once the requested duration has passed.
            if time.monotonic() >= deadline:
                break

        return event_count

In [15]:
if __name__ == "__main__":
    # Produce 60 files, 1.csv through 60.csv, one generate_data call per file,
    # and report how many rows each call managed to write.
    for file_index in range(1, 61):
        csv_name = f"{file_index}.csv"
        rows_written = generate_data(csv_name)
        print(f"Events per second in {csv_name}: {rows_written}")

Events per second in 1.csv: 139563
Events per second in 2.csv: 141215
Events per second in 3.csv: 140813
Events per second in 4.csv: 140910
Events per second in 5.csv: 140213
Events per second in 6.csv: 139774
Events per second in 7.csv: 137719
Events per second in 8.csv: 138603
Events per second in 9.csv: 140760
Events per second in 10.csv: 141958
Events per second in 11.csv: 141641
Events per second in 12.csv: 141346
Events per second in 13.csv: 140813
Events per second in 14.csv: 139124
Events per second in 15.csv: 140255
Events per second in 16.csv: 139437
Events per second in 17.csv: 140408
Events per second in 18.csv: 139564
Events per second in 19.csv: 133258
Events per second in 20.csv: 139747
Events per second in 21.csv: 139098
Events per second in 22.csv: 138924
Events per second in 23.csv: 139507
Events per second in 24.csv: 138958
Events per second in 25.csv: 139794
Events per second in 26.csv: 139052
Events per second in 27.csv: 139212
Events per second in 28.csv: 137124
E

In [16]:
# Windowing state shared between process_file and process_file_not_found.
# It lives at module level so that a window can span several input files.
start_time = None  # timestamp of the first event in the currently open window
data = ""          # concatenated 'char' values of the currently open window
timestamp = None   # timestamp of the most recently read row


def process_file(filename, regex_pattern, window_duration):
    """Feed one CSV file into the time-windowed regex counter.

    Rows are read in order. The 'char' value of each row is appended to the
    current window as long as the row's timestamp is less than
    ``window_duration`` seconds after the window start. The first row at or
    beyond that limit closes the window: the number of regex matches in the
    collected characters is printed, and a new window starts with that row.

    The window state (``start_time``, ``data``, ``timestamp``) is kept in
    module globals, so a window left open at the end of one file continues
    in the next call.

    Args:
        filename: Path of a CSV file with at least the columns 't'
            (``YYYY-MM-DD HH:MM:SS``) and 'char'.
        regex_pattern: Regular expression whose non-overlapping matches are
            counted per window.
        window_duration: Window length in seconds.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a row lacks the 't' or 'char' column.
        ValueError: If a 't' value does not match the expected format.
    """
    global start_time, data, timestamp
    pattern = re.compile(regex_pattern)

    # Collect events in a list and join once instead of growing the global
    # string row by row, which re-copies the accumulated text each time.
    pending = [data]

    try:
        with open(filename, 'r', newline='') as csvfile:
            for row in csv.DictReader(csvfile):
                timestamp = datetime.strptime(row['t'], '%Y-%m-%d %H:%M:%S')
                event = row['char']

                if start_time is None:
                    start_time = timestamp

                time_difference = (timestamp - start_time).total_seconds()

                if time_difference < window_duration:
                    pending.append(event)
                else:
                    matches = len(pattern.findall("".join(pending)))
                    print(f"Window {start_time} - {timestamp}: Matches: {matches}")
                    start_time = timestamp
                    # The row that closed the window is the first event of
                    # the next one; previously it was silently discarded.
                    pending = [event]
    finally:
        # Publish the open window back to the shared state, even if reading
        # failed part-way, so the globals always reflect what was consumed.
        data = "".join(pending)

def process_file_not_found(regex_pattern):
    """Print the match count for the window that is still open.

    Reads the shared module state (``start_time``, ``data``, ``timestamp``)
    without modifying it, counts the non-overlapping matches of
    ``regex_pattern`` in the characters collected so far, and prints a
    summary line in the same format used for completed windows.

    Args:
        regex_pattern: Regular expression to count within the open window.
    """
    global start_time, data, timestamp
    match_total = len(re.compile(regex_pattern).findall(data))
    print(f"Window {start_time} - {timestamp}: Matches: {match_total}")

def main():
    """Run the windowed regex count over output/1.csv ... output/60.csv.

    Files are processed in numeric order and processing stops at the first
    file that does not exist. Each processed file name is printed, and
    process_file prints one line per completed 10-second window. After the
    loop the window that is still open is always reported, both when a file
    was missing and when all 60 files were read.
    """
    # One or more consonant+vowel pairs, optionally followed by a consonant
    # (Y is treated as a consonant).
    regex_pattern = r'([BCDFGHJKLMNPQRSTVWXYZ][AEIOU])+[BCDFGHJKLMNPQRSTVWXYZ]?'

    window_duration = 10  # Fixed window duration of 10 seconds

    for i in range(1, 61):
        filename = f"output/{i}.csv"

        if not os.path.exists(filename):
            # Stop at the first missing file, as before.
            break

        print(filename)
        process_file(filename, regex_pattern, window_duration)

    # Flush the last, still-open window. Previously this only happened when
    # a file was missing, so with all 60 files present the final window was
    # never reported.
    process_file_not_found(regex_pattern)

# Entry point: run the windowed analysis when this code is executed directly
# (the guard is false when the code is imported as a module).
if __name__ == "__main__":
    main()

output/1.csv
output/2.csv
output/3.csv
output/4.csv
output/5.csv
output/6.csv
output/7.csv
output/8.csv
output/9.csv
output/10.csv
Window 2023-09-09 12:12:58 - 2023-09-09 12:13:08: Matches: 179919
output/11.csv
output/12.csv
output/13.csv
output/14.csv
output/15.csv
output/16.csv
output/17.csv
output/18.csv
output/19.csv
output/20.csv
Window 2023-09-09 12:13:08 - 2023-09-09 12:13:18: Matches: 182884
output/21.csv
output/22.csv
output/23.csv
output/24.csv
output/25.csv
output/26.csv
output/27.csv
output/28.csv
output/29.csv
output/30.csv
Window 2023-09-09 12:13:18 - 2023-09-09 12:13:28: Matches: 182349
output/31.csv
output/32.csv
output/33.csv
output/34.csv
output/35.csv
output/36.csv
output/37.csv
output/38.csv
output/39.csv
output/40.csv
Window 2023-09-09 12:13:28 - 2023-09-09 12:13:38: Matches: 182239
output/41.csv
output/42.csv
output/43.csv
output/44.csv
output/45.csv
output/46.csv
output/47.csv
output/48.csv
output/49.csv
output/50.csv
Window 2023-09-09 12:13:38 - 2023-09-09 12:13

In [None]:
# 1.csv
# Window 2023-09-09 11:14:23 - 2023-09-09 11:14:24: Matches: 10351
# 2.csv
# Window 2023-09-09 11:14:24 - 2023-09-09 11:14:25: Matches: 12006
# 3.csv
# Window 2023-09-09 11:14:25 - 2023-09-09 11:14:26: Matches: 11227
# 4.csv
# Window 2023-09-09 11:14:26 - 2023-09-09 11:14:27: Matches: 11602
# 5.csv
# Window 2023-09-09 11:14:27 - 2023-09-09 11:14:28: Matches: 11058
# 6.csv
# Window 2023-09-09 11:14:28 - 2023-09-09 11:14:29: Matches: 11479
# 7.csv
# Window 2023-09-09 11:14:29 - 2023-09-09 11:14:30: Matches: 11464
# 8.csv
# Window 2023-09-09 11:14:30 - 2023-09-09 11:14:31: Matches: 14015
# 9.csv
# Window 2023-09-09 11:14:31 - 2023-09-09 11:14:32: Matches: 15359
# 10.csv
# Window 2023-09-09 11:14:32 - 2023-09-09 11:14:33: Matches: 17361
# File 11.csv does not exist.
# File 12.csv does not exist.
# File 13.csv does not exist.
# File 14.csv does not exist.