In [5]:
import pandas as pd
from datetime import datetime, timedelta

# Set the number of rows
num_rows = 10

# Number of leading rows that share category 'A' and sit 1 ms apart, so that
# all of them fall inside a single 10 ms window (never more than num_rows)
num_burst_rows = min(4, num_rows)

# Fixed start time instead of datetime.now(), so that re-running the notebook
# reproduces exactly the same timestamps
start_time = datetime(2024, 6, 20, 21, 47, 52, 589151)

# Generate timestamps for the burst rows, 1 ms apart
timestamps = [start_time + timedelta(milliseconds=i) for i in range(num_burst_rows)]

# Fill in the remaining timestamps with 1 s gaps to ensure they are outside the 10ms window
for _ in range(num_burst_rows, num_rows):
    next_time = timestamps[-1] + timedelta(milliseconds=1000)
    timestamps.append(next_time)

# Generate categories: 'A' for every burst row, then one distinct label per
# remaining row ('B', 'C', ...). Derived from num_rows so that both columns
# always have the same length.
categories = ['A'] * num_burst_rows + [
    chr(ord('B') + offset) for offset in range(num_rows - num_burst_rows)
]

# Create the DataFrame
df = pd.DataFrame({
    'timestamp': timestamps,
    'category': categories
})

# Display the DataFrame
df


Unnamed: 0,timestamp,category
0,2024-06-20 21:47:52.589151,A
1,2024-06-20 21:47:52.590151,A
2,2024-06-20 21:47:52.591151,A
3,2024-06-20 21:47:52.592151,A
4,2024-06-20 21:47:53.592151,B
5,2024-06-20 21:47:54.592151,C
6,2024-06-20 21:47:55.592151,D
7,2024-06-20 21:47:56.592151,E
8,2024-06-20 21:47:57.592151,F
9,2024-06-20 21:47:58.592151,G


In [8]:
def find_consecutive_categories(df, window_ms=10, min_count=3):
    """Collect groups of same-category rows that fall inside a short time window.

    For every row of ``df`` a window ``[t, t + window_ms]`` (both ends
    inclusive) is opened at that row's timestamp ``t``. If some category occurs
    at least ``min_count`` times among the rows inside the window, the rows of
    the first such category (in order of appearance in the window) are recorded
    as one subset; at most one subset is recorded per window.

    Notes:
        * Rows are matched by category within the window only; they are not
          required to be adjacent to each other.
        * A window is opened at every row, so windows overlap and a later
          subset can consist of rows already contained in an earlier one.

    Args:
        df: DataFrame with a ``'timestamp'`` column and a ``'category'``
            column. Any index is accepted; it is preserved in the subsets.
        window_ms: Window length in milliseconds.
        min_count: Minimum number of same-category rows in a window for the
            window to yield a subset. The default of 3 corresponds to
            "more than 2".

    Returns:
        A list of DataFrames (row subsets of ``df``), one per qualifying
        window, in row order of the window start.
    """
    subset_dfs = []
    if len(df) == 0:
        return subset_dfs

    window = timedelta(milliseconds=window_ms)
    timestamps = df['timestamp']

    # Iterate over the timestamp values themselves rather than looking rows up
    # by label, so frames without a default 0..n-1 index are handled correctly.
    for start_time in timestamps:
        end_time = start_time + window
        window_df = df[(timestamps >= start_time) & (timestamps <= end_time)]

        # Cheap pre-check: too few rows in the window for any category to qualify
        if len(window_df) < min_count:
            continue

        for category in window_df['category'].unique():
            category_window_df = window_df[window_df['category'] == category]
            if len(category_window_df) >= min_count:
                subset_dfs.append(category_window_df)
                break  # Stop once we find a valid subset for this window

    return subset_dfs

In [9]:
# Run the window search over the sample frame
subsets = find_consecutive_categories(df)

# Print each subset under a 1-based heading, followed by a blank separator line
for subset_number, subset_df in enumerate(subsets, start=1):
    print(f"Subset {subset_number}:", subset_df, "", sep="\n")

# Print the input frame as well so the subsets can be compared against it
print("Original DataFrame:", df, sep="\n")

Subset 1:
                   timestamp category
0 2024-06-20 21:47:52.589151        A
1 2024-06-20 21:47:52.590151        A
2 2024-06-20 21:47:52.591151        A
3 2024-06-20 21:47:52.592151        A

Subset 2:
                   timestamp category
1 2024-06-20 21:47:52.590151        A
2 2024-06-20 21:47:52.591151        A
3 2024-06-20 21:47:52.592151        A

Original DataFrame:
                   timestamp category
0 2024-06-20 21:47:52.589151        A
1 2024-06-20 21:47:52.590151        A
2 2024-06-20 21:47:52.591151        A
3 2024-06-20 21:47:52.592151        A
4 2024-06-20 21:47:53.592151        B
5 2024-06-20 21:47:54.592151        C
6 2024-06-20 21:47:55.592151        D
7 2024-06-20 21:47:56.592151        E
8 2024-06-20 21:47:57.592151        F
9 2024-06-20 21:47:58.592151        G
