In [1]:
import pandas as pd

In [2]:
import pyarrow.parquet as pq
import pyarrow as pa
import os

# Category ids that survive filtering, mapped to their human-readable names.
# Only the keys are consulted when filtering; the names are for readers.
filtered_classes = {
    1: 'person',
    2: 'bicycle',
    3: 'car',
    4: 'motorcycle',
    75: 'clock',
    68: 'cell phone',
    10: 'traffic light',
    12: 'stop sign',
    0: '__background__',
    6: 'bus',
    44: 'knife',
}

def filter_annotations(annotations):
    """Keep only the annotations whose category is one of the classes of interest.

    Args:
        annotations: Iterable of annotation mappings, each exposing a
            'category_id' key, or None when a row carries no annotations.

    Returns:
        list: The annotations whose 'category_id' is a key of
        `filtered_classes`, in their original order. An empty list when
        `annotations` is None or has no matching entries.
    """
    # A null cell would otherwise raise TypeError when iterated. Returning an
    # empty list lets the caller treat it like any other row with nothing left.
    # Use `is None` rather than truthiness: the value may be an array-like
    # whose truth value is ambiguous.
    if annotations is None:
        return []
    return [ann for ann in annotations if ann['category_id'] in filtered_classes]

def process_chunk(chunk):
    """Convert one batch to pandas and drop rows left without any kept annotation.

    Args:
        chunk: Object exposing `to_pandas()` that yields a DataFrame with an
            'annotations' column.

    Returns:
        DataFrame whose 'annotations' column has been passed through
        `filter_annotations`, restricted to rows where that filtered list is
        non-empty.
    """
    frame = chunk.to_pandas()
    frame['annotations'] = frame['annotations'].apply(filter_annotations)
    # Boolean mask: True for rows that still have at least one annotation.
    has_annotations = frame['annotations'].apply(len) > 0
    return frame[has_annotations]

def process_file(file_name, output_file):
    """Stream one Parquet file through the class filter and write the survivors.

    The input is read in batches of 10,000 rows, each batch is filtered with
    `process_chunk`, and the remaining rows are appended to `output_file`.

    Args:
        file_name: Path of the Parquet file to read.
        output_file: Path of the Parquet file to write. The file is only
            created once the first non-empty filtered batch is seen, so no
            output is produced when nothing passes the filter.
    """
    parquet_file = pq.ParquetFile(file_name)

    writer = None
    schema = None  # fixed by the first non-empty batch, reused for all later ones
    try:
        for batch in parquet_file.iter_batches(batch_size=10000):
            filtered_chunk = process_chunk(batch)
            if filtered_chunk.empty:
                continue

            # preserve_index=False: the frame's index is only the row position
            # inside this batch. With the default setting pyarrow stores it as
            # an extra column or as metadata depending on the index type, so
            # two batches can end up with different schemas and the writer
            # rejects the second one. Passing the first batch's schema also
            # keeps column types identical across batches.
            table = pa.Table.from_pandas(
                filtered_chunk, schema=schema, preserve_index=False
            )
            if writer is None:
                schema = table.schema
                writer = pq.ParquetWriter(output_file, schema)
            writer.write_table(table)
    finally:
        # Close even when a batch fails, so the file handle is released and
        # the Parquet footer is written for the batches that did succeed.
        if writer is not None:
            writer.close()

def main(input_dir="../datasets/640x640", output_dir="filter 640", indices=range(1, 18)):
    """Filter every requested training shard and write the results to `output_dir`.

    Args:
        input_dir: Directory holding the `resized_dataset_train_640_<i>.parquet`
            input shards.
        output_dir: Directory that receives the
            `filtered_dataset_train_640_<i>.parquet` files; created if missing.
        indices: Iterable of shard numbers to process. Defaults to 1..17.
            Pass a different range to cover other shards instead of editing
            the function body.
    """
    os.makedirs(output_dir, exist_ok=True)

    for i in indices:
        input_file = os.path.join(input_dir, f"resized_dataset_train_640_{i}.parquet")
        output_file = os.path.join(output_dir, f"filtered_dataset_train_640_{i}.parquet")

        print(f"Processing {input_file}")
        process_file(input_file, output_file)
        print(f"Processed and saved {output_file}")

    print("Filtering complete.")

# Run the filtering pipeline when this code is executed directly (including as
# a notebook cell, as the captured output below shows), but not on import.
if __name__ == "__main__":
    main()

Processing ../datasets/640x640/resized_dataset_train_640_1.parquet
Processed and saved filter 640/filtered_dataset_train_640_1.parquet
Processing ../datasets/640x640/resized_dataset_train_640_2.parquet
Processed and saved filter 640/filtered_dataset_train_640_2.parquet
Processing ../datasets/640x640/resized_dataset_train_640_3.parquet
Processed and saved filter 640/filtered_dataset_train_640_3.parquet
Processing ../datasets/640x640/resized_dataset_train_640_4.parquet
Processed and saved filter 640/filtered_dataset_train_640_4.parquet
Processing ../datasets/640x640/resized_dataset_train_640_5.parquet
Processed and saved filter 640/filtered_dataset_train_640_5.parquet
Processing ../datasets/640x640/resized_dataset_train_640_6.parquet
Processed and saved filter 640/filtered_dataset_train_640_6.parquet
Processing ../datasets/640x640/resized_dataset_train_640_7.parquet
Processed and saved filter 640/filtered_dataset_train_640_7.parquet
Processing ../datasets/640x640/resized_dataset_train_64

In [5]:
import pandas as pd
import glob

# Path pattern for your parquet files
file_pattern = '/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_*.parquet'

# Get all matching file names
file_names = glob.glob(file_pattern)

# Initialize a counter for total rows
total_rows = 0

# Loop through each file
for file_name in file_names:
    # Read the parquet file
    df = pd.read_parquet(file_name)
    
    # Add the number of rows to the total
    total_rows += len(df)
    
    # Print the count for each file
    print(f"{file_name}: {len(df)} rows")

# Print the total count
print(f"\nTotal rows across all files: {total_rows}")

/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_17.parquet: 1550 rows
/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_16.parquet: 1606 rows
/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_24.parquet: 703 rows
/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_21.parquet: 1630 rows
/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_15.parquet: 1553 rows
/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_18.parquet: 1605 rows
/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_10.parquet: 1579 rows
/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_19.parquet: 1512 rows
/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_7.parquet: 1571 rows
/home/muhammadfasi/Downloads/FYP/scripts/filter 640/filtered_dataset_train_640_11.parquet: 17