# CIFAR 10 v1

#### Install required dependencies

In [None]:
# Uncomment to install the notebook's dependencies into the kernel environment
# (assumes a requirements.txt sits next to this notebook — TODO confirm).
#%pip install -r requirements.txt

## Prepare CIFAR-10-C

#### Extract CIFAR-10-C.tar archive

Delete the CIFAR-10-C and CIFAR-10-C-npy directories if they exist. Extract the CIFAR-10-C.tar file into the current directory. Rename the extracted CIFAR-10-C folder to CIFAR-10-C-npy.

In [1]:
import tarfile, shutil
from pathlib import Path

# Remove output of earlier runs so the cell can be re-executed safely.
for stale_dir in ("CIFAR-10-C", "CIFAR-10-C-npy"):
    shutil.rmtree(stale_dir, ignore_errors=True)

filename = "CIFAR-10-C.tar"
dest = Path(".").resolve()

with tarfile.open(filename, mode="r:*") as tar:
    safe = []
    for m in tar.getmembers():
        target = (dest / m.name).resolve()
        # Pre-flight check: refuse the whole archive before anything is written
        # if any member would resolve to a location outside the destination
        # (absolute names, "../" components, ...).
        if target != dest and dest not in target.parents:
            raise ValueError(
                f"refusing to extract {filename}: member {m.name!r} escapes {dest}"
            )
        safe.append(m)
    # The stdlib "data" extraction filter is kept as a second layer of defence.
    tar.extractall(path=dest, members=safe, filter="data")

# The archive unpacks to CIFAR-10-C/; that name is reused later for the PNG
# export, so the raw .npy folder is moved aside.
Path("CIFAR-10-C").rename("CIFAR-10-C-npy")

print(f"extracted {filename} to CIFAR-10-C-npy/")

extracted CIFAR-10-C.tar to CIFAR-10-C-npy/


#### Convert Numpy files to images organized by class and severity

Scan the CIFAR-10-C-npy directory for all corruption arrays except labels.npy. Load labels.npy and define the class names. For each corruption file, load the array, then for severities 1 through 5 slice the appropriate 10k block of images. Make a severity subfolder like "s1" under the folder named after the corruption. Save each image in that slice as a PNG and record a row with the image path and its string label. After processing everything, report how many images were written. Convert the rows into a DataFrame and write CIFAR-10-C-images.csv with path and label columns.

In [2]:
import numpy as np
from PIL import Image
import pandas as pd

npy_files_dir_name = "CIFAR-10-C-npy"
images_dir_name = "CIFAR-10-C"

npy_dir, images_dir = Path(npy_files_dir_name), Path(images_dir_name)
images_dir.mkdir(exist_ok=True)

# Every .npy file except the shared label file holds one corruption type.
files = sorted(
    candidate for candidate in npy_dir.glob("*.npy") if candidate.name != "labels.npy"
)
print(f"Found {len(files)} .npy files in {npy_dir}")

labels_path = npy_dir / "labels.npy"
labels = np.load(labels_path)
print(f"Loaded labels from {labels_path} with shape {labels.shape}")

# Class names indexed by the integer ids stored in labels.npy.
CLASSES = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck",
]

# One {"path", "label"} record per exported PNG.
rows = []

for npy_path in files:
    name = npy_path.stem
    print(f"exporting {name} -> {images_dir / name}")

    # Memory-map the array so only the slices being written are read.
    X = np.load(npy_path, mmap_mode="r")

    for s in range(1, 6):
        # Severity s is taken from the s-th consecutive block of 10,000 entries.
        block = slice(10_000 * (s - 1), 10_000 * s)
        Xs, ys = X[block], labels[block]

        out_dir = images_dir / name / f"s{s}"
        out_dir.mkdir(parents=True, exist_ok=True)

        for j, pixels in enumerate(Xs):
            image_name = out_dir / f"{j}.png"
            Image.fromarray(pixels).save(image_name, format="PNG")
            rows.append({"path": str(image_name), "label": CLASSES[ys[j]]})

print(f"Exported {len(rows)} images to {images_dir}")

df = pd.DataFrame(rows)
df.to_csv("CIFAR-10-C-images.csv", index=False)
print("Exported CIFAR-10-C-images.csv")


Found 19 .npy files in CIFAR-10-C-npy
Loaded labels from CIFAR-10-C-npy/labels.npy with shape (50000,)
exporting brightness -> CIFAR-10-C/brightness
exporting contrast -> CIFAR-10-C/contrast
exporting defocus_blur -> CIFAR-10-C/defocus_blur
exporting elastic_transform -> CIFAR-10-C/elastic_transform
exporting fog -> CIFAR-10-C/fog
exporting frost -> CIFAR-10-C/frost
exporting gaussian_blur -> CIFAR-10-C/gaussian_blur
exporting gaussian_noise -> CIFAR-10-C/gaussian_noise
exporting glass_blur -> CIFAR-10-C/glass_blur
exporting impulse_noise -> CIFAR-10-C/impulse_noise
exporting jpeg_compression -> CIFAR-10-C/jpeg_compression
exporting motion_blur -> CIFAR-10-C/motion_blur
exporting pixelate -> CIFAR-10-C/pixelate
exporting saturate -> CIFAR-10-C/saturate
exporting shot_noise -> CIFAR-10-C/shot_noise
exporting snow -> CIFAR-10-C/snow
exporting spatter -> CIFAR-10-C/spatter
exporting speckle_noise -> CIFAR-10-C/speckle_noise
exporting zoom_blur -> CIFAR-10-C/zoom_blur
Exported 950000 images to CIFAR-10-C
Exported CIFAR-10-C-images.csv

## Prepare CIFAR-10 data

#### Extract train.7z archive

Delete the CIFAR-10 and train directories if they exist. Extract train.7z into the current directory. Rename the extracted train folder to CIFAR-10.

In [3]:
from pathlib import Path
import py7zr

archive = "train.7z"

# Clear both the final folder and the intermediate one left by earlier runs.
for stale in ("CIFAR-10", "train"):
    shutil.rmtree(stale, ignore_errors=True)

with py7zr.SevenZipFile(archive, mode="r") as z:
    z.extractall(path=".")

# The archive unpacks to train/; expose it under the dataset name instead.
Path("train").rename("CIFAR-10")

print(f"extracted {archive} to CIFAR-10/")

extracted train.7z to CIFAR-10/


#### Perform train, validate, test split

Read trainLabels.csv into a new DataFrame. Create an 80/20 stratified split into train and test. From the train portion, carve out 10 percent for validation, again keeping stratification. Write the three splits to CIFAR-10_train.csv, CIFAR-10_val.csv, and CIFAR-10_test.csv. Print the row counts for each file.

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("trainLabels.csv")

# First cut: hold out 20% as the test set, stratified on the class label.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=17,
)

# Second cut: take 10% of the remaining rows as validation, again stratified.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df["label"],
    random_state=17,
)

splits = {
    "CIFAR-10_train.csv": train_df,
    "CIFAR-10_val.csv": val_df,
    "CIFAR-10_test.csv": test_df,
}

# Write every split first, then report the row counts.
for csv_name, split_df in splits.items():
    split_df.to_csv(csv_name, index=False)

for csv_name, split_df in splits.items():
    print(f"wrote {csv_name}", len(split_df))

wrote CIFAR-10_train.csv 36000
wrote CIFAR-10_val.csv 4000
wrote CIFAR-10_test.csv 10000


## Data summaries

#### CIFAR-10:

Load the train, validation, and test CSVs and report their row counts. Compute and display class counts for each split. Walk the training set images from CIFAR-10/, open each as RGB, and tally image sizes and modes. Accumulate per-channel sums and sums of squares across all pixels. Compute per-channel mean and standard deviation in the 0 to 1 range from those totals, then print the size and mode tallies and the rounded mean and std values.

In [5]:
from collections import Counter

import numpy as np
import pandas as pd
from PIL import Image

cifar_10_imgs = "CIFAR-10"
cifar_10_train_csv = "CIFAR-10_train.csv"
cifar_10_val_csv = "CIFAR-10_val.csv"
cifar_10_test_csv = "CIFAR-10_test.csv"

print("CIFAR-10 Summary:")

df_train = pd.read_csv(cifar_10_train_csv)
print("train rows:", len(df_train))

df_val = pd.read_csv(cifar_10_val_csv)
print("val rows:", len(df_val))

df_test = pd.read_csv(cifar_10_test_csv)
print("test rows:", len(df_test))

train_counts = df_train["label"].value_counts().sort_index()
print("\nclass counts (train):")
print(train_counts.to_string())

val_counts = df_val["label"].value_counts().sort_index()
print("\nclass counts (val):")
print(val_counts.to_string())

test_counts = df_test["label"].value_counts().sort_index()
print("\nclass counts (test):")
print(test_counts.to_string())

# Tallies of image dimensions and of the colour mode stored in each file.
sizes = Counter()
modes = Counter()

# Running per-channel totals (values scaled to [0, 1]) over all training pixels.
sum_c = np.zeros(3, dtype=np.float64)
sum_sq_c = np.zeros(3, dtype=np.float64)
pix_count = 0

# Only the id column is needed, so iterate it directly instead of iterrows().
for image_id in df_train["id"]:
    img_path = f"{cifar_10_imgs}/{image_id}.png"

    with Image.open(img_path) as src:
        # Record the mode *before* converting: after convert("RGB") the mode is
        # always "RGB", which would make this tally meaningless as a data check.
        modes[src.mode] += 1
        img = src.convert("RGB")

    sizes[img.size] += 1

    x = np.asarray(img, dtype=np.float32) / 255.0
    sum_c += x.sum(axis=(0, 1))
    sum_sq_c += (x * x).sum(axis=(0, 1))
    pix_count += x.shape[0] * x.shape[1]

# Var[X] = E[X^2] - E[X]^2; clip guards against tiny negative round-off.
mean = sum_c / pix_count
var = (sum_sq_c / pix_count) - mean * mean
std = np.sqrt(np.clip(var, 0, None))

print("\nimage sizes sample:", dict(sizes))
print("image modes sample:", dict(modes))

print("\nper-channel mean:", np.round(mean, 6))
print("per-channel std: ", np.round(std, 6))

CIFAR-10 Summary:
train rows: 36000
val rows: 4000
test rows: 10000

class counts (train):
label
airplane      3600
automobile    3600
bird          3600
cat           3600
deer          3600
dog           3600
frog          3600
horse         3600
ship          3600
truck         3600

class counts (val):
label
airplane      400
automobile    400
bird          400
cat           400
deer          400
dog           400
frog          400
horse         400
ship          400
truck         400

class counts (test):
label
airplane      1000
automobile    1000
bird          1000
cat           1000
deer          1000
dog           1000
frog          1000
horse         1000
ship          1000
truck         1000

image sizes sample: {(32, 32): 36000}
image modes sample: {'RGB': 36000}

per-channel mean: [0.490912 0.481628 0.44589 ]
per-channel std:  [0.246647 0.243163 0.261323]


#### CIFAR-10-C:

Scan the CIFAR-10-C images folder for corruption folders and report their names. For each corruption, walk the severity subfolders s1 through s5, count the PNGs, collect rows of {corruption, severity, count} info, sort them, and print a summary table. Load labels.npy, map label ids to class names, and print overall class counts across all 50k items. Slice labels into five 10k blocks by severity, compute class counts for each block, align them to the class list, concatenate the columns, and print the per severity label table.

In [6]:
from pathlib import Path

import numpy as np
import pandas as pd

cifar_10_c_imgs = "CIFAR-10-C"
cifar_10_c_labels = "CIFAR-10-C-npy/labels.npy"

print("CIFAR-10-C Summary:\n")

# Each sub-directory of the image root is one corruption type.
corruptions = sorted(
    entry for entry in Path(cifar_10_c_imgs).iterdir() if entry.is_dir()
)
print("corruptions found:", [c.name for c in corruptions])

# One record per (corruption, severity) folder with the number of PNGs inside.
rows = [
    {
        "corruption": corruption_dir.name,
        "severity": int(severity_dir.name[1:]),  # "s3" -> 3
        "count": len(list(severity_dir.glob("*.png"))),
    }
    for corruption_dir in corruptions
    for severity_dir in sorted(corruption_dir.glob("s*"))
]

c10c_df = pd.DataFrame(rows).sort_values(["corruption", "severity"])

print("\ncounts by corruption and severity:")
print(c10c_df.to_string(index=False))

labels = np.load(cifar_10_c_labels)

# Class names indexed by the integer ids stored in labels.npy.
CLASSES = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck",
]


def _class_counts(label_ids):
    """Return how often each class name occurs among the given label ids."""
    return pd.Series(label_ids).map(lambda i: CLASSES[int(i)]).value_counts()


overall = _class_counts(labels).sort_index()
print("\nlabel counts from labels.npy (all severities combined):")
print(overall.to_string())

# Severity s uses the s-th consecutive block of 10,000 labels; counts are
# aligned to CLASSES so every column lists the classes in the same order.
per_sev = [
    _class_counts(labels[10_000 * (s - 1):10_000 * s])
    .reindex(CLASSES, fill_value=0)
    .rename(f"s{s}")
    for s in range(1, 6)
]

per_sev_df = pd.concat(per_sev, axis=1)
print("\nlabel counts per severity (from labels.npy):")
print(per_sev_df.to_string())

CIFAR-10-C Summary:

corruptions found: ['brightness', 'contrast', 'defocus_blur', 'elastic_transform', 'fog', 'frost', 'gaussian_blur', 'gaussian_noise', 'glass_blur', 'impulse_noise', 'jpeg_compression', 'motion_blur', 'pixelate', 'saturate', 'shot_noise', 'snow', 'spatter', 'speckle_noise', 'zoom_blur']

counts by corruption and severity:
       corruption  severity  count
       brightness         1  10000
       brightness         2  10000
       brightness         3  10000
       brightness         4  10000
       brightness         5  10000
         contrast         1  10000
         contrast         2  10000
         contrast         3  10000
         contrast         4  10000
         contrast         5  10000
     defocus_blur         1  10000
     defocus_blur         2  10000
     defocus_blur         3  10000
     defocus_blur         4  10000
     defocus_blur         5  10000
elastic_transform         1  10000
elastic_transform         2  10000
elastic_transform         