In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

data_dir = Path('../data')
csv_files = list(data_dir.glob('*.csv'))

print(f"Found {len(csv_files)} CSV files:")
for file in csv_files:
    print(f"  - {file.name}")

Found 8 CSV files:
  - Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
  - Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
  - Friday-WorkingHours-Morning.pcap_ISCX.csv
  - Monday-WorkingHours.pcap_ISCX.csv
  - Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
  - Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
  - Tuesday-WorkingHours.pcap_ISCX.csv
  - Wednesday-workingHours.pcap_ISCX.csv


In [2]:

for file in csv_files:
    print(f"\n{'='*60}")
    print(f"File: {file.name}")
    print(f"{'='*60}")
    
    try:
        df_sample = pd.read_csv(file, nrows=5)
        
        print(f"Shape: {df_sample.shape}")
        print(f"\nColumns:")
        for i, col in enumerate(df_sample.columns):
            print(f"  {i+1:2d}. {col}")
        
        print(f"\nData types:")
        print(df_sample.dtypes)
        
        print(f"\nFirst few rows:")
        print(df_sample.head())
        
    except Exception as e:
        print(f"Error reading {file.name}: {e}")


File: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Shape: (5, 79)

Columns:
   1.  Destination Port
   2.  Flow Duration
   3.  Total Fwd Packets
   4.  Total Backward Packets
   5. Total Length of Fwd Packets
   6.  Total Length of Bwd Packets
   7.  Fwd Packet Length Max
   8.  Fwd Packet Length Min
   9.  Fwd Packet Length Mean
  10.  Fwd Packet Length Std
  11. Bwd Packet Length Max
  12.  Bwd Packet Length Min
  13.  Bwd Packet Length Mean
  14.  Bwd Packet Length Std
  15. Flow Bytes/s
  16.  Flow Packets/s
  17.  Flow IAT Mean
  18.  Flow IAT Std
  19.  Flow IAT Max
  20.  Flow IAT Min
  21. Fwd IAT Total
  22.  Fwd IAT Mean
  23.  Fwd IAT Std
  24.  Fwd IAT Max
  25.  Fwd IAT Min
  26. Bwd IAT Total
  27.  Bwd IAT Mean
  28.  Bwd IAT Std
  29.  Bwd IAT Max
  30.  Bwd IAT Min
  31. Fwd PSH Flags
  32.  Bwd PSH Flags
  33.  Fwd URG Flags
  34.  Bwd URG Flags
  35.  Fwd Header Length
  36.  Bwd Header Length
  37. Fwd Packets/s
  38.  Bwd Packets/s
  39.  Min Packet Length
  

In [3]:
# Check for missing values in one file as an example
sample_file = csv_files[0]
print(f"Checking missing values in {sample_file.name}")

df_sample = pd.read_csv(sample_file, nrows=1000)  # Load 1000 rows for analysis

print(f"\nMissing values per column:")
missing_counts = df_sample.isnull().sum()
print(missing_counts[missing_counts > 0])

if missing_counts.sum() == 0:
    print("\nNo missing values found in the sample.")

Checking missing values in Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv

Missing values per column:
Series([], dtype: int64)

No missing values found in the sample.


In [4]:
# Check for unique values in categorical columns
print(f"\nUnique values in categorical columns:")
for col in df_sample.select_dtypes(include=['object']).columns:
    unique_vals = df_sample[col].unique()
    print(f"\n{col}: {len(unique_vals)} unique values")
    if len(unique_vals) <= 10:
        print(f"  Values: {unique_vals}")
    else:
        print(f"  First 10 values: {unique_vals[:10]}")


Unique values in categorical columns:

 Label: 1 unique values
  Values: ['BENIGN']


In [5]:
# Basic statistics for numerical columns
print(f"\nBasic statistics for numerical columns:")
numerical_cols = df_sample.select_dtypes(include=[np.number]).columns
print(df_sample[numerical_cols].describe())


Basic statistics for numerical columns:
        Destination Port   Flow Duration   Total Fwd Packets  \
count        1000.000000    1.000000e+03          1000.00000   
mean         3423.367000    8.171073e+06            16.15200   
std         12493.393136    2.666404e+07            53.97928   
min            21.000000    0.000000e+00             1.00000   
25%            53.000000    2.346000e+04             2.00000   
50%            80.000000    7.743900e+04             2.00000   
75%           443.000000    1.859212e+05            25.25000   
max         60954.000000    1.199523e+08          1611.00000   

        Total Backward Packets  Total Length of Fwd Packets  \
count              1000.000000                   1000.00000   
mean                 19.012000                    641.69200   
std                  65.425594                   1600.66842   
min                   0.000000                      0.00000   
25%                   1.000000                     51.00000   
50% 

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
