# AIT Dataset - Data Loading and Cleaning

This notebook loads the AIT Fox dataset from the Hugging Face Hub and prepares it for analysis and cleaning.

**Dataset:** chYassine/ait-fox-raw-v02
- **Total Logs:** ~5.4M entries
- **Hosts:** 21 unique hosts
- **Log Types:** 11 different types


## 1. Import Required Libraries


In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning so cell output stays readable
warnings.filterwarnings('ignore')

# Pandas display settings, applied through the options namespace:
# no limit on displayed columns, cell text capped at 100 characters, at most 50 rows
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 50

print("✅ Libraries imported successfully")


## 2. Load Dataset from HuggingFace


In [None]:
# Hugging Face Hub repository that holds the raw AIT Fox logs
dataset_repo = "chYassine/ait-fox-raw-v02"

# Announce the load up front
print(f"Loading dataset: {dataset_repo}")
print("This may take a few minutes...\n")

# Only the 'train' split is requested
dataset = load_dataset(path=dataset_repo, split='train')

print("✅ Dataset loaded successfully!")
print(f"   Total entries: {len(dataset):,}")


## 3. Convert to Pandas DataFrame


In [None]:
# Convert to DataFrame
# Dataset.to_pandas() converts the backing Arrow table directly. pd.DataFrame(dataset)
# instead walks the dataset row by row in Python and builds one dict per row, which is
# very slow and memory-hungry for a dataset with millions of entries.
print("Converting to DataFrame...")
df = dataset.to_pandas()

print("✅ DataFrame created successfully!")
print(f"   Shape: {df.shape}")
print(f"   Columns: {list(df.columns)}")


## 4. Initial Data Exploration


In [None]:
# Preview the top of the frame; as the cell's last expression the result is rendered by the notebook
print("First 5 rows of the dataset:")
df.head(n=5)


In [None]:
# Basic information about the frame.
# df.info() prints its summary itself, so it is called bare rather than
# wrapped in print().
print("Dataset Information:")
df.info()


In [None]:
# Show the dtype pandas assigned to each column (header and table in one print call)
print("\nData Types:", df.dtypes, sep="\n")


In [None]:
# Statistical summary for numerical columns.
# describe() is the last expression of the cell, so the notebook renders the
# returned summary table below the printed header.
print("Statistical Summary:")
df.describe()


## 5. Check for Missing Values


In [None]:
# Count missing values per column.
# The null scan runs once and the percentage is derived from the counts,
# instead of calling df.isnull().sum() a second time over the whole frame.
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

# One row per column: name, absolute count, and share of all rows
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Count': missing_values.values,
    'Missing Percentage': missing_percentage.values
})

print("Missing Values Analysis:")
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]

# Print the table only when it has rows; otherwise an "Empty DataFrame" dump
# would appear right before the "no missing values" message.
if columns_with_missing.empty:
    print("\n✅ No missing values found!")
else:
    print(columns_with_missing)


## 6. Check for Duplicates


In [None]:
# Count rows that are exact copies of an earlier row
total_rows = len(df)
duplicates = df.duplicated().sum()
print(f"Total duplicate rows: {duplicates:,}")
print(f"Percentage of duplicates: {(duplicates / total_rows) * 100:.2f}%")

# Repeat the check on the content/host/log_type key, but only when all three columns exist
dedup_key = ['content', 'host', 'log_type']
if all(column in df.columns for column in dedup_key):
    duplicates_subset = df.duplicated(subset=dedup_key).sum()
    print(f"\nDuplicates based on content+host+log_type: {duplicates_subset:,}")
    print(f"Percentage: {(duplicates_subset / total_rows) * 100:.2f}%")


## 7. Categorical Data Analysis


In [None]:
# Number of log entries per host; value_counts() sorts by descending count
if 'host' in df.columns:
    print("Host Distribution:")
    host_counts = df['host'].value_counts()
    unique_host_total = len(host_counts)
    print(f"\nTotal unique hosts: {unique_host_total}")
    print("\nTop 10 Hosts:")
    print(host_counts.iloc[:10])


In [None]:
# Bar chart of the 15 busiest hosts (uses host_counts from the previous cell)
if 'host' in df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    host_counts.head(15).plot(kind='bar', ax=ax)
    ax.set_title('Top 15 Hosts by Log Count')
    ax.set_xlabel('Host')
    ax.set_ylabel('Count')
    # Slant the host names so long labels do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()


In [None]:
# Number of log entries per log type; value_counts() sorts by descending count
if 'log_type' in df.columns:
    print("Log Type Distribution:")
    log_type_counts = df['log_type'].value_counts()
    unique_log_type_total = len(log_type_counts)
    print(f"\nTotal unique log types: {unique_log_type_total}")
    print("\nLog Type Breakdown:")
    print(log_type_counts)


In [None]:
# Bar chart of every log type (uses log_type_counts from the previous cell)
if 'log_type' in df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    log_type_counts.plot(kind='bar', ax=ax)
    ax.set_title('Log Type Distribution')
    ax.set_xlabel('Log Type')
    ax.set_ylabel('Count')
    # Slant the type names so long labels do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()


In [None]:
# Split of entries flagged as binary versus text
if 'is_binary' in df.columns:
    print("Binary vs Text Logs:")
    binary_counts = df['is_binary'].value_counts()
    print(binary_counts)

    # .get(..., 0) covers the case where one of the two flags never occurs
    text_total = binary_counts.get(False, 0)
    binary_total = binary_counts.get(True, 0)
    print(f"\nText logs: {text_total:,}")
    print(f"Binary files: {binary_total:,}")
    print(f"Binary percentage: {(binary_total / len(df)) * 100:.2f}%")


## 8. Content Analysis


In [None]:
# Analyze content length for text logs
if 'content' in df.columns and 'is_binary' in df.columns:
    # Keep only rows explicitly flagged as non-binary; .copy() so the new column
    # below is added to an independent frame, not to a view of df
    text_logs = df[df['is_binary'] == False].copy()

    # Calculate content length.
    # Missing content is replaced by '' before the string cast: astype(str) alone
    # would turn nulls into the literal text 'nan'/'None' and count them as
    # 3-4 character logs, skewing the statistics.
    text_logs['content_length'] = text_logs['content'].fillna('').astype(str).str.len()

    print("Content Length Analysis (Text Logs Only):")
    print(f"Total text logs: {len(text_logs):,}")
    print("\nContent Length Statistics:")
    print(text_logs['content_length'].describe())


In [None]:
# Visualize content length distribution (uses text_logs from the previous cell)
if 'content' in df.columns and 'is_binary' in df.columns:
    content_lengths = text_logs['content_length']

    # Limit to the 95th percentile for better visualization. The limit is also
    # passed to hist() as `range`, so all 50 bins fall inside the visible window;
    # binning over the full range would let a few very long entries stretch the
    # bins and leave only one or two bars in view.
    upper_limit = content_lengths.quantile(0.95)
    # NaN (no text logs) or 0 (all content empty) gives no usable window, so
    # fall back to matplotlib's automatic range; `NaN > 0` is False.
    visible_range = (0, upper_limit) if upper_limit > 0 else None

    plt.figure(figsize=(12, 6))
    plt.hist(content_lengths, bins=50, range=visible_range, edgecolor='black')
    plt.title('Distribution of Content Length (Text Logs)')
    plt.xlabel('Content Length (characters)')
    plt.ylabel('Frequency')
    if visible_range is not None:
        plt.xlim(*visible_range)
    plt.tight_layout()
    plt.show()


In [None]:
# Count rows whose content is null or contains nothing but whitespace
if 'content' in df.columns:
    content_is_null = df['content'].isna()
    content_is_blank = df['content'].astype(str).str.strip().eq('')
    empty_content = df[content_is_null | content_is_blank]

    empty_total = len(empty_content)
    print(f"Rows with empty or null content: {empty_total:,}")
    print(f"Percentage: {(empty_total / len(df)) * 100:.2f}%")


## 9. Cross-Analysis: Host vs Log Type


In [None]:
# Count entries for every (host, log_type) pair: hosts as rows, log types as columns
if 'host' in df.columns and 'log_type' in df.columns:
    print("Host-Log Type Cross Analysis:")
    cross_tab = pd.crosstab(index=df['host'], columns=df['log_type'])
    print("\nCross-tabulation (showing first 10 hosts):")
    print(cross_tab.iloc[:10])


In [None]:
# Rank hosts by how many distinct log types each one produces
if 'host' in df.columns and 'log_type' in df.columns:
    distinct_types_per_host = df.groupby('host')['log_type'].nunique()
    host_log_variety = distinct_types_per_host.sort_values(ascending=False)
    print("Hosts with Most Log Type Variety:")
    print(host_log_variety)


## 10. Sample Data Inspection


In [None]:
# Display one random sample row for each of the first five log types (alphabetical order)
if 'log_type' in df.columns and 'content' in df.columns:
    print("Sample Logs from Each Log Type:\n")
    # Drop missing log types first: sorted() raises TypeError when it has to
    # compare NaN/None with strings.
    log_types = df['log_type'].dropna().unique()

    for log_type in sorted(log_types)[:5]:  # Show first 5 log types
        print(f"\n{'='*80}")
        print(f"Log Type: {log_type}")
        print('='*80)
        # Every value in log_types occurs in df, so the filtered frame is never empty
        sample = df[df['log_type'] == log_type].sample(n=1).iloc[0]
        print(f"Host: {sample.get('host', 'N/A')}")
        print(f"Path: {sample.get('path', 'N/A')}")
        content = str(sample.get('content', 'N/A'))
        # Truncate long entries to 300 characters and mark the cut with an ellipsis
        preview = f"{content[:300]}..." if len(content) > 300 else content
        print(f"Content: {preview}")


## 11. Data Cleaning Operations


### 11.1 Remove Duplicates (if needed)


In [None]:
# Remove duplicate rows
# Uncomment the following lines if you want to remove duplicates
# NOTE: the result is stored in `df_cleaned`, while the export and final-summary
# cells of this notebook read `df`. Assign the result back to `df` (or adapt
# those cells) if the de-duplicated data should be used downstream.

# print(f"Original shape: {df.shape}")
# df_cleaned = df.drop_duplicates()
# print(f"After removing duplicates: {df_cleaned.shape}")
# print(f"Removed {len(df) - len(df_cleaned):,} duplicate rows")

# Or remove duplicates based on specific columns:
# df_cleaned = df.drop_duplicates(subset=['content', 'host', 'log_type'], keep='first')

# Placeholder output so the cell visibly ran while every operation is disabled
print("Duplicate removal cell (currently commented out)")


### 11.2 Handle Missing Values


In [None]:
# Handle missing values
# Uncomment and modify based on your needs
# NOTE: the three options are alternatives. Each one starts from `df` and
# overwrites `df_cleaned`, so enabling several keeps only the last result.

# Option 1: Drop rows with missing values in specific columns
# df_cleaned = df.dropna(subset=['content', 'host'])

# Option 2: Fill missing values
# df_cleaned = df.fillna({
#     'content': '',
#     'host': 'unknown',
#     'log_type': 'unknown'
# })

# Option 3: Drop all rows with any missing values
# df_cleaned = df.dropna()

# Placeholder output so the cell visibly ran while every operation is disabled
print("Missing values handling cell (currently commented out)")


### 11.3 Filter Binary Files (if needed)


In [None]:
# Filter out binary files to keep only text logs
if 'is_binary' not in df.columns:
    # Without the flag column there is nothing to filter on
    print("'is_binary' column not found")
else:
    # Uncomment to filter out binary files
    # df_text_only = df[df['is_binary'] == False].copy()
    # print(f"Original dataset: {len(df):,} rows")
    # print(f"Text-only dataset: {len(df_text_only):,} rows")
    # print(f"Removed {len(df) - len(df_text_only):,} binary files")

    # Placeholder output while the filter above stays disabled
    print("Binary filter cell (currently commented out)")


### 11.4 Filter by Host or Log Type


In [None]:
# Filter data by specific hosts or log types
# NOTE: every example starts from `df` and assigns to `df_filtered`, so the
# examples replace rather than combine each other. To chain filters, apply the
# later ones to `df_filtered` instead of `df`.

# Example: Keep only specific hosts
# hosts_to_keep = ['monitoring', 'inet-firewall', 'vpn', 'webserver']
# df_filtered = df[df['host'].isin(hosts_to_keep)]

# Example: Keep only specific log types
# log_types_to_keep = ['suricata', 'apache2', 'audit']
# df_filtered = df[df['log_type'].isin(log_types_to_keep)]

# Example: Exclude specific hosts
# hosts_to_exclude = ['attacker_0']
# df_filtered = df[~df['host'].isin(hosts_to_exclude)]

# Placeholder output so the cell visibly ran while every operation is disabled
print("Filtering cell (currently commented out)")


### 11.5 Clean Content Field


In [None]:
# Clean the content field
if 'content' in df.columns:
    # Example cleaning operations (uncomment as needed)
    # NOTE: unlike the other cleaning cells, these examples modify `df` in place.
    # NOTE(review): astype(str) also stringifies missing values (e.g. to 'nan'),
    # so handle nulls first if they must stay distinguishable - confirm against the data.
    
    # Remove leading/trailing whitespace
    # df['content'] = df['content'].astype(str).str.strip()
    
    # Remove extra whitespace
    # df['content'] = df['content'].astype(str).str.replace(r'\s+', ' ', regex=True)
    
    # Remove specific characters or patterns
    # df['content'] = df['content'].astype(str).str.replace(r'[^\x00-\x7F]+', '', regex=True)  # Remove non-ASCII
    
    # Convert to lowercase (if case-insensitive analysis is needed)
    # df['content_lower'] = df['content'].astype(str).str.lower()
    
    # Placeholder output so the cell visibly ran while every operation is disabled
    print("Content cleaning cell (currently commented out)")


### 11.6 Create Derived Features


In [None]:
# Create useful derived features
if 'content' in df.columns:
    # Each example below adds a new column to `df` in place.

    # Add content length
    # df['content_length'] = df['content'].astype(str).str.len()
    
    # Add word count
    # df['word_count'] = df['content'].astype(str).str.split().str.len()
    
    # Extract timestamp if present in content (example pattern)
    # The pattern only matches the form YYYY-MM-DDTHH:MM:SS.
    # df['timestamp'] = df['content'].astype(str).str.extract(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})')
    
    # Placeholder output so the cell visibly ran while every operation is disabled
    print("Feature engineering cell (currently commented out)")


## 12. Export Cleaned Data


In [None]:
# Export cleaned data to CSV
# Uncomment to save the cleaned data
# NOTE: both variants write `df`. If the cleaning cells stored their result
# under another name (df_cleaned, df_text_only, df_filtered), export that
# frame instead.

# output_file = 'cleaned_ait_fox_data.csv'
# df.to_csv(output_file, index=False)
# print(f"✅ Cleaned data saved to: {output_file}")

# For large datasets, consider using parquet format
# output_file = 'cleaned_ait_fox_data.parquet'
# df.to_parquet(output_file, index=False)
# print(f"✅ Cleaned data saved to: {output_file}")

# Placeholder output so the cell visibly ran while every operation is disabled
print("Export cell (currently commented out)")


## 13. Summary Statistics After Cleaning


In [None]:
# Report the dimensions, column names and memory footprint of `df`
row_count, column_count = df.shape
# deep=True makes memory_usage inspect object columns as well; result converted from bytes to MB
memory_mb = df.memory_usage(deep=True).sum() / (1024**2)

print("Final Dataset Summary:")
print(f"\nShape: {df.shape}")
print(f"Total rows: {row_count:,}")
print(f"Total columns: {column_count}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nMemory usage: {memory_mb:.2f} MB")
