In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kaggle

# Define the directory to save the dataset
data_dir = 'datasets/unsw_nb15'
os.makedirs(data_dir, exist_ok=True)

# Download the UNSW-NB15 dataset from Kaggle
kaggle.api.dataset_download_files('mrwellsdavid/unsw-nb15', path=data_dir, unzip=True)

# Path to the downloaded CSV file
file_path = os.path.join(data_dir, 'UNSW_NB15_training-set.csv')

# Check if the file exists
if os.path.exists(file_path):
    swift_data = pd.read_csv(file_path)
else:
    print(f"Error: The file '{file_path}' does not exist.")

    # raise FileNotFoundError(f"The file '{file_path}' was not found.")
    
# Proceed only if the file was loaded successfully
if 'swift_data' in locals():
    # Display basic info about the dataset
    print("Basic Information of the Dataset:")
    swift_data.info()

    # Summary statistics of the dataset
    print("\nSummary Statistics:")
    print(swift_data.describe())

    # Check for missing values
    print("\nMissing Values in Each Column:")
    print(swift_data.isnull().sum())

    # Visualization examples

    # Histogram of transaction amounts
    if 'amount' in swift_data.columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(swift_data['amount'], bins=50)
        plt.title('Distribution of Transaction Amounts')
        plt.xlabel('Amount')
        plt.ylabel('Frequency')
        plt.show()
    else:
        print("Column 'amount' not found in the dataset.")

    # Correlation heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(swift_data.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

    # Example of visualizing categorical data - Distribution of Currency Types
    if 'currency' in swift_data.columns:
        plt.figure(figsize=(10, 6))
        sns.countplot(x='currency', data=swift_data)
        plt.title('Distribution of Currency Types')
        plt.xlabel('Currency')
        plt.ylabel('Count')
        plt.show()
    else:
        print("Column 'currency' not found in the dataset.")

    # Optional: Display the first few rows of the dataset to inspect
    print("\nFirst 5 Rows of the Dataset:")
    print(swift_data.head())
else:
    print("Dataset could not be loaded. Exiting the script.")

Dataset URL: https://www.kaggle.com/datasets/mrwellsdavid/unsw-nb15




Basic Information of the Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82332 entries, 0 to 82331
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 82332 non-null  int64  
 1   dur                82332 non-null  float64
 2   proto              82332 non-null  object 
 3   service            82332 non-null  object 
 4   state              82332 non-null  object 
 5   spkts              82332 non-null  int64  
 6   dpkts              82332 non-null  int64  
 7   sbytes             82332 non-null  int64  
 8   dbytes             82332 non-null  int64  
 9   rate               82332 non-null  float64
 10  sttl               82332 non-null  int64  
 11  dttl               82332 non-null  int64  
 12  sload              82332 non-null  float64
 13  dload              82332 non-null  float64
 14  sloss              82332 non-null  int64  
 15  dloss              82332 non-null  i

ValueError: could not convert string to float: 'udp'

<Figure size 1200x800 with 0 Axes>