In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kaggle

# Define the directory to save the dataset
data_dir = 'datasets/unsw_nb15'
os.makedirs(data_dir, exist_ok=True)

# Download the UNSW-NB15 dataset from Kaggle
kaggle.api.dataset_download_files('mrwellsdavid/unsw-nb15', path=data_dir, unzip=True)

# Path to the downloaded CSV file
file_path = os.path.join(data_dir, 'UNSW-NB15.csv')

# Check if the file exists
if os.path.exists(file_path):
    swift_data = pd.read_csv(file_path)
else:
    print(f"Error: The file '{file_path}' does not exist.")

    # raise FileNotFoundError(f"The file '{file_path}' was not found.")
    
# Proceed only if the file was loaded successfully
if 'swift_data' in locals():
    # Display basic info about the dataset
    print("Basic Information of the Dataset:")
    swift_data.info()

    # Summary statistics of the dataset
    print("\nSummary Statistics:")
    print(swift_data.describe())

    # Check for missing values
    print("\nMissing Values in Each Column:")
    print(swift_data.isnull().sum())

    # Visualization examples

    # Histogram of transaction amounts
    if 'amount' in swift_data.columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(swift_data['amount'], bins=50)
        plt.title('Distribution of Transaction Amounts')
        plt.xlabel('Amount')
        plt.ylabel('Frequency')
        plt.show()
    else:
        print("Column 'amount' not found in the dataset.")

    # Correlation heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(swift_data.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

    # Example of visualizing categorical data - Distribution of Currency Types
    if 'currency' in swift_data.columns:
        plt.figure(figsize=(10, 6))
        sns.countplot(x='currency', data=swift_data)
        plt.title('Distribution of Currency Types')
        plt.xlabel('Currency')
        plt.ylabel('Count')
        plt.show()
    else:
        print("Column 'currency' not found in the dataset.")

    # Optional: Display the first few rows of the dataset to inspect
    print("\nFirst 5 Rows of the Dataset:")
    print(swift_data.head())
else:
    print("Dataset could not be loaded. Exiting the script.")



Dataset URL: https://www.kaggle.com/datasets/mrwellsdavid/unsw-nb15
Error: The file 'datasets/unsw_nb15/UNSW-NB15.csv' does not exist.
Dataset could not be loaded. Exiting the script.
