In [10]:
from dask.distributed import Client, LocalCluster
import time
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split, RandomizedSearchCV
from dask_ml.preprocessing import MinMaxScaler
from dask_ml.wrappers import ParallelPostFit
import joblib
from scipy.stats import uniform, loguniform

In [11]:
!ifconfig

em1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 9000
        inet 10.99.252.209  netmask 255.255.0.0  broadcast 10.99.255.255
        inet6 fe80::425c:fdff:fe78:76f6  prefixlen 64  scopeid 0x20<link>
        ether 40:5c:fd:78:76:f6  txqueuelen 1000  (Ethernet)
        RX packets 10304440050  bytes 72694729140074 (66.1 TiB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 12991805886  bytes 90338772785068 (82.1 TiB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0
        device interrupt 65  memory 0x93000000-937fffff  

lo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536
        inet 127.0.0.1  netmask 255.0.0.0
        inet6 ::1  prefixlen 128  scopeid 0x10<host>
        loop  txqueuelen 1000  (Local Loopback)
        RX packets 132858251  bytes 5134217238253 (4.6 TiB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 132858251  bytes 5134217238253 (4.6 TiB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0



In [12]:
def initialize_cluster(n_workers, threads_per_worker, memory_limit, processes,
                       host='10.99.252.209'):
    """Start a Dask ``LocalCluster`` and connect a ``Client`` to it.

    Parameters
    ----------
    n_workers : int
        Number of workers the cluster is started with.
    threads_per_worker : int
        Number of threads given to each worker.
    memory_limit : str or int
        Per-worker memory limit, forwarded unchanged to ``LocalCluster``.
    processes : bool
        Forwarded unchanged to ``LocalCluster(processes=...)``.
    host : str, optional
        Address the cluster is bound to. The default is the address this
        notebook was originally written for; pass another value (for example
        ``'127.0.0.1'``) when running on a different machine.

    Returns
    -------
    tuple
        ``(client, cluster)``. The caller is responsible for closing both.

    Raises
    ------
    Exception
        Whatever ``LocalCluster`` or ``Client`` raises. If the client cannot
        be created, the already-started cluster is closed before re-raising.
    """
    cluster = LocalCluster(
        n_workers=n_workers,
        threads_per_worker=threads_per_worker,
        memory_limit=memory_limit,
        processes=processes,
        host=host,
    )
    try:
        client = Client(cluster)
    except Exception:
        # Do not leave an orphaned cluster behind when the client fails to
        # connect; the caller never receives a handle it could close.
        cluster.close()
        raise

    print(f"Dask Dashboard Link: {client.dashboard_link}")
    # No explicit cluster.scale(n_workers) here: the cluster was already
    # created with n_workers workers, so scaling to the same count is a no-op.
    return client, cluster


In [13]:
def data_loading_and_preprocessing(client, dataset_path, features, target,
                                   test_size=0.2, random_state=42):
    """Load the CSV with Dask, split it, then scale and extend the features.

    The train/test split is made on the raw columns first and the
    ``MinMaxScaler`` is fit on the training part only, so the test rows do not
    influence the scaling parameters. Both parts are then transformed with the
    same fitted scaler and receive the same two interaction columns.

    Parameters
    ----------
    client : object
        Accepted for interface compatibility; not used by this function.
    dataset_path : str
        Path handed to ``dd.read_csv``.
    features : list of str
        Feature columns to select. Must include ``'Temperature(F)'``,
        ``'Visibility(mi)'`` and ``'Wind_Speed(mph)'``, which the interaction
        features are built from.
    target : str
        Name of the label column.
    test_size : float, optional
        Fraction of rows assigned to the test split (default ``0.2``).
    random_state : int, optional
        Seed passed to ``train_test_split`` (default ``42``).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    df = dd.read_csv(dataset_path)
    X = df[features]
    Y = df[target]

    # Split before scaling so that the scaler never sees the test rows.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state, shuffle=False
    )

    scaler = MinMaxScaler()
    scaler.fit(X_train_raw)

    def _scale_and_engineer(part):
        # Scale with the train-fitted parameters, then add the interaction terms.
        scaled = scaler.transform(part)
        scaled['Temp_Wind'] = scaled['Temperature(F)'] * scaled['Wind_Speed(mph)']
        scaled['Visibility_Wind'] = scaled['Visibility(mi)'] * scaled['Wind_Speed(mph)']
        return scaled

    X_train = _scale_and_engineer(X_train_raw)
    X_test = _scale_and_engineer(X_test_raw)

    return X_train, X_test, y_train, y_test

In [14]:
def model_training_and_evaluation(client, X_train, X_test, y_train, y_test,
                                  n_iter=10, cv=3, random_state=42):
    """Tune an SVM with ``RandomizedSearchCV``, predict, and score it.

    The reported time covers the hyper-parameter search and the materialised
    prediction on ``X_test``. Scoring itself is not timed.

    Parameters
    ----------
    client : object
        Accepted for interface compatibility; not used by this function.
    X_train, y_train
        Training features and labels passed to ``search.fit``.
    X_test, y_test
        Held-out features and labels. ``y_test`` and the prediction result
        must expose ``.compute()``.
    n_iter : int, optional
        Number of parameter settings sampled by the search (default ``10``).
    cv : int, optional
        Number of cross-validation folds (default ``3``).
    random_state : int, optional
        Seed used for both the ``SVC`` and the search (default ``42``).

    Returns
    -------
    dict
        ``{"accuracy": float, "time": float, "best_params": dict}`` where
        ``time`` is elapsed seconds.
    """
    # perf_counter is monotonic, so the measurement cannot be distorted by
    # system clock adjustments during a long search.
    start_time = time.perf_counter()

    # Define the parameter space
    param_space = {
        'C': loguniform(1e-3, 1e3),
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'gamma': loguniform(1e-4, 1e0),
    }

    with joblib.parallel_backend('dask'):
        base_model = SVC(random_state=random_state)
        search = RandomizedSearchCV(
            estimator=base_model,
            param_distributions=param_space,
            n_iter=n_iter,  # Number of parameter settings sampled
            cv=cv,
            random_state=random_state
        )
        search.fit(X_train, y_train)

        best_model = ParallelPostFit(search.best_estimator_)
        # Materialise the prediction inside the timed region: predict() hands
        # back an object that still needs .compute(), so stopping the clock
        # before computing would leave the prediction work unmeasured.
        y_pred = best_model.predict(X_test).compute()

    elapsed = time.perf_counter() - start_time

    accuracy = accuracy_score(y_test.compute(), y_pred)

    return {"accuracy": accuracy, "time": elapsed, "best_params": search.best_params_}

In [15]:
def visualize_performance(cluster_results):
    """Draw three stacked line charts: run time, speedup and efficiency.

    ``cluster_results`` maps a configuration label to a dict that holds at
    least the keys ``'time'``, ``'speedup'`` and ``'efficiency'``. The labels
    are used as x positions, in the mapping's iteration order.
    """
    labels = list(cluster_results.keys())

    # One row per panel: (metric key, marker, colour, y-axis label, title).
    panels = (
        ('time', 'o', 'orange', 'Time (seconds)', 'Execution Time'),
        ('speedup', 's', 'blue', 'Speedup', 'Speedup'),
        ('efficiency', '^', 'green', 'Efficiency', 'Efficiency'),
    )

    # Collect every series up front so a missing key fails before any drawing.
    series = {}
    for metric, _marker, _colour, _ylabel, _title in panels:
        series[metric] = [cluster_results[label][metric] for label in labels]

    fig, axes = plt.subplots(3, 1, figsize=(12, 18))

    for axis, (metric, marker, colour, ylabel, title) in zip(axes, panels):
        axis.plot(labels, series[metric], marker=marker, color=colour, linewidth=2)
        axis.set_ylabel(ylabel, color=colour)
        axis.set_title(title)
        axis.grid(True)

    plt.tight_layout()
    plt.show()

In [20]:
def main():
    """Benchmark the SVM pipeline across several Dask cluster layouts.

    For every configuration a fresh cluster is started, the data is loaded and
    preprocessed, the model is tuned and scored, and the client and cluster are
    closed again even if a step fails. Speedup and efficiency are reported
    relative to the configuration named ``baseline_1_worker_1_thread``, which
    must be the first entry so its time is known before the others run.

    Efficiency is the speedup divided by the growth in total worker threads
    (``n_workers * threads_per_worker``) relative to the baseline, so the
    baseline itself has efficiency 1 regardless of its size.

    Returns
    -------
    dict
        Maps each configuration name to a dict with the keys ``accuracy``,
        ``time``, ``speedup``, ``efficiency`` and ``best_params``.

    Raises
    ------
    ValueError
        If a measured time is not positive, or a non-baseline configuration
        runs before the baseline time has been recorded.
    """
    dataset_path = "Cleaned_US_Accidents_March23.csv"
    features = ['Temperature(F)', 'Visibility(mi)', 'Wind_Speed(mph)']
    target = 'Severity'

    baseline_name = "baseline_1_worker_1_thread"

    # Cluster configurations. Every entry runs its workers as separate
    # processes (processes=True).
    # NOTE(review): the labels do not describe the parameters — the "baseline"
    # entry uses 20 workers x 2 threads and the entries "2".."10" use 10..18
    # workers. The labels are kept unchanged because they are also the plot
    # labels; confirm which of the two (labels or parameters) is intended.
    cluster_configs = {
        "baseline_1_worker_1_thread": {"n_workers": 20, "threads_per_worker": 2, "memory_limit": "8GB", "processes": True},
        "2": {"n_workers": 10, "threads_per_worker": 1, "memory_limit": "8GB", "processes": True},
        "4": {"n_workers": 12, "threads_per_worker": 1, "memory_limit": "8GB", "processes": True},
        "6": {"n_workers": 14, "threads_per_worker": 1, "memory_limit": "8GB", "processes": True},
        "8": {"n_workers": 16, "threads_per_worker": 1, "memory_limit": "8GB", "processes": True},
        "10": {"n_workers": 18, "threads_per_worker": 1, "memory_limit": "8GB", "processes": True},
    }

    baseline_time = None
    baseline_slots = None
    cluster_results = {}

    for config_name, config_params in cluster_configs.items():
        print(f"Testing {config_name}...")

        # Total worker threads available in this configuration.
        slots = config_params["n_workers"] * config_params["threads_per_worker"]

        client, cluster = initialize_cluster(**config_params)

        try:
            X_train, X_test, y_train, y_test = data_loading_and_preprocessing(
                client,
                dataset_path,
                features,
                target
            )
            result = model_training_and_evaluation(client, X_train, X_test, y_train, y_test)
            run_time = result["time"]

            if config_name == baseline_name:
                if run_time is None or run_time <= 0:
                    raise ValueError("Baseline time is not set or invalid. Ensure the baseline configuration runs correctly.")
                baseline_time = run_time
                baseline_slots = slots
                speedup = 1.0
                efficiency = 1.0
            else:
                # The baseline must have run first; everything is relative to it.
                if baseline_time is None:
                    raise ValueError("Baseline time has not been initialized. Run the baseline configuration first.")
                if run_time <= 0:
                    raise ValueError(f"Invalid run time for {config_name}: {run_time!r}")
                speedup = baseline_time / run_time
                # Normalise by how many times more (or fewer) threads this
                # configuration has than the baseline, not by the raw worker
                # count, so the value is comparable with the baseline's 1.0.
                efficiency = speedup * baseline_slots / slots

            cluster_results[config_name] = {
                "accuracy": result["accuracy"],
                "time": run_time,
                "speedup": speedup,
                "efficiency": efficiency,
                "best_params": result["best_params"]
            }

            print(f"Best parameters for {config_name}: {result['best_params']}")

        finally:
            # Always release the cluster before starting the next configuration.
            client.close()
            cluster.close()

    visualize_performance(cluster_results)
    return cluster_results

In [None]:
# Entry point: run the whole benchmark only when this code is executed as the
# main module, not when its functions are imported from elsewhere.
if __name__ == "__main__":
    main()