1.  Make classification data and get it ready

For this, we used Claude.AI to help us write a Rust equivalent of `make_circles`.

In [2]:
:dep rand = "0.8"
:dep rand_distr = "0.4"

In [3]:
use rand::Rng;
use rand::seq::SliceRandom;
use rand_distr::{Distribution, Normal};

/// Produce a two-class toy dataset made of two concentric rings of 2D points.
///
/// Angles are drawn uniformly at random, so points are scattered around each
/// ring rather than evenly spaced. The outer ring receives `n_samples / 2`
/// points and the inner ring the remainder, so the inner ring gets the extra
/// point when `n_samples` is odd.
///
/// # Arguments
///
/// * `n_samples` - Total number of points across both rings
/// * `noise` - Standard deviation of the zero-mean Gaussian jitter added to
///   every coordinate; no jitter is added unless `noise > 0.0`
/// * `factor` - Radius of the inner ring (the outer ring has radius 1)
/// * `shuffle` - When `true`, points and labels are returned in a random order
///
/// # Returns
///
/// A tuple containing:
/// * A Vec<[f64; 2]> with one `[x, y]` point per sample
/// * A Vec<i32> with the matching labels (0 for outer circle, 1 for inner circle)
///
/// # Panics
///
/// Panics if `Normal::new` rejects `noise` as a standard deviation.
pub fn make_circles(
    n_samples: usize,
    noise: f64,
    factor: f64,
    shuffle: bool,
) -> (Vec<[f64; 2]>, Vec<i32>) {
    let mut rng = rand::thread_rng();

    // Split the budget between the two rings.
    let outer_count = n_samples / 2;
    let inner_count = n_samples - outer_count;

    let mut features: Vec<[f64; 2]> = Vec::with_capacity(n_samples);
    let mut labels: Vec<i32> = Vec::with_capacity(n_samples);

    // Outer ring: unit radius, label 0.
    features.extend((0..outer_count).map(|_| {
        let angle = rng.r#gen::<f64>() * 2.0 * std::f64::consts::PI;
        [angle.cos(), angle.sin()]
    }));
    labels.resize(outer_count, 0);

    // Inner ring: radius `factor`, label 1.
    features.extend((0..inner_count).map(|_| {
        let angle = rng.r#gen::<f64>() * 2.0 * std::f64::consts::PI;
        [angle.cos() * factor, angle.sin() * factor]
    }));
    labels.resize(n_samples, 1);

    // Jitter every coordinate independently, x before y for each point.
    if noise > 0.0 {
        let jitter = Normal::new(0.0, noise).unwrap();
        for [px, py] in features.iter_mut() {
            *px += jitter.sample(&mut rng);
            *py += jitter.sample(&mut rng);
        }
    }

    if !shuffle {
        return (features, labels);
    }

    // Shuffle a list of positions, then gather points and labels through it so
    // each point stays paired with its own label.
    let mut order: Vec<usize> = (0..n_samples).collect();
    order.shuffle(&mut rng);
    order
        .into_iter()
        .map(|i| (features[i], labels[i]))
        .unzip()
}

In [4]:
// Total number of points to generate across both circles.
let n_samples: usize = 1000;

In [5]:
// Generate `n_samples` shuffled points with Gaussian noise of standard
// deviation 0.03 and an inner-circle scale factor of 0.75. The binding is never
// mutated (its fields are only moved out afterwards), so it does not need `mut`.
let result = make_circles(n_samples, 0.03f64, 0.75f64, true);

In [6]:
// Split the returned tuple: `X` takes the 2D points and `y` the class labels.
// The capital `X` presumably follows the usual ML feature-matrix naming rather
// than Rust's snake_case convention.
let X = result.0;
let y = result.1;

In [None]:
// Peek at the first five points and labels. The `[..5]` slices panic if fewer
// than five samples were generated.
println!("First 5 X features\n{:?}", &X[..5]);
println!("First 5 y labels\n{:?}", &y[..5]);

First 5 X features
[[0.7297431463910832, -0.14644919922976776], [0.6389028772090733, -0.5014708842499602], [-0.2896800900177738, -0.7697833829654961], [-1.0251343101954196, 0.2226558965869214], [0.9253492533878728, 0.32330910354959597]]
First 5 y labels


#### Let's do some exploratory data analysis (EDA) on this dataset. Rather than using pandas, we will use Polars.

In [8]:
:dep polars

[1, 1, 1, 0, 0]


In [9]:
use polars::prelude::*;

Make a dataframe of circle data

To do this, we need to create 3 series
* X1 from the first coordinate of each point in X (`v[0]`)
* X2 from the second coordinate of each point in X (`v[1]`)
* label from y

In [10]:
// Split the points into per-coordinate columns: X1 holds each point's first
// component and X2 its second.
let X1: Vec<f64> = X.iter().map(|point| point[0]).collect();
let X2: Vec<f64> = X.iter().map(|point| point[1]).collect();

In [11]:
// Assemble the two coordinate columns and the labels into a Polars DataFrame.
// The trailing `?` propagates any error reported by the `df!` macro
// (presumably e.g. mismatched column lengths — TODO confirm).
let mut df = df!(
    "X1" => &X1,
    "X2" => &X2,
    "label" => &y
)?;


In [18]:
// Show the first 10 rows of the DataFrame.
println!("{}", df.head(Some(10)));

shape: (10, 3)
┌───────────┬───────────┬───────┐
│ X1        ┆ X2        ┆ label │
│ ---       ┆ ---       ┆ ---   │
│ f64       ┆ f64       ┆ i32   │
╞═══════════╪═══════════╪═══════╡
│ 0.729743  ┆ -0.146449 ┆ 1     │
│ 0.638903  ┆ -0.501471 ┆ 1     │
│ -0.28968  ┆ -0.769783 ┆ 1     │
│ -1.025134 ┆ 0.222656  ┆ 0     │
│ 0.925349  ┆ 0.323309  ┆ 0     │
│ -1.040104 ┆ 0.405431  ┆ 0     │
│ -0.623264 ┆ 0.749152  ┆ 0     │
│ 0.498709  ┆ 0.849855  ┆ 0     │
│ 0.672392  ┆ 0.32645   ┆ 1     │
│ 0.851452  ┆ 0.561597  ┆ 0     │
└───────────┴───────────┴───────┘


Let's see the distribution of labels

In [19]:
{
    // Fetch the "label" column and view it as a Series so it can be tallied.
    // `?` propagates a failed column lookup; `unwrap()` panics if the column
    // cannot be viewed as a Series.
    let label_series = df.column("label")?.as_series().unwrap();
    // Count how many rows carry each distinct label.
    // NOTE(review): the positional arguments are presumably
    // (sort, parallel, name, normalize); the defaulted name would explain the
    // unnamed count column in the output — confirm against the polars version in use.
    let label_counts = label_series.value_counts(false, false, Default::default(), false)?;
    println!("{}", label_counts);
}

shape: (2, 2)
┌───────┬─────┐
│ label ┆     │
│ ---   ┆ --- │
│ i32   ┆ u32 │
╞═══════╪═════╡
│ 1     ┆ 500 │
│ 0     ┆ 500 │
└───────┴─────┘


()

500 each, nice and balanced

Now we will use plotters-rs to actually plot out the features.

In [14]:
:dep plotters = { version = "^0.3.0", default-features = false, features = ["evcxr", "all_series"] }

In [15]:
extern crate plotters;
use plotters::prelude::*;

In [16]:
// Sanity-check the first five X1 values before plotting.
println!("{:?}", &X1[0..5]);

[0.7297431463910832, 0.6389028772090733, -0.2896800900177738, -1.0251343101954196, 0.9253492533878728]


In [20]:
{
    // Map each class label to a marker colour: 0 -> red, 1 -> blue, anything
    // else -> green.
    let color_map = |label: i32| -> RGBColor {
        match label {
            0 => RED,
            1 => BLUE,
            _ => GREEN,
        }
    };

    // Pair every point's coordinates with its label. The columns are borrowed
    // and their elements copied, so the source vectors are not cloned.
    let data: Vec<(f64, f64, i32)> = X1
        .iter()
        .zip(X2.iter())
        .zip(y.iter())
        .map(|((&x1, &x2), &label)| (x1, x2, label))
        .collect();

    evcxr_figure((640, 480), |root| {
        // Build a chart context with a fixed [-1.2, 1.2] range on both axes.
        let mut chart = ChartBuilder::on(&root)
            .caption("Data Distribution", ("Arial", 20).into_font())
            .x_label_area_size(20)
            .y_label_area_size(20)
            .margin(17)
            .build_cartesian_2d(-1.2f64..1.2f64, -1.2f64..1.2f64)?;

        // Draw the axes without grid lines.
        chart
            .configure_mesh()
            .disable_x_mesh()
            .disable_y_mesh()
            .draw()?;

        // Draw all points as one series of filled circles, coloured by label.
        chart.draw_series(data.iter().map(|&(x1, x2, label)| {
            Circle::new((x1, x2), 4, color_map(label).filled())
        }))?;

        Ok(())
    }).style("width: 70%")
}