In [None]:
:dep polars = { version = "0.35.4", features = ["describe", "to_dummies", "ndarray", "random"] }
:dep polars-core
:dep candle-core
:dep thiserror

In [None]:
use polars::prelude::*;
use polars_core::prelude::*;
use candle_core::{Device, Tensor};
use polars::frame::DataFrame;
use std::path::Path;

In [None]:
/// Reads the CSV file at `csv_file_path` into a [`DataFrame`], treating the
/// first line as the header row.
///
/// # Panics
///
/// Panics with `"Cannot open file."` if the reader cannot be created for the
/// path, and panics (via `unwrap`) if reading/parsing the file fails.
fn read_data_frame_from_csv(csv_file_path: &Path) -> DataFrame {
    let reader = CsvReader::from_path(csv_file_path).expect("Cannot open file.");
    let parsed = reader.has_header(true).finish();
    parsed.unwrap()
}

In [None]:
use std::fmt;

#[derive(thiserror::Error, Debug)]
pub struct MyError {
    details: String
}

/// Displays a [`MyError`] as its bare `details` message, with no prefix or
/// extra formatting.
impl fmt::Display for MyError {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.write_str(&self.details)
    }
}

impl From<PolarsError> for MyError {
    fn from(error: PolarsError) -> Self {
        MyError {
            details: format!("{}", error),
        }
    }
}

/// Shorthand for a `Result` whose error type is [`MyError`].
pub type MyResult<T> = std::result::Result<T, MyError>;

fn dataframe_to_tensor(df: &DataFrame) -> MyResult<Tensor> {
    let n_rows = df.height();
    let n_cols = df.width();

    // Collect DataFrame values into a Vec of Vecs
    let mut values = Vec::with_capacity(n_cols);
    for col in df.iter() {
        let col_vec: Vec<f64> = col
            .iter()
            .map(|val| val.extract::<f64>().unwrap())
            .collect();
        values.push(col_vec);
    };

    // Create Tensor from flattened Vec of Vecs
    Ok(Tensor::from_vec(
        values.into_iter().flatten().collect(),
        (n_cols, n_rows),
        &Device::Cpu,
    )
    .expect("error")
    .t()
    .expect("error"))
}

In [None]:
/// A train/test split held as four tensors: inputs (`X_*`) and targets
/// (`y_*`) for each of the two partitions.
struct DataSet {
    /// Inputs of the training partition.
    pub X_train: Tensor,
    /// Targets of the training partition.
    pub y_train: Tensor,
    /// Inputs of the test partition.
    pub X_test: Tensor,
    /// Targets of the test partition.
    pub y_test: Tensor,
}

impl DataSet {
    pub fn new(X_train: Tensor, y_train: Tensor, X_test: Tensor, y_test: Tensor) -> Self {
        Self {
            X_train,
            y_train,
            X_test,
            y_test,
        }
    }

    pub fn from_df(
        df: &DataFrame, train_frac: f64, input_columns: &[&str], predict_columns: &[&str], seed: Option<u64>
    ) -> Result<Self> {
        let shuffled_df = df.sample_frac(&Series::new("frac", &[1.0]), false, true, Some(42))?;  // shuffle the dataframe
        
        let n_rows = df.height();
        let n_train_examples = (train_frac * n_rows as f64) as usize;

        let df_train = shuffled_df.slice(0, n_train_examples);
        let df_test = shuffled_df.slice(n_train_examples as i64, n_rows);

        let X_train = dataframe_to_tensor(&df_train.select(input_columns)?)?;
        let y_train = dataframe_to_tensor(&df_train.select(predict_columns)?)?;
        let X_test = dataframe_to_tensor(&df_test.select(input_columns)?)?;
        let y_test = dataframe_to_tensor(&df_test.select(predict_columns)?)?;

        Ok(Self::new(X_train, y_train, X_test, y_test))
    }
}

In [None]:
// Load "Iris.csv" (a relative path) into a DataFrame.
// `iris_df` is declared `mut` because a later cell modifies it in place.
let iris_file_path: &Path = Path::new("Iris.csv");
let mut iris_df: DataFrame = read_data_frame_from_csv(iris_file_path);

In [None]:
// Display summary statistics for the frame; `None` leaves the percentile
// selection to polars (presumably its defaults - confirm in the polars docs).
iris_df.describe(None)?

In [None]:
// Preview the first five rows of the raw data.
iris_df.head(Some(5))

In [None]:
// One-hot encode `Species` in place: build dummy columns from it, append them
// to the frame, then drop the original `Species` column.
iris_df
.hstack_mut(
    iris_df["Species"]
    .to_dummies(None, false)?
    .get_columns()
)?
.drop_in_place("Species")?;

In [None]:
// Preview the first five rows again, now with the dummy columns in place of
// `Species`.
iris_df.head(Some(5))

In [None]:
// Build the train/test tensors: a 0.8 training fraction, the four measurement
// columns as inputs, the three `Species_*` dummy columns as targets, and 42
// passed as the seed argument.
let dataset = DataSet::from_df(
    &iris_df,
    0.8,
    &["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    &["Species_Iris-setosa", "Species_Iris-versicolor", "Species_Iris-virginica"],
    Some(42),
)?;

In [None]:
// Display the shapes of the four tensors as a sanity check on the split.
(dataset.X_train.shape(), dataset.y_train.shape(), dataset.X_test.shape(), dataset.y_test.shape())

---