# Data loading

The first step of the Machine Learning lifecycle is to load, cleanup and process datasets into memory. Commonly, certain columns are meant to be processed in different data formats, so functions such as `str_column_to_float(...)` parse entire columns into language-specific data-types like integers, floats, etc.

In [9]:
from csv import reader

In [10]:
def load_csv(filename: str):
  dataset = []
  with open(filename, "r") as file:
    for row in reader(file):
      if row:
        dataset.append(row)
  return dataset


def str_column_to_float(dataset: list[list[str]], col_idx: int):
  _dataset = dataset.copy()
  for row in _dataset:
    row[col_idx] = float(row[col_idx].strip())
  return _dataset


def str_column_to_int(dataset: list[list[str]], col_idx: int):
  class_values = set(row[col_idx] for row in dataset)
  class_idx_map = {}
  for i, val in enumerate(class_values):
    class_idx_map[val] = i
  dataset_ = dataset.copy()
  for row in dataset_:
    row[col_idx] = class_idx_map[row[col_idx]]
  return dataset_, class_idx_map


In [14]:
filename = "../datasets/iris.csv"
dataset = load_csv(filename)
display(f'Loaded {filename} with {len(dataset)} rows and {len(dataset[0])} columns')
display(dataset[:5])

'Loaded ../datasets/iris.csv with 150 rows and 5 columns'

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
 ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa']]

In [15]:
dataset = str_column_to_float(dataset, 0)
display(dataset[:5])

[[5.1, '3.5', '1.4', '0.2', 'Iris-setosa'],
 [4.9, '3.0', '1.4', '0.2', 'Iris-setosa'],
 [4.7, '3.2', '1.3', '0.2', 'Iris-setosa'],
 [4.6, '3.1', '1.5', '0.2', 'Iris-setosa'],
 [5.0, '3.6', '1.4', '0.2', 'Iris-setosa']]

In [16]:
dataset, class_map = str_column_to_int(dataset, -1)
display(class_map)
display(dataset[:5])

{'Iris-versicolor': 0, 'Iris-setosa': 1, 'Iris-virginica': 2}

[[5.1, '3.5', '1.4', '0.2', 1],
 [4.9, '3.0', '1.4', '0.2', 1],
 [4.7, '3.2', '1.3', '0.2', 1],
 [4.6, '3.1', '1.5', '0.2', 1],
 [5.0, '3.6', '1.4', '0.2', 1]]