Exercise 1: NumPy array Indexing/Slicing

In [8]:
from si.io.csv_file import read_csv

# Loading the dataset iris.csv
iris_dataset = read_csv('../datasets/iris/iris.csv', features=True, label=True)
print("\n1.1) Dataset loaded.")


1.1) Dataset loaded.


In [9]:

# Select the penultimate independent variable
pen_variable = iris_dataset.X[:, -2]
print("\n1.2) Penultimate independent variable (array) shape:", pen_variable.shape)


1.2) Penultimate independent variable (array) shape: (150,)


In [10]:
import numpy as np

# Select the last 10 samples
last_10_samples = iris_dataset.X[-10:, :]

# Calculate the mean of each independent variable
last10_means = np.mean(last_10_samples, axis=0)
print("\n1.3 Mean of last 10 samples (per feature):", last10_means)


1.3 Mean of last 10 samples (per feature): [6.45 3.03 5.33 2.17]


In [4]:
#Select all samples with values <= 6 for all independent variables/features.
mask_samples = np.all(iris_dataset.X <= 6, axis=1)
count_mask = mask_samples.sum()
print("\n1.4) Number of samples where every feature value <= 6:", int(count_mask))


1.4) Number of samples where every feature value <= 6: 89


In [5]:
#Select all samples with class/label different from 'Iris-setosa'.
count_not_setosa = np.sum(iris_dataset.y != 'Iris-setosa')
print("\n1.5) Number of samples with a label other than 'Iris-setosa':", int(count_not_setosa))


1.5) Number of samples with a label other than 'Iris-setosa': 100


Exercise 2: Examples of how to use dropna(), fillna() and remove_by_index() methods 

In [None]:
import sys
sys.path.append("Exercises\ex1.ipynb")  

In [None]:
import numpy as np
from si.data.dataset import Dataset
from si.io.csv_file import read_csv

iris_dataset = read_csv('../datasets/iris/iris.csv', features=True, label=True)
print("Loaded SI dataset:", iris_dataset.X.shape)

Loaded SI dataset: (150, 4)


Dropna()

In [None]:
# Creating a new Dataset with some missing values
iris_dataset_2 = Dataset(
    X=iris_dataset.X.copy(),
    y=iris_dataset.y.copy(),
    features=iris_dataset.features,
    label=iris_dataset.label
)
iris_dataset_2.X[0, 1] = np.nan
iris_dataset_2.X[5, 2] = np.nan
iris_dataset_2.X[10, 3] = np.nan
iris_dataset_2.X[15, 0] = np.nan
iris_dataset_2.X[20, 1] = np.nan
iris_dataset_2.y[25] = np.nan
print("Features values with missing values:\n",iris_dataset_2.X)
print("Shape of X with missing values:",iris_dataset_2.X.shape)
print("Shape of Y with missing values:",iris_dataset_2.y.shape)


Features values with missing values:
 [[5.1 nan 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 nan 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 nan]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [nan 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 nan 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.

In [None]:
dataset_new = iris_dataset_2.dropna()

print("Shape of X without missing values:",dataset_new.X.shape)
print("Shape of y without missing values:",dataset_new.y.shape)
print("Features values without missing values:\n",dataset_new.X)

Shape of X without missing values: (145, 4)
Shape of y without missing values: (145,)
Features values without missing values:
 [[4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]

Fillna()

In [None]:
#Creating another Dataset with some missing values
iris_dataset_3 = Dataset(
    X=iris_dataset.X.copy(),
    y=iris_dataset.y.copy(),
    features=iris_dataset.features,
    label=iris_dataset.label
)
iris_dataset_3.X[0, 1] = np.nan
iris_dataset_3.X[5, 2] = np.nan
iris_dataset_3.X[10, 3] = np.nan
iris_dataset_3.X[15, 0] = np.nan
iris_dataset_3.X[20, 1] = np.nan
iris_dataset_3.y[25] = np.nan

In [20]:
dataset_new_3= iris_dataset_3.fillna("mean")
print("Shape of X after fillna:",dataset_new_3.X.shape)
print("Shape of y after fillna:",dataset_new_3.y.shape)
print("Features after fillna:\n",dataset_new_3.X)

Shape of X after fillna: (150, 4)
Shape of y after fillna: (150,)
Features after fillna:
 [[5.1        3.04864865 1.4        0.2       ]
 [4.9        3.         1.4        0.2       ]
 [4.7        3.2        1.3        0.2       ]
 [4.6        3.1        1.5        0.2       ]
 [5.         3.6        1.4        0.2       ]
 [5.4        3.9        3.77248322 0.4       ]
 [4.6        3.4        1.4        0.3       ]
 [5.         3.4        1.5        0.2       ]
 [4.4        2.9        1.4        0.2       ]
 [4.9        3.1        1.5        0.1       ]
 [5.4        3.7        1.5        1.20536913]
 [4.8        3.4        1.6        0.2       ]
 [4.8        3.         1.4        0.1       ]
 [4.3        3.         1.1        0.1       ]
 [5.8        4.         1.2        0.2       ]
 [5.8442953  4.4        1.5        0.4       ]
 [5.4        3.9        1.3        0.4       ]
 [5.1        3.5        1.4        0.3       ]
 [5.7        3.8        1.7        0.3       ]
 [5.1        3.8 

Remove_by_index()

In [None]:
#Creating another Dataset with some missing values
iris_dataset_4 = Dataset(
    X=iris_dataset.X.copy(),
    y=iris_dataset.y.copy(),
    features=iris_dataset.features,
    label=iris_dataset.label
)
iris_dataset_4.X[0, 1] = np.nan
iris_dataset_4.X[5, 2] = np.nan
iris_dataset_4.X[10, 3] = np.nan
iris_dataset_4.X[15, 0] = np.nan
iris_dataset_4.X[20, 1] = np.nan
iris_dataset_4.y[25] = np.nan

In [29]:
dataset_new_4= iris_dataset_4.remove_by_index(6)

print("Shape of X after remove_by_index:",dataset_new_4.X.shape)
print("Shape of y after remove_by_index:",dataset_new_4.y.shape)
print("Features after remove_by_index:\n",dataset_new_4.X)


Shape of X after remove_by_index: (139, 4)
Shape of y after remove_by_index: (139,)
Features after remove_by_index:
 [[4.6 3.4 1.4 0.3]
 [4.4 2.9 1.4 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 nan 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.9 1.5]
 [5.5 2.3 4.  1.3]
 [6.5 2.8 4.6 1.5]
 [5.7 2.8 4.5 1.3]
 [6.3 3.3

Exercise 3: Testing the SelectPercentile class 

In [6]:

from si.feature_selection.select_percentile import SelectPercentile

In [7]:
selector = SelectPercentile(percentile=50) 
iris_reduced = selector.fit_transform(iris_dataset)

print("After SelectPercentile:", iris_reduced.X.shape)
print("Selected features:", iris_reduced.features)


After SelectPercentile: (150, 2)
Selected features: ['petal_length', 'petal_width']


Exercise 5: PCA

In [11]:

from si.decomposition.pca import PCA   # make sure your file is correctly placed


In [12]:
pca = PCA(n_components=2)
iris_pca = pca.fit_transform(iris_dataset)

print("\nReduced shape:", iris_pca.X.shape)
print("New features:", iris_pca.features)
print("\nExplained Variance:", pca.explained_variance)
print("Total explained:", np.sum(pca.explained_variance))



Reduced shape: (150, 2)
New features: ['PC1', 'PC2']

Explained Variance: [0.92461621 0.05301557]
Total explained: 0.9776317750248036


Exercise 6: Stratified_split


In [13]:
from si.model_selection.split import stratified_train_test_split


In [14]:
# Stratified split test
train, test = stratified_train_test_split(iris_dataset, test_size=0.3, random_state=42)

print("Train shape:", train.X.shape)
print("Test shape:", test.X.shape)

print("\nClass distribution in TRAIN:", dict(zip(*np.unique(train.y, return_counts=True))))
print("\nClass distribution in TEST:", dict(zip(*np.unique(test.y, return_counts=True))))

Train shape: (105, 4)
Test shape: (45, 4)

Class distribution in TRAIN: {'Iris-setosa': np.int64(35), 'Iris-versicolor': np.int64(35), 'Iris-virginica': np.int64(35)}

Class distribution in TEST: {'Iris-setosa': np.int64(15), 'Iris-versicolor': np.int64(15), 'Iris-virginica': np.int64(15)}


Exercise 9: Random Forest


In [15]:
from si.model_selection.split import train_test_split
from si.models.random_forest_classifier import RandomForestClassifier


In [16]:
# Split
train, test = train_test_split(iris_dataset, test_size=0.3, random_state=42)


In [17]:
# Train model
rf = RandomForestClassifier(n_estimators=20, max_depth=5)
rf.fit(train)

<si.models.random_forest_classifier.RandomForestClassifier at 0x23ab8d07c50>

In [18]:
# Evaluate
preds = rf.predict(test)
score = rf.score(test)

print("ðŸ”¹ RandomForestClassifier Accuracy on Iris Test Set:", score)
print("ðŸ”¹ First 10 Predictions:", preds[:10])
print("ðŸ”¹ True Labels:", test.y[:10])

ðŸ”¹ RandomForestClassifier Accuracy on Iris Test Set: 1.0
ðŸ”¹ First 10 Predictions: ['Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor']
ðŸ”¹ True Labels: ['Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor']
