# Lab 01 - NumPy Introduction
We start by downloading these datasets:
- Iris: 150 flowers ×5 columns (4 numeric features + species label).  
- MNIST: A 10k-row subset (test split) with the first column as label (0–9) and 784 pixel columns (28×28).

In [1]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" -O iris.csv

zsh:1: command not found: wget


In [2]:
!wget "https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/mnist_test.csv" -O mnist_test.csv

zsh:1: command not found: wget


In [3]:
import numpy as np

# Warm-up with Numpy
## Array creation and attributes

In [4]:
# array manual definition and key attributes
A0 = np.array([[2,3,7],[9,4,5]]) 
A1 = np.array([[4,9],[5,6],[1,7]])
print(A0.ndim, A0.shape, A1.size, A1.dtype)

2 (2, 3) 6 int64


In [5]:
# array definition using built-in functions
B0= np.zeros((2,3))
B1= np.ones((4,2),dtype=np.float32)
B2= np.full((2,3),3)

In [6]:
# array definition using sequences
C0= np.linspace(3,10,4)
C1= np.arange(2,8,2)

## Universal functions, aggregations and sorting

In [7]:
# basic operations + ufunc
print(A0+B2, B2**B0, np.sqrt(C0),np.exp(A1),np.log10(C1), sep='\n')


[[ 5  6 10]
 [12  7  8]]
[[1. 1. 1.]
 [1. 1. 1.]]
[1.73205081 2.30940108 2.76887462 3.16227766]
[[5.45981500e+01 8.10308393e+03]
 [1.48413159e+02 4.03428793e+02]
 [2.71828183e+00 1.09663316e+03]]
[0.30103    0.60205999 0.77815125]


In [8]:
# aggregations and axis behaviors
print(A0.mean(), A0.mean(axis=1), C0.std(), C1.sum(), A1.sum(axis=1))

5.0 [4. 6.] 2.6087459737497545 12 [13 11  8]


In [9]:
# sorting and ranking
print(np.sort(A0), np.sort(A0,axis=-2), np.argsort(A0),sep='\n')

[[2 3 7]
 [4 5 9]]
[[2 3 5]
 [9 4 7]]
[[0 1 2]
 [1 2 0]]


## Broadcasting

In [10]:
# column-wise normalization
print((A0-A0.min())/(A0.max()-A0.min()))

[[0.         0.14285714 0.71428571]
 [1.         0.28571429 0.42857143]]


## Indexing

In [11]:
# slicing submatrices
D = np.arange(1, 13).reshape(3, 4)
D_block=D[:2,:2] #creates a view, not a copy
D_columns=D[:,-2:]
print(D_block, D_block.shape, D_columns, D_columns.shape)

[[1 2]
 [5 6]] (2, 2) [[ 3  4]
 [ 7  8]
 [11 12]] (3, 2)


In [12]:
# masked arrays
D[D>6]=-1
D

array([[ 1,  2,  3,  4],
       [ 5,  6, -1, -1],
       [-1, -1, -1, -1]])

In [13]:
# views vs copies
D_new=D[:2,:].copy()
D_new[1,3]=8
D_new

array([[ 1,  2,  3,  4],
       [ 5,  6, -1,  8]])

In [14]:
# fancy indexing
D_fancy=D[::2,[1,3]]
print(D_fancy, D_fancy.shape)

[[ 2  4]
 [-1 -1]] (2, 2)


# Iris Dataset with NumPy
## Feature extraction and statistical analysis

In [15]:
# Dataset settings
X=np.genfromtxt('iris.csv', delimiter=',', usecols=(0,1,2,3), dtype=float)
y=np.genfromtxt('iris.csv', delimiter=',', usecols=4, dtype=str)

In [16]:
# verify shapes and dtypes
print(X.shape, X.dtype, y.shape, y.dtype)

(150, 4) float64 (150,) <U15


In [17]:
# global statistics
features=['Sepal length', 'Sepal width', 'Petal length', 'Petal width']
for i in range (0,4):
    print(f'Tot Mean {features[i]}', np.mean(X[:,i]),f'Tot Std Dev {features[i]}', np.std(X[:,i]))
    

Tot Mean Sepal length 5.843333333333334 Tot Std Dev Sepal length 0.8253012917851409
Tot Mean Sepal width 3.0540000000000003 Tot Std Dev Sepal width 0.4321465800705435
Tot Mean Petal length 3.758666666666666 Tot Std Dev Petal length 1.7585291834055212
Tot Mean Petal width 1.1986666666666668 Tot Std Dev Petal width 0.7606126185881716


In [18]:
# statistics analysis by species
species = np.unique(y)
means_mat=np.zeros((len(species),X.shape[1]))
for i,sp in enumerate(species):
    mask= (y==sp) 
    class_means= np.mean(X[mask],axis=0)
    class_stds= np.std(X[mask],axis=0)
    print(f'{sp} Means:', class_means, f'{sp} Std Devs:', class_stds)
    means_mat[i,:]=class_means

Iris-setosa Means: [5.006 3.418 1.464 0.244] Iris-setosa Std Devs: [0.34894699 0.37719491 0.17176728 0.10613199]
Iris-versicolor Means: [5.936 2.77  4.26  1.326] Iris-versicolor Std Devs: [0.51098337 0.31064449 0.46518813 0.19576517]
Iris-virginica Means: [6.588 2.974 5.552 2.026] Iris-virginica Std Devs: [0.62948868 0.31925538 0.54634787 0.27188968]


## Feature standardization

In [19]:
Z=((X-np.mean(X,axis=0))/np.std(X,axis=0))
Z.shape

(150, 4)

## Naïve single-feature classifier

In [20]:
# find most discriminative feature
diff_01 = np.abs(means_mat[0] - means_mat[1])  # setosa - versicolor
diff_02 = np.abs(means_mat[0] - means_mat[2])  # setosa - virginica
diff_12 = np.abs(means_mat[1] - means_mat[2])  # versicolor - virginica

all_diffs = np.vstack([diff_01, diff_02, diff_12])
min_diffs = np.min(all_diffs, axis=0)
best_feature = np.argmax(min_diffs)
print(f"Best separating feature index: {features[best_feature]} (index {best_feature})")

Best separating feature index: Petal length (index 2)


In [21]:
# define decision threshold
selected_sorted=np.sort(means_mat[:,best_feature])
thresholds = (selected_sorted[1:] + selected_sorted[:-1]) / 2

In [22]:
# Implement the classification rule
x_feat = X[:, best_feature]

def predict_classes(values, thresholds, species):
    preds = np.zeros(values.shape, dtype=object)
    preds[values < thresholds[0]] = species[0]
    preds[(values >= thresholds[0]) & (values < thresholds[1])] = species[1]
    preds[values >= thresholds[1]] = species[2]
    return preds

y_pred = predict_classes(x_feat, thresholds, species)
accuracy = np.mean(y_pred == y)
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.947


# MNIST Dataset with NumPy
## Loading and prepare the data

In [23]:
X_1=np.genfromtxt('mnist_test.csv', delimiter=',', usecols=range(1,785), dtype=np.uint8)
y_1=np.genfromtxt('mnist_test.csv', delimiter=',', usecols=0, dtype=np.uint8)
print(X_1.shape,y_1.shape)

(10000, 784) (10000,)


## Visual inspection of samples

In [24]:
# reshaping some images
S_1=X_1[10].reshape(28,28)
S_2=X_1[888].reshape(28,28)
S_3=X_1[7654].reshape(28,28)

def pixel_to_char_grid(image_28x28):
    char_grid = np.empty((28,28), dtype=str)

    char_grid[(image_28x28 >= 0) & (image_28x28 < 64)] = " "
    char_grid[(image_28x28 >= 64) & (image_28x28 < 128)] = "."
    char_grid[(image_28x28 >= 128) & (image_28x28 < 192)] = "*"
    char_grid[(image_28x28 >= 192) & (image_28x28 < 256)] = "#"
    
    for row in char_grid:
        print("".join(row))
pixel_to_char_grid(S_2)


                            
                            
                            
                            
                            
          .*****..          
        .####...*##.        
      *###*      .##.       
     *##.         ##.       
      .           ##*       
                  ###       
                 .###       
                 ###.       
                *###        
                ###.        
              .*##*         
           *#######*.       
        *#########*###.     
      .#########.   .##*    
     .########*       .##   
     ########*         **   
     *#####*.               
       ...                  
                            
                            
                            
                            
                            


## Pixel frequency comparison by class

In [25]:
# build class masks
mask2 = (y_1==2)
mask7 = (y_1==7)
X_digit2=X_1[mask2]
X_digit7=X_1[mask7]

# count pixel activations
threshold = 128

active_count_2 = (X_digit2 > threshold).sum(axis=0)
active_count_7 = (X_digit7 > threshold).sum(axis=0)

# Comparing pixel distributions
diff = np.abs(active_count_2 - active_count_7)

max_diff_pos = np.argmax(diff)
max_diff_value = diff[max_diff_pos]

print("Pixel con differenza massima:", max_diff_pos)
print("Valore differenza massima:", max_diff_value)

Pixel con differenza massima: 154
Valore differenza massima: 701


## Pairwise distance analysis

In [26]:
# selecting samples
sample0=X_1[3,:]
sample1=X_1[5,:]
sample2=X_1[2,:]
sample3=X_1[0,:]

V = np.vstack([sample0, sample1, sample2, sample3])

In [27]:
# computing distances
n = V.shape[0]
norms = np.sum(V**2, axis=1)
dot_products = V @ V.T
D = norms[:, None] + norms[None, :] - 2 * dot_products

In [28]:
# interpreting distance matrix
print("Euclidean distances matrix:")
print(D)

Dn = D.shape[0]
min_dist = None
min_pair = (None, None)

for i in range(n):
    for j in range(n):
        if i != j: 
            if (min_dist is None) or (D[i, j] < min_dist):
                min_dist = D[i, j]
                min_pair = (i, j)

print(f"Most similar couple: indexes {min_pair} with distance {min_dist}")

Euclidean distances matrix:
[[26112 18077 17795 22502]
 [18077 10240  9744 14621]
 [17795  9744  9472 14217]
 [22502 14621 14217 18944]]
Most similar couple: indexes (1, 2) with distance 9744
