In [1]:
from sklearn.datasets import make_blobs
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

In [2]:
#Génération du dataset X constitué de 200 blobs gaussiens dans R^2
X, _ = make_blobs(n_samples=200, n_features=2, centers = 1, center_box = (-4, 4), random_state=42)

In [3]:
#Séparation du dataset x en 2 lots:  : un lot d'entrainement contenant 80% des points ainsi qu'un lot de test.
##la fonction train_test_split permet de séparer les données en 2 lots
#le paramètre test_size=0.2 indique que 20% des données doivent être utilisées pour le lot de test

X_train, X_test, _, _ = train_test_split(X,_, test_size=0.20)
print(f"Taille du lot d'entraînement: {len(X_train)}")
print(f"Taille du lot de test: {len(X_test)}")

Taille du lot d'entraînement: 160
Taille du lot de test: 40


In [4]:
#on veut obtenir v3(x)
# le 1er voisin = le point lui même (d=0)
# donc k= 1 + 3
nn = NearestNeighbors(n_neighbors=4)
#on veut construire une matrice D et la liste de voisinage V3(x)
#pour chaque point x du DATASET X    donc on fit sur X
nn.fit(X)

In [5]:
distances, indices = nn.kneighbors(X)
D = distances[:, :]  # distances aux 3 vrais voisins   (:,1: car on ignore le premier voisin, le point lui-même)
V3 = indices[:, 1:]   # indices des 3 vrais voisins

In [6]:
print("distances: \n", D)

distances: 
 [[0.         0.27774253 0.28814528 0.32723815]
 [0.         0.12187392 0.1387115  0.14535657]
 [0.         0.08160783 0.08828128 0.09167533]
 [0.         0.13312027 0.27267378 0.30772966]
 [0.         0.0425696  0.18881293 0.24766228]
 [0.         0.17341688 0.31304637 0.32226225]
 [0.         0.2307724  0.25166507 0.26644233]
 [0.         0.32094528 0.39914135 0.44234247]
 [0.         0.12447848 0.15818765 0.18874666]
 [0.         0.08370273 0.08496087 0.09244464]
 [0.         0.23135649 0.27572619 0.28193196]
 [0.         0.18140019 0.23837402 0.3374098 ]
 [0.         0.06898134 0.18671292 0.18907424]
 [0.         0.05493652 0.74097416 0.76872007]
 [0.         0.08801226 0.12187392 0.13092654]
 [0.         0.17341688 0.30442334 0.31488286]
 [0.         0.10191649 0.18027349 0.29330046]
 [0.         0.17264937 0.17404446 0.23811236]
 [0.         0.41420176 0.83698449 0.93104098]
 [0.         0.05276167 0.11383809 0.1426546 ]
 [0.         0.0780159  0.18093158 0.21157055]


Maintenant avec train et test

In [7]:
# Fit sur le lot d'entraînement
nn_train = NearestNeighbors(n_neighbors=5)
nn_train.fit(X_train)

#Plus proches voisins pour le lot d'entraînement
dist_train, ind_train = nn_train.kneighbors(X_train)

# 3Plus proches voisins pour le lot de test
# Attention : on calcule les voisins de X_test **par rapport à X_train**
dist_test, ind_test = nn_train.kneighbors(X_test)

In [8]:
print(dist_train)

[[0.         0.18671292 0.18907424 0.23209465 0.2644596 ]
 [0.         0.05493652 0.74097416 0.76872007 0.83508937]
 [0.         0.23805528 0.41766332 0.48890575 0.69703423]
 [0.         0.01348751 0.31304637 0.41355574 0.4148944 ]
 [0.         0.23065734 0.33758061 0.38003703 0.42107545]
 [0.         0.10620367 0.14160239 0.18198898 0.21384074]
 [0.         0.04224931 0.08139312 0.08587839 0.18145952]
 [0.         0.09167533 0.13348017 0.17273634 0.17796292]
 [0.         0.0683383  0.21073914 0.21764367 0.25154019]
 [0.         0.10974775 0.17519531 0.18407706 0.23029136]
 [0.         0.16046143 0.16589079 0.19604371 0.23705235]
 [0.         0.14529157 0.19951676 0.24149303 0.24882517]
 [0.         0.19784259 0.72322232 0.77197259 0.80966414]
 [0.         0.08653292 0.09681847 0.23969182 0.27209483]
 [0.         0.08587839 0.09962909 0.11515747 0.1236607 ]
 [0.         0.15165692 0.24684676 0.33846185 0.44495493]
 [0.         0.1236607  0.1453074  0.2075321  0.209264  ]
 [0.         0

In [9]:
print(dist_test)

[[0.08048521 0.15803973 0.2669283  0.33870415 0.35849526]
 [0.37624225 0.47346691 0.48074669 0.54063703 0.66248466]
 [0.012354   0.08160783 0.13209545 0.16491054 0.16684843]
 [0.13438854 0.23680274 0.32994601 0.36052554 0.40886601]
 [0.24515472 0.33553547 0.34602227 0.37065036 0.38185082]
 [0.05919052 0.11538516 0.12341448 0.12648042 0.14142374]
 [0.2313532  0.24592306 0.6078265  0.61358289 0.63038887]
 [0.26011938 0.2999504  0.52671969 0.58063319 0.60067104]
 [0.39914135 0.60417605 0.61465572 0.67499283 0.72178383]
 [0.0266436  0.12326288 0.12692369 0.33379579 0.33918885]
 [0.19122361 0.24530219 0.65502501 0.66038004 0.67701766]
 [0.23123884 0.29373196 0.34359347 0.34970151 0.39416793]
 [0.18795332 0.19917455 0.31128718 0.36609804 0.3663722 ]
 [0.09580455 0.10905185 0.1426546  0.20325152 0.25966262]
 [0.12187392 0.13092654 0.1640216  0.20367412 0.21325185]
 [0.11943337 0.34150933 0.51845138 0.54008195 0.59678053]
 [0.18027349 0.37467772 0.40177383 0.41096654 0.43784989]
 [0.43858437 0

In [10]:
from sklearn.neighbors import kneighbors_graph

# Construction du graphe 4-NN pour X_train
A = kneighbors_graph(
    X_train,          # dataset sur lequel on construit le graphe
    n_neighbors=4,    # chaque point est relié à ses 4 plus proches voisins
    mode='distance',  # on stocke les distances dans la matrice
    include_self=True # on inclut le point lui-même comme voisin
)

# A est une matrice creuse (sparse matrix)
print(A.shape)
print(A.toarray()[:15,:15])  #les 5 premières lignes et colones de la matrice A
#le principe des matrices creuses (sparse)

(160, 160)
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.     