# sklearn dataset API

Three APIs:  

> Loaders(load_\*)  
> Fetchers(fetch_\*)  
> Generators(make_\*)  

# Loading iris dataset

In [1]:
# Called with default arguments, load_iris returns a Bunch container
# rather than a (data, target) tuple.
from sklearn.datasets import load_iris
data = load_iris()

In [2]:
# Inspect the type of the object returned by load_iris().
type(data)

sklearn.utils.Bunch

In [3]:
# Names of the four feature columns.
data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [4]:
# Names of the three target classes.
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
# Preview the first five rows of the feature matrix.
data.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [6]:
# Shape of the feature matrix as (rows, columns).
data.data.shape

(150, 4)

In [7]:
# Label array: one integer entry per sample (the whole array is displayed).
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [8]:
# IPython `?` prefix: display the signature and docstring of load_iris.
?load_iris

Signature: load_iris(*, return_X_y=False, as_frame=False)
Docstring:
Load and return the iris dataset (classification).

The iris dataset is a classic and very easy multi-class classification
dataset.

Classes                          3
Samples per class               50
Samples total                  150
Dimensionality                   4
Features            real, positive

Read more in the :ref:`User Guide <iris_dataset>`.

Parameters
----------
return_X_y : bool, default=False
    If True, returns ``(data, target)`` instead of a Bunch object. See
    below for more information about the `data` and `target` object.

    .. versionadded:: 0.18

as_frame : bool, default=False
    If True, the data is a pandas DataFrame including columns with
    appropriate dtypes (numeric). The target is
    a pandas DataFrame or Ser

In [9]:
# return_X_y=True yields the (data, target) pair directly instead of a Bunch.
feature_matrix, label_vector = load_iris(return_X_y=True)
# Emit both shape lines with a single print call, one line per argument.
print(
    f'Shape of feature matrix: {feature_matrix.shape}',
    f'Shape of label vector: {label_vector.shape}',
    sep='\n',
)

Shape of feature matrix: (150, 4)
Shape of label vector: (150,)


# Loading diabetes dataset

In [10]:
# Rebinds `data` (previously the iris Bunch) to the diabetes dataset.
from sklearn.datasets import load_diabetes
data = load_diabetes()

In [11]:
# IPython `?` prefix: display the signature and docstring of load_diabetes.
?load_diabetes

Signature: load_diabetes(*, return_X_y=False, as_frame=False)
Docstring:
Load and return the diabetes dataset (regression).

Samples total    442
Dimensionality   10
Features         real, -.2 < x < .2
Targets          integer 25 - 346

.. note::
   The meaning of each feature (i.e. `feature_names`) might be unclear
   (especially for `ltg`) as the documentation of the original dataset is
   not explicit. We provide information that seems correct in regard with
   the scientific literature in this field of research.

Read more in the :ref:`User Guide <diabetes_dataset>`.

Parameters
----------
return_X_y : bool, default=False.
    If True, returns ``(data, target)`` instead of a Bunch object.
    See below for more information about the `data` and `target` object.

    .. versionadded:: 0.18

as_frame : bool, default=

In [12]:
# Inspect the type of the diabetes dataset container.
type(data)

sklearn.utils.Bunch

In [13]:
# DESCR holds the dataset's long-form description. Evaluated bare, the cell
# shows the escaped string repr (literal "\n" sequences).
# NOTE(review): print(data.DESCR) would render the line breaks - confirm which display is wanted.
data.DESCR

'.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n  :Number of Instances: 442\n\n  :Number of Attributes: First 10 columns are numeric predictive values\n\n  :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n  :Attribute Information:\n      - age     age in years\n      - sex\n      - bmi     body mass index\n      - bp      average blood pressure\n      - s1      tc, total serum cholesterol\n      - s2      ldl, low-density lipoproteins\n      - s3      hdl, high-density lipoproteins\n      - s4      tch, total cholesterol / HDL\n      - s5      ltg, possibly log of serum triglycerides level\n      - s6      glu, blood sugar

In [14]:
# Names of the ten feature columns.
data.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [15]:
# Unpack the (data, target) pair directly rather than going through a Bunch.
features, labels = load_diabetes(return_X_y=True)
# Both report lines are emitted from one f-string with an embedded newline.
print(
    f'shape of feature matrix: {features.shape}\n'
    f'shape of label vector: {labels.shape}'
)

shape of feature matrix: (442, 10)
shape of label vector: (442,)


In [16]:
# Show the first ten feature rows and labels; each argument is printed on its
# own line, and the leading "\n" keeps a blank line before the labels header.
print('Features:', features[:10], '\nLabels:', labels[:10], sep='\n')

Features:
[[ 0.03807591  0.05068012  0.06169621  0.02187235 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990842 -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 -0.02632783 -0.00844872 -0.01916334
   0.07441156 -0.03949338 -0.06832974 -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 -0.00567061 -0.04559945 -0.03419447
  -0.03235593 -0.00259226  0.00286377 -0.02593034]
 [-0.08906294 -0.04464164 -0.01159501 -0.03665645  0.01219057  0.02499059
  -0.03603757  0.03430886  0.02269202 -0.00936191]
 [ 0.00538306 -0.04464164 -0.03638469  0.02187235  0.00393485  0.01559614
   0.00814208 -0.00259226 -0.03199144 -0.04664087]
 [-0.09269548 -0.04464164 -0.04069594 -0.01944209 -0.06899065 -0.07928784
   0.04127682 -0.0763945  -0.04118039 -0.09634616]
 [-0.04547248  0.05068012 -0.04716281 -0.01599922 -0.04009564 -0.02480001
   0.00077881 -0.03949338 -0.06291295 -0.03835666]
 [ 0.06350368  0.05068012 -0.00189471  0.06662967  0.09061988  0.10891438
   0.02286863  0.01770335 -0.03581673  0.

# Loading digits dataset

In [17]:
# Rebinds `data` (previously the diabetes Bunch) to the digits dataset.
from sklearn.datasets import load_digits
data = load_digits()

In [18]:
# IPython `?` prefix: display the signature and docstring of load_digits.
?load_digits

Signature: load_digits(*, n_class=10, return_X_y=False, as_frame=False)
Docstring:
Load and return the digits dataset (classification).

Each datapoint is a 8x8 image of a digit.

Classes                         10
Samples per class             ~180
Samples total                 1797
Dimensionality                  64
Features             integers 0-16

Read more in the :ref:`User Guide <digits_dataset>`.

Parameters
----------
n_class : int, default=10
    The number of classes to return. Between 0 and 10.

return_X_y : bool, default=False
    If True, returns ``(data, target)`` instead of a Bunch object.
    See below for more information about the `data` and `target` object.

    .. versionadded:: 0.18

as_frame : bool, default=False
    If True, the data is a pandas DataFra

In [19]:
# Inspect the type of the digits dataset container.
type(data)

sklearn.utils.Bunch

In [20]:
# Long-form dataset description. Evaluated bare, the cell shows the escaped
# string repr (literal "\n" sequences).
# NOTE(review): print(data.DESCR) would render the line breaks - confirm which display is wanted.
data.DESCR

".. _digits_dataset:\n\nOptical recognition of handwritten digits dataset\n--------------------------------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 1797\n    :Number of Attributes: 64\n    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n    :Missing Attribute Values: None\n    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n    :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttps://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixel

In [21]:
# One name per pixel of the 8x8 image, in pixel_<row>_<col> form (64 names in total).
data.feature_names

['pixel_0_0',
 'pixel_0_1',
 'pixel_0_2',
 'pixel_0_3',
 'pixel_0_4',
 'pixel_0_5',
 'pixel_0_6',
 'pixel_0_7',
 'pixel_1_0',
 'pixel_1_1',
 'pixel_1_2',
 'pixel_1_3',
 'pixel_1_4',
 'pixel_1_5',
 'pixel_1_6',
 'pixel_1_7',
 'pixel_2_0',
 'pixel_2_1',
 'pixel_2_2',
 'pixel_2_3',
 'pixel_2_4',
 'pixel_2_5',
 'pixel_2_6',
 'pixel_2_7',
 'pixel_3_0',
 'pixel_3_1',
 'pixel_3_2',
 'pixel_3_3',
 'pixel_3_4',
 'pixel_3_5',
 'pixel_3_6',
 'pixel_3_7',
 'pixel_4_0',
 'pixel_4_1',
 'pixel_4_2',
 'pixel_4_3',
 'pixel_4_4',
 'pixel_4_5',
 'pixel_4_6',
 'pixel_4_7',
 'pixel_5_0',
 'pixel_5_1',
 'pixel_5_2',
 'pixel_5_3',
 'pixel_5_4',
 'pixel_5_5',
 'pixel_5_6',
 'pixel_5_7',
 'pixel_6_0',
 'pixel_6_1',
 'pixel_6_2',
 'pixel_6_3',
 'pixel_6_4',
 'pixel_6_5',
 'pixel_6_6',
 'pixel_6_7',
 'pixel_7_0',
 'pixel_7_1',
 'pixel_7_2',
 'pixel_7_3',
 'pixel_7_4',
 'pixel_7_5',
 'pixel_7_6',
 'pixel_7_7']

In [22]:
# The ten target classes (digits 0 through 9).
data.target_names

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [23]:
# Rebind features/labels to the digits (data, target) pair.
features, labels = load_digits(return_X_y=True)
# Report each array's shape using str.format templates.
print('shape of feature matrix: {}'.format(features.shape))
print('shape of label vector: {}'.format(labels.shape))

shape of feature matrix: (1797, 64)
shape of label vector: (1797,)


In [24]:
# First ten flattened images, followed by a blank line and their labels.
print('Features:')
print(features[:10], end='\n\n')
print('Labels:')
print(labels[:10])

Features:
[[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3.
  15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.
   0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.
   0.  0.  0.  0.  6. 13. 10.  0.  0.  0.]
 [ 0.  0.  0. 12. 13.  5.  0.  0.  0.  0.  0. 11. 16.  9.  0.  0.  0.  0.
   3. 15. 16.  6.  0.  0.  0.  7. 15. 16. 16.  2.  0.  0.  0.  0.  1. 16.
  16.  3.  0.  0.  0.  0.  1. 16. 16.  6.  0.  0.  0.  0.  1. 16. 16.  6.
   0.  0.  0.  0.  0. 11. 16. 10.  0.  0.]
 [ 0.  0.  0.  4. 15. 12.  0.  0.  0.  0.  3. 16. 15. 14.  0.  0.  0.  0.
   8. 13.  8. 16.  0.  0.  0.  0.  1.  6. 15. 11.  0.  0.  0.  1.  8. 13.
  15.  1.  0.  0.  0.  9. 16. 16.  5.  0.  0.  0.  0.  3. 13. 16. 16. 11.
   5.  0.  0.  0.  0.  3. 11. 16.  9.  0.]
 [ 0.  0.  7. 15. 13.  1.  0.  0.  0.  8. 13.  6. 15.  4.  0.  0.  0.  2.
   1. 13. 13.  0.  0.  0.  0.  0.  2. 15. 11.  1.  0.  0.  0.  0.  0.  1.
  12. 12.  1.  0.  0.  0.  0.  0.  1. 10.  8.  

# Other datasets  
> load_wine  
> load_breast_cancer  
> load_linnerud   

In [25]:
# Rebinds `data` (previously the digits Bunch) to the breast cancer dataset,
# then displays the shape of its feature matrix as (rows, columns).
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
data.data.shape

(569, 30)

In [26]:
# Names of the two target classes.
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [27]:
# IPython `?` prefix: display the signature and docstring of load_breast_cancer.
?load_breast_cancer

Signature: load_breast_cancer(*, return_X_y=False, as_frame=False)
Docstring:
Load and return the breast cancer wisconsin dataset (classification).

The breast cancer dataset is a classic and very easy binary classification
dataset.

Classes                          2
Samples per class    212(M),357(B)
Samples total                  569
Dimensionality                  30
Features            real, positive

Read more in the :ref:`User Guide <breast_cancer_dataset>`.

Parameters
----------
return_X_y : bool, default=False
    If True, returns ``(data, target)`` instead of a Bunch object.
    See below for more information about the `data` and `target` object.

    .. versionadded:: 0.18

as_frame : bool, default=False
    If True, the data is a pandas DataFrame including columns with
    appropriate dtypes (numeric). Th

# Fetchers

## fetch_california_housing

In [28]:
# Import the fetcher, then display its signature and docstring with
# IPython's `?` prefix.
from sklearn.datasets import fetch_california_housing
?fetch_california_housing

Signature:
fetch_california_housing(
    *,
    data_home=None,
    download_if_missing=True,
    return_X_y=False,
    as_frame=False,
)
Docstring:
Load the California housing dataset (regression).

Samples total             20640
Dimensionality                8
Features                   real
Target           real 0.15 - 5.

Read more in the :ref:`User Guide <california_housing_dataset>`.

Parameters
----------
data_home : str, default=None
    Specify another download and cache folder for the datasets. By default
    all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

download_if_missing : bool, default=True
    If False, raise a IOError if 

In [29]:
# Keep the housing dataset in its own variable (`housing_data`) and display
# its long-form description. Evaluated bare, DESCR shows the escaped string repr.
housing_data = fetch_california_housing()
housing_data.DESCR

'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 20640\n\n    :Number of Attributes: 8 numeric, predictive attributes and the target\n\n    :Attribute Information:\n        - MedInc        median income in block\n        - HouseAge      median house age in block\n        - AveRooms      average number of rooms\n        - AveBedrms     average number of bedrooms\n        - Population    block population\n        - AveOccup      average house occupancy\n        - Latitude      house block latitude\n        - Longitude     house block longitude\n\n    :Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttp://lib.stat.cmu.edu/datasets/\n\nThe target variable is the median house value for California districts.\n\nThis dataset was derived from the 1990 U.S. census, using one row per census\nblock group. A block group is the smallest geographical unit

In [30]:
# Shape of the feature matrix as (rows, columns).
housing_data.data.shape

(20640, 8)

In [31]:
# Preview the first five rows of the feature matrix.
housing_data.data[:5]

array([[ 8.32520000e+00,  4.10000000e+01,  6.98412698e+00,
         1.02380952e+00,  3.22000000e+02,  2.55555556e+00,
         3.78800000e+01, -1.22230000e+02],
       [ 8.30140000e+00,  2.10000000e+01,  6.23813708e+00,
         9.71880492e-01,  2.40100000e+03,  2.10984183e+00,
         3.78600000e+01, -1.22220000e+02],
       [ 7.25740000e+00,  5.20000000e+01,  8.28813559e+00,
         1.07344633e+00,  4.96000000e+02,  2.80225989e+00,
         3.78500000e+01, -1.22240000e+02],
       [ 5.64310000e+00,  5.20000000e+01,  5.81735160e+00,
         1.07305936e+00,  5.58000000e+02,  2.54794521e+00,
         3.78500000e+01, -1.22250000e+02],
       [ 3.84620000e+00,  5.20000000e+01,  6.28185328e+00,
         1.08108108e+00,  5.65000000e+02,  2.18146718e+00,
         3.78500000e+01, -1.22250000e+02]])

In [32]:
# Shape of the target array (one value per row of the feature matrix).
housing_data.target.shape

(20640,)

In [33]:
# Preview the first five target values.
housing_data.target[:5]

array([4.526, 3.585, 3.521, 3.413, 3.422])

In [34]:
# Names of the eight feature columns.
housing_data.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [35]:
# Name of the single target column.
housing_data.target_names

['MedHouseVal']

## fetch_openml

In [36]:
# Import the fetcher, then display its signature and docstring with
# IPython's `?` prefix.
from sklearn.datasets import fetch_openml
?fetch_openml

Signature:
fetch_openml(
    name: Optional[str] = None,
    *,
    version: Union[str, int] = 'active',
    data_id: Optional[int] = None,
    data_home: Optional[str] = None,
    target_column: Union[str, List, NoneType] = 'default-target',
    cache: bool = True,
    return_X_y

In [37]:
# Request the dataset named 'mnist_784' with an explicit version (instead of
# the default 'active') and unpack it as a (features, labels) pair.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
# Build both report lines first, then print them joined by a newline.
print('\n'.join([
    f'Feature matrix shape: {X.shape}',
    f'Label shape: {y.shape}',
]))

Feature matrix shape: (70000, 784)
Label shape: (70000,)


## Other datasets to fetch  
> fetch_20newsgroups  
> fetch_kddcup99  

# Generators

## make_regression

In [38]:
# Import the generator, then display its signature and docstring with
# IPython's `?` prefix.
from sklearn.datasets import make_regression
?make_regression

Signature:
make_regression(
    n_samples=100,
    n_features=100,
    *,
    n_informative=10,
    n_targets=1,
    bias=0.0,
    effective_rank=None,
    tail_strength=0.5,
    noise=0.0,
    shuffle=True,
    coef=False,
    random_state=None,
)
Docstring:
Generate a random regression problem.

The inp

## Example 1  
Generate 100 samples with 5 features for a single label regression problem.

In [39]:
# Single-target regression toy data; random_state is fixed at 42.
X, y = make_regression(
    n_samples=100,
    n_features=5,
    n_targets=1,
    shuffle=True,
    random_state=42,
)

In [40]:
# Feature matrix shape: n_samples rows by n_features columns.
X.shape

(100, 5)

In [41]:
# With n_targets=1 the target array is one-dimensional.
y.shape

(100,)

## Example 2  
Generate 100 samples with 5 features for a multi-output regression problem with 5 outputs.

In [42]:
# Same set-up as the single-target example, but with five targets per sample.
X, y = make_regression(
    n_samples=100,
    n_features=5,
    n_targets=5,
    shuffle=True,
    random_state=42,
)

In [43]:
# Feature matrix shape: n_samples rows by n_features columns.
X.shape

(100, 5)

In [44]:
# With n_targets=5 the target array has one column per target.
y.shape

(100, 5)

## make_classification

Generate a random n-class classification problem.

In [45]:
# Import the generator, then display its signature and docstring with
# IPython's `?` prefix.
from sklearn.datasets import make_classification
?make_classification

Signature:
make_classification(
    n_samples=100,
    n_features=20,
    *,
    n_informative=2,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=

## Example 1  
Generate a binary classification problem with 10 features and 100 samples.

In [46]:
# Two-class toy problem with one cluster per class; random_state is fixed at 42.
X, y = make_classification(
    n_samples=100,
    n_features=10,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=42,
)

In [47]:
# Feature matrix shape: n_samples rows by n_features columns.
X.shape

(100, 10)

In [48]:
# One class label per sample.
y.shape

(100,)

In [49]:
# Preview the first five feature rows.
X[:5]

array([[ 0.11422765, -1.71016839, -0.06822216, -0.14928517,  0.30780177,
         0.15030176, -0.05694562, -0.22595246, -0.36361221, -0.13818757],
       [ 0.70775194, -1.57022472, -0.23503183, -0.63604713,  0.62180996,
        -0.56246678,  0.97255445, -0.77719676,  0.63240774, -0.47809669],
       [ 0.63859246,  0.04739867,  0.33273433,  1.1046981 , -0.65183611,
        -1.66152006, -1.2110162 ,  1.09821151, -0.0660798 ,  0.68024225],
       [-0.23894805, -0.97755524,  0.0379061 ,  0.19896733,  0.50091719,
        -0.90756366,  0.75539123,  0.12437227, -0.57677133,  0.07871283],
       [-0.59239392, -0.05023811,  0.17573204, -1.43949185,  0.27045683,
        -0.86399077, -0.83095012,  0.60046915,  0.04852163,  0.32557953]])

In [50]:
# Preview the first five class labels.
y[:5]

array([1, 1, 1, 1, 0])

## Example 2  
Generate a three-class classification problem with 10 features and 100 samples.

In [51]:
# Same set-up as the binary example, but with three classes.
X, y = make_classification(
    n_samples=100,
    n_features=10,
    n_classes=3,
    n_clusters_per_class=1,
    random_state=42,
)

In [52]:
# Feature matrix shape: n_samples rows by n_features columns.
X.shape

(100, 10)

In [53]:
# One class label per sample.
y.shape

(100,)

In [54]:
# Preview the first five feature rows.
X[:5]

array([[-0.58351628, -1.73833907, -1.37298251, -1.77311485,  0.45918008,
         0.83392215, -1.66096093,  0.20768769, -0.07016571,  0.42961822],
       [-1.0044394 , -1.43862044,  0.47335819, -0.21188291,  0.0125924 ,
         0.22409248, -0.77300978,  0.49799829,  0.0976761 ,  0.02451017],
       [ 0.07740833,  0.19896733,  0.12437227,  0.17738132, -0.97755524,
         0.50091719,  0.75138712,  0.54336019,  0.09933231, -1.66940528],
       [-0.91759569, -0.9609536 ,  1.07746664,  0.4522739 , -0.32138584,
        -0.8254972 , -0.56372455,  0.24368721,  0.41293145, -0.8222204 ],
       [-0.96222828, -0.96090774,  1.21530116,  0.55980482, -1.24778318,
        -0.25256815, -1.43014138,  0.13074058,  1.6324113 , -0.44004449]])

In [55]:
# Preview the first five class labels.
y[:5]

array([2, 0, 1, 0, 0])

## make_multilabel_classification

This function generates a random multi-label classification problem.

In [56]:
# Import the generator, then display its signature and docstring with
# IPython's `?` prefix.
from sklearn.datasets import make_multilabel_classification
?make_multilabel_classification

Signature:
make_multilabel_classification(
    n_samples=100,
    n_features=20,
    *,
    n_classes=5,
    n_labels=2,
    length=50,
    allow_unlabeled=True,
    sparse=False,
    return_indicator='dense',
    return_distributions=False,
    random_state=None,
)
Docstring:
Generate a random multilabel classification problem.

For each sample, the gener

## Example 1  
Generate a multilabel classification problem with 100 samples, 20 features, 5 possible labels (classes), and on average 2 labels per example.

In [57]:
# Seed the generator so X and y are the same on every run. Without
# random_state (default None) each execution draws a different sample, so the
# displayed rows could not be reproduced. The seed matches the other make_*
# examples in this notebook.
X, y = make_multilabel_classification(
    n_samples=100,
    n_features=20,
    n_classes=5,
    n_labels=2,
    random_state=42,
)

In [58]:
# Feature matrix shape: n_samples rows by n_features columns.
X.shape

(100, 20)

In [59]:
# Label matrix shape: one row per sample, one column per class (n_classes=5).
y.shape

(100, 5)

In [60]:
# Preview the first five feature rows.
X[:5]

array([[4., 6., 1., 1., 4., 3., 0., 2., 0., 4., 8., 7., 2., 2., 1., 2.,
        3., 4., 4., 0.],
       [0., 5., 0., 2., 2., 5., 1., 0., 1., 3., 1., 7., 1., 1., 0., 3.,
        2., 2., 4., 3.],
       [3., 4., 1., 3., 2., 8., 0., 1., 0., 1., 5., 7., 3., 7., 0., 1.,
        2., 3., 6., 2.],
       [2., 2., 4., 1., 2., 4., 1., 1., 2., 0., 5., 9., 2., 4., 0., 7.,
        6., 2., 5., 1.],
       [0., 2., 2., 5., 4., 2., 0., 1., 0., 2., 3., 6., 1., 2., 1., 0.,
        5., 0., 5., 3.]])

In [61]:
# Preview the first five label rows; each row is a 0/1 vector over the five classes.
y[:5]

array([[0, 1, 0, 0, 0],
       [0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 1]])

## make_blobs  

make_blobs enables us to generate random data for clustering.


In [62]:
# Import the generator, then display its signature and docstring with
# IPython's `?` prefix.
from sklearn.datasets import make_blobs
?make_blobs

Signature:
make_blobs(
    n_samples=100,
    n_features=2,
    *,
    centers=None,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=None,
    return_centers=False,
)
Docstring:
Generate isotropic Gaussian blobs for clustering.

Read more in the :ref:`User Guide <sample_generators>`.

Parameters
----------
n_samples : int or array-like, default=100
    If int, it is 

Generate a random dataset of 10 samples with 2 features each, grouped around 3 cluster centers.

In [63]:
# Ten 2-D points spread over three centers; random_state is fixed at 42.
X, y = make_blobs(n_samples=10, n_features=2, centers=3, random_state=42)
# Emit both shape lines with a single print call, one line per argument.
print(
    f'Feature matrix shape: {X.shape}',
    f'Label shape: {y.shape}',
    sep='\n',
)

Feature matrix shape: (10, 2)
Label shape: (10,)


We can find the cluster membership of each point in y.

In [64]:
# Cluster label of each of the ten generated points.
y

array([2, 2, 1, 2, 0, 0, 0, 1, 1, 0])