In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs, make_moons
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cophenet
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import StandardScaler
from sklearn.utils import Bunch
from collections import Counter
import pandas as pd
import scipy

In [11]:
def cargar_mat(path: str) -> tuple:
    """Load the train/test split stored in a MATLAB ``.mat`` file.

    The file is expected to hold three variables: ``x_entena`` (training
    features), ``x_prueba`` (test features) and ``y_entrena`` (training
    labels). The same data is returned in several container flavours so
    callers can pick whichever is most convenient.

    Parameters
    ----------
    path : str
        Location of the ``.mat`` file.

    Returns
    -------
    datasets_np : dict
        ``{"train": {"X": ..., "y": ...}, "test": {"X": ...}}`` holding the
        NumPy arrays (features as float32, labels as a flat int64 vector).
    df_train : pandas.DataFrame
        Training features under pandas' default integer column labels, plus
        a ``"y"`` column with the labels.
    df_test : pandas.DataFrame
        Test features under pandas' default integer column labels.
    train_bunch : Bunch
        ``data``, ``target`` and ``feature_names`` for the training split.
    test_bunch : Bunch
        ``data`` and ``feature_names`` for the test split.

    Raises
    ------
    KeyError
        If any of the three expected variables is absent from the file.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to expose ``scipy.io`` on every SciPy release.
    from scipy.io import loadmat

    m = loadmat(path, squeeze_me=True, struct_as_record=False)

    # NOTE(review): "x_entena" (not "x_entrena") is kept as written because it
    # has to match the variable name stored in the file -- confirm against
    # the .mat contents before "fixing" the spelling.
    required = ("x_entena", "x_prueba", "y_entrena")
    missing = [key for key in required if key not in m]
    if missing:
        # loadmat also returns bookkeeping entries such as "__header__";
        # hide them so the message only lists real variables.
        available = sorted(key for key in m if not key.startswith("__"))
        raise KeyError(
            f"{path!r} is missing variable(s) {missing}; available: {available}"
        )

    X_train = np.asarray(m["x_entena"], dtype=np.float32)
    X_test  = np.asarray(m["x_prueba"],  dtype=np.float32)
    # Flatten so the labels are a 1-D vector regardless of how they were stored.
    y_train = np.asarray(m["y_entrena"]).reshape(-1).astype(np.int64)

    # Plain NumPy view of both splits.
    datasets_np = {"train": {"X": X_train, "y": y_train},
                   "test":  {"X": X_test}}

    # DataFrame view; the label column is appended after the feature columns.
    df_train = pd.DataFrame(X_train)
    df_train["y"] = y_train
    df_test  = pd.DataFrame(X_test)

    # Bunch view; feature names are derived from the training column count
    # and reused for the test split.
    feat_names = [f"x{i}" for i in range(X_train.shape[1])]
    train_bunch = Bunch(data=X_train, target=y_train, feature_names=feat_names)
    test_bunch  = Bunch(data=X_test,  feature_names=feat_names)

    return datasets_np, df_train, df_test, train_bunch, test_bunch


# Load the workshop file once and unpack each of the five containers
# that cargar_mat returns.
(
    datasets_np,
    df_train,
    df_test,
    train_bunch,
    test_bunch,
) = cargar_mat("dato_taller.mat")

In [13]:
# Summarise the training frame: column labels, non-null counts, dtypes and memory footprint.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1000 non-null   float32
 1   1       1000 non-null   float32
 2   2       1000 non-null   float32
 3   3       1000 non-null   float32
 4   4       1000 non-null   float32
 5   5       1000 non-null   float32
 6   6       1000 non-null   float32
 7   7       1000 non-null   float32
 8   8       1000 non-null   float32
 9   9       1000 non-null   float32
 10  10      1000 non-null   float32
 11  11      1000 non-null   float32
 12  12      1000 non-null   float32
 13  13      1000 non-null   float32
 14  14      1000 non-null   float32
 15  15      1000 non-null   float32
 16  16      1000 non-null   float32
 17  17      1000 non-null   float32
 18  18      1000 non-null   float32
 19  19      1000 non-null   float32
 20  y       1000 non-null   int64  
dtypes: float32(20), int64(1)
memory usage: 86.1 KB

In [14]:
# Summarise the test frame: column labels, non-null counts, dtypes and memory footprint.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       10000 non-null  float32
 1   1       10000 non-null  float32
 2   2       10000 non-null  float32
 3   3       10000 non-null  float32
 4   4       10000 non-null  float32
 5   5       10000 non-null  float32
 6   6       10000 non-null  float32
 7   7       10000 non-null  float32
 8   8       10000 non-null  float32
 9   9       10000 non-null  float32
 10  10      10000 non-null  float32
 11  11      10000 non-null  float32
 12  12      10000 non-null  float32
 13  13      10000 non-null  float32
 14  14      10000 non-null  float32
 15  15      10000 non-null  float32
 16  16      10000 non-null  float32
 17  17      10000 non-null  float32
 18  18      10000 non-null  float32
 19  19      10000 non-null  float32
dtypes: float32(20)
memory usage: 781.4 KB
