In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import category_encoders as ce
from pathlib import Path

In [2]:
# Seed NumPy's global random generator so that code drawing from it is repeatable.
np.random.seed(10)

# GETTING THE DATASET
Here I download the dataset (about 3–4 MB; pandas is a bit slow to download it), drop the rows with missing values and the duplicated rows, replace the targets '<=50K' and '>50K' with 0 and 1, and target-encode the categorical features.

In [3]:
# Read the Adult dataset from a local copy when one sits next to the notebook,
# otherwise straight from the UCI repository.
DATASET_PATH = Path("./adult.data")
if DATASET_PATH.exists():
    DATASET_URL = DATASET_PATH
else:
    DATASET_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Column names are supplied explicitly through `names`.
columns = [
    'Age', 'Workclass', 'Fnlwgt', 'Education', 'Education-num',
    'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex',
    'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-country',
    'Target',
]
# A '?' (with or without a surrounding blank) is parsed as a missing value.
df = pd.read_csv(DATASET_URL, names=columns, na_values=['?', ' ?', '? '])

In [4]:
# Trim surrounding whitespace in every string (object-dtype) column.
df_str = df.select_dtypes(['object'])
df[df_str.columns] = df_str.apply(lambda col: col.str.strip())

# Drop rows with missing values first, then exact duplicate rows.
df = df.dropna().drop_duplicates()

# Binary encoding of the income label.
map_targ = {'<=50K': 0, '>50K': 1}

# The last column is the label; every column before it is a feature.
targ = df.iloc[:, -1].map(map_targ)
feat = df.iloc[:, :-1]

# Target-encode the features against the 0/1 label, then re-attach the label
# as the last column.
# NOTE(review): the encoder is fitted on all rows at once — if the data is
# split into train/test afterwards, label information leaks into the test
# features; confirm this is acceptable.
targenc = ce.TargetEncoder(verbose=1, return_df=True)
df = targenc.fit_transform(feat, targ)
df['Target'] = targ

In [5]:
df

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Target
0,39,0.268960,77516,0.421658,13,0.048399,0.133907,0.106689,0.263855,0.313955,2174,0,40,0.254411,0
1,50,0.285829,83311,0.421658,13,0.455011,0.485342,0.455728,0.263855,0.313955,0,0,13,0.254411,0
2,38,0.218918,215646,0.164328,9,0.107312,0.061527,0.106689,0.263855,0.313955,0,0,40,0.254411,0
3,53,0.218918,234721,0.056298,7,0.455011,0.061527,0.455728,0.129972,0.313955,0,0,40,0.254411,0
4,28,0.218918,338409,0.421658,13,0.455011,0.448686,0.493599,0.129972,0.113783,0,0,40,0.271739,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,0.218918,257302,0.253968,12,0.455011,0.305159,0.493599,0.263855,0.113783,0,0,38,0.254411,0
32557,40,0.218918,154374,0.164328,9,0.455011,0.124236,0.455728,0.263855,0.313955,0,0,40,0.254411,1
32558,58,0.218918,151910,0.164328,9,0.096735,0.133907,0.066334,0.263855,0.113783,0,0,40,0.254411,0
32559,22,0.218918,201490,0.164328,9,0.048399,0.133907,0.014343,0.263855,0.313955,0,0,20,0.254411,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30139 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             30139 non-null  int64  
 1   Workclass       30139 non-null  float64
 2   Fnlwgt          30139 non-null  int64  
 3   Education       30139 non-null  float64
 4   Education-num   30139 non-null  int64  
 5   Marital-status  30139 non-null  float64
 6   Occupation      30139 non-null  float64
 7   Relationship    30139 non-null  float64
 8   Race            30139 non-null  float64
 9   Sex             30139 non-null  float64
 10  Capital-gain    30139 non-null  int64  
 11  Capital-loss    30139 non-null  int64  
 12  Hours-per-week  30139 non-null  int64  
 13  Native-country  30139 non-null  float64
 14  Target          30139 non-null  int64  
dtypes: float64(8), int64(7)
memory usage: 4.7 MB


In [7]:
df['Target'].value_counts()

0    22633
1     7506
Name: Target, dtype: int64

In [8]:
# Share (a fraction in [0, 1], not a percentage) of rows labelled 0 and 1.
# value_counts() orders its result by frequency, not by label, so the shares
# are re-ordered by label before unpacking; otherwise the two names would be
# swapped whenever class 1 is the majority. A label that never occurs gets 0.0.
ZEROS_PERC, ONES_PERC = (
    df['Target'].value_counts(normalize=True).reindex([0, 1], fill_value=0.0)
)

In [9]:
df.head(3)

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Target
0,39,0.26896,77516,0.421658,13,0.048399,0.133907,0.106689,0.263855,0.313955,2174,0,40,0.254411,0
1,50,0.285829,83311,0.421658,13,0.455011,0.485342,0.455728,0.263855,0.313955,0,0,13,0.254411,0
2,38,0.218918,215646,0.164328,9,0.107312,0.061527,0.106689,0.263855,0.313955,0,0,40,0.254411,0


# Features preparation and Centroid Computation

Extract the numerical features and scale them

In [10]:
from sklearn.preprocessing import StandardScaler, Normalizer

In [11]:
# Columns that were numeric in the raw data (i.e. not target-encoded).
NUMERICAL_FEATURES = [
    'Age',
    'Fnlwgt',
    'Education-num',
    'Capital-gain',
    'Capital-loss',
    'Hours-per-week',
]

# Work on an independent copy holding only the numeric features and the label.
df_numerical = df.loc[:, NUMERICAL_FEATURES + ['Target']].copy()

# Standardise the numeric features; the label column is left untouched.
scaler = StandardScaler()
df_numerical[NUMERICAL_FEATURES] = scaler.fit_transform(df_numerical[NUMERICAL_FEATURES])

In [12]:
df_numerical

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Target
0,0.042516,-1.062676,1.128996,0.145925,-0.218673,-0.078031,0
1,0.880215,-1.007829,1.128996,-0.147502,-0.218673,-2.332060,0
2,-0.033639,0.244669,-0.440434,-0.147502,-0.218673,-0.078031,0
3,1.108678,0.425206,-1.225149,-0.147502,-0.218673,-0.078031,0
4,-0.795183,1.406572,1.128996,-0.147502,-0.218673,-0.078031,0
...,...,...,...,...,...,...,...
32556,-0.871338,0.638926,0.736639,-0.147502,-0.218673,-0.244996,0
32557,0.118670,-0.335246,-0.440434,-0.147502,-0.218673,-0.078031,1
32558,1.489450,-0.358567,-0.440434,-0.147502,-0.218673,-0.078031,0
32559,-1.252110,0.110688,-0.440434,-0.147502,-0.218673,-1.747682,0


# Clustering with the already found parameters.

In [13]:
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm
from kneed import KneeLocator

# Number of KMeans clusters used for class 0 and for class 1 respectively.
CLUSTERS_0 = 5
CLUSTERS_1 = 4

In [14]:
def run_kmeans(X, range_nclusters):
    '''Fit one KMeans model for every requested number of clusters.

    Args:
        X: samples to cluster; handed unchanged to ``KMeans.fit``.
        range_nclusters: range of values or list containing the specific
            values of k to try.

    Returns:
        A tuple ``(inertia, models)`` of two lists aligned with
        ``range_nclusters``: the ``inertia_`` of each fitted model and the
        fitted estimators themselves.
    '''
    models = []
    for k in range_nclusters:
        print(f"running KMeans with {k=}")
        estimator = KMeans(n_clusters=k, init='k-means++', n_init=20, random_state=123)
        # fit() hands back the fitted estimator, which is what gets stored
        models.append(estimator.fit(X))
    inertia = [fitted.inertia_ for fitted in models]
    return inertia, models

## Class 0
Compute the clusters for class $0$

In [15]:
zeroes = df_numerical[df_numerical['Target'] == 0].copy()

In [16]:
wcss_0, models_0 = run_kmeans(zeroes[NUMERICAL_FEATURES], range_nclusters=[CLUSTERS_0])

running KMeans with k=5


In [17]:
# Only one value of k was requested for class 0, so the list holds a single
# fitted model; the assert guards against picking a model with another k.
kmeans_0 = models_0[0]
assert kmeans_0.n_clusters == CLUSTERS_0

In [18]:
# Keep the fitted centroids and tag every class-0 record with its cluster id.
centroids_0 = kmeans_0.cluster_centers_
clusters_0 = kmeans_0.predict(zeroes[NUMERICAL_FEATURES])
zeroes['Cluster'] = clusters_0
zeroes

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Target,Cluster
0,0.042516,-1.062676,1.128996,0.145925,-0.218673,-0.078031,0,1
1,0.880215,-1.007829,1.128996,-0.147502,-0.218673,-2.332060,0,2
2,-0.033639,0.244669,-0.440434,-0.147502,-0.218673,-0.078031,0,0
3,1.108678,0.425206,-1.225149,-0.147502,-0.218673,-0.078031,0,2
4,-0.795183,1.406572,1.128996,-0.147502,-0.218673,-0.078031,0,3
...,...,...,...,...,...,...,...,...
32553,-0.490566,-0.697134,1.521353,-0.147502,-0.218673,-2.499025,0,0
32555,-1.252110,1.139131,-0.048076,-0.147502,-0.218673,-0.078031,0,3
32556,-0.871338,0.638926,0.736639,-0.147502,-0.218673,-0.244996,0,3
32558,1.489450,-0.358567,-0.440434,-0.147502,-0.218673,-0.078031,0,2


In [19]:
zeroes['Cluster'].value_counts(normalize=True)

0    0.325763
1    0.265851
2    0.223523
3    0.155790
4    0.029073
Name: Cluster, dtype: float64

In [20]:
centroids_0

array([[-0.91519071, -0.31545353, -0.44952021, -0.13175274, -0.21808602,
        -0.61974068],
       [-0.14038439, -0.36038229,  0.57185097, -0.12207358, -0.21555057,
         0.60708917],
       [ 1.21783812, -0.22795048, -0.67368979, -0.12235867, -0.21570559,
        -0.32054865],
       [-0.48620289,  1.64817609, -0.29054581, -0.13075671, -0.21579687,
        -0.11932275],
       [ 0.01207687, -0.07068077, -0.09399062, -0.14750209,  4.25785802,
        -0.03590945]])

## Class 1
Compute the clusters for class $1$

In [21]:
ones = df_numerical[df_numerical['Target'] == 1].copy()

In [22]:
wcss_1, models_1 = run_kmeans(ones[NUMERICAL_FEATURES], range_nclusters=[CLUSTERS_1])

running KMeans with k=4


In [23]:
# Only one value of k was requested for class 1, so the list holds a single
# fitted model; the assert guards against picking a model with another k.
kmeans_1 = models_1[0]
assert kmeans_1.n_clusters == CLUSTERS_1

In [24]:
# Keep the fitted centroids and tag every class-1 record with its cluster id.
centroids_1 = kmeans_1.cluster_centers_
clusters_1 = kmeans_1.predict(ones[NUMERICAL_FEATURES])
ones['Cluster'] = clusters_1
ones

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Target,Cluster
7,1.032523,0.187844,-0.440434,-0.147502,-0.218673,0.339381,1,1
8,-0.566720,-1.363035,1.521353,1.753432,-0.218673,0.756794,1,3
9,0.270979,-0.287213,1.128996,0.551379,-0.218673,-0.078031,1,3
10,-0.109793,0.858146,-0.048076,-0.147502,-0.218673,3.261270,1,1
11,-0.642875,-0.459014,1.128996,-0.147502,-0.218673,-0.078031,1,3
...,...,...,...,...,...,...,...,...
32538,-0.033639,-0.479051,1.128996,1.879765,-0.218673,0.339381,1,3
32545,0.042516,-0.741041,0.736639,-0.147502,-0.218673,-1.747682,1,3
32554,1.108678,1.249989,1.521353,-0.147502,-0.218673,-0.078031,1,3
32557,0.118670,-0.335246,-0.440434,-0.147502,-0.218673,-0.078031,1,1


In [25]:
ones['Cluster'].value_counts()

3    3439
1    3184
0     735
2     148
Name: Cluster, dtype: int64

In [26]:
centroids_1

array([[ 4.44838975e-01, -1.48055068e-02,  7.43578160e-01,
        -1.47502095e-01,  4.67052352e+00,  4.57733408e-01],
       [ 4.48305861e-01, -6.51023184e-03, -2.94162470e-01,
         7.99845813e-02, -2.18166258e-01,  3.32223415e-01],
       [ 5.88975178e-01,  3.32644329e-02,  1.12369385e+00,
         1.33494835e+01, -2.18673351e-01,  7.78228649e-01],
       [ 3.81767107e-01, -2.63194291e-02,  1.33629855e+00,
         2.21335002e-01, -2.18203858e-01,  4.30656185e-01]])

### Saving the centroids

In [27]:
# Store both centroid arrays in a single archive, keyed by class.
CENTROIDS_FILE = "centroids"
np.savez(CENTROIDS_FILE, **{"class_0": centroids_0, "class_1": centroids_1})

# Load the data

In [28]:
# Inputs: the arrays used for the random forest and the train/test CSVs that
# carry the cluster assignment.
RF_DATA_PATH = "dataset_backups/adult_randfor.data.npz"
ADULT_TRAIN_WITH_CLUSTERS = "adult_trainset.csv"
ADULT_TEST_WITH_CLUSTERS = "adult_testset.csv"

df_train = pd.read_csv(ADULT_TRAIN_WITH_CLUSTERS)
df_test  = pd.read_csv(ADULT_TEST_WITH_CLUSTERS)
# allow_pickle is left at its safe default (False): the same archive is read
# without it further down in this notebook, so the arrays do not need pickle,
# and enabling it would let a tampered file execute arbitrary code on load.
loaded = np.load(RF_DATA_PATH)
print("Keys in the dict: ", end="")
for key in loaded.keys():
    print(key, end=" ")

Keys in the dict: x_train x_test y_train y_test 

In [29]:
X_train, y_train, X_test, y_test = loaded["x_train"], loaded["y_train"], loaded["x_test"], loaded["y_test"]

In [30]:
X_train

array([[ 0.11866996,  0.28582866, -0.89505798, ..., -0.21867335,
         0.75679395,  0.25441118],
       [-0.49056568,  0.21891843,  1.97075657, ..., -0.21867335,
        -2.16509452,  0.05445545],
       [-0.33825677,  0.21891843, -0.18004533, ..., -0.21867335,
         0.75679395,  0.25441118],
       ...,
       [ 1.18483233,  0.55865922, -0.40906971, ..., -0.21867335,
        -0.07803133,  0.25441118],
       [ 1.03252342,  0.38706257,  4.04352285, ..., -0.21867335,
        -0.07803133,  0.25441118],
       [ 1.18483233,  0.28582866, -1.12109155, ..., -0.21867335,
         0.75679395,  0.25441118]])

In [31]:
# Keep only the feature columns: position 0 is skipped (presumably the index
# column saved with the CSV — confirm) and the last two columns are dropped
# (Target and Cluster, per the column listing of the re-read test subset below).
df_train_features = df_train.iloc[:,1:-2]
df_test_features  = df_test.iloc[:, 1:-2]

In [32]:
# Checking that the data used for the random forest match the ones in the sets
assert (df_train_features == X_train).all().all()
assert (df_test_features == X_test).all().all()

### Separate the two classes of the test dataset and, for each class/cluster, find the 5 records nearest to the centroid (considering only the numerical features)

In [33]:
# Split the test records by class label.
test_zeroes = df_test.loc[df_test['Target'] == 0]
test_ones = df_test.loc[df_test['Target'] == 1]

In [34]:
print(test_zeroes['Cluster'].value_counts())
print(test_ones['Cluster'].value_counts())

0    1770
1    1444
2    1214
3     846
4     158
Name: Cluster, dtype: int64
3    826
1    764
0    177
2     35
Name: Cluster, dtype: int64


Keep only the numerical features and the `Cluster` column.

In [35]:
# Narrow both class subsets down to the numeric features plus the cluster id.
test_zeroes = test_zeroes.loc[:, NUMERICAL_FEATURES + ['Cluster']]
test_ones = test_ones.loc[:, NUMERICAL_FEATURES + ['Cluster']]

In [36]:
print(test_zeroes['Cluster'].value_counts())
print(test_ones['Cluster'].value_counts())

0    1770
1    1444
2    1214
3     846
4     158
Name: Cluster, dtype: int64
3    826
1    764
0    177
2     35
Name: Cluster, dtype: int64


## Class 0

In [37]:
CENTROID_0 = centroids_0

In [38]:
len(CENTROID_0)

5

In [39]:
assert(len(CENTROID_0[0]) == len(NUMERICAL_FEATURES))

In [40]:
test_zeroes

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Cluster
0,-1.404419,-0.106742,-0.048076,-0.147502,-0.218673,-0.078031,0
1,0.804060,0.807131,1.128996,-0.147502,-0.218673,1.591619,1
2,-0.185948,-0.291311,-0.440434,-0.147502,-0.218673,-0.244996,0
3,-0.185948,-0.697134,-0.440434,-0.147502,-0.218673,-0.078031,0
4,0.727906,2.002122,-0.440434,-0.147502,-0.218673,-0.078031,3
...,...,...,...,...,...,...,...
5427,-0.262102,-0.123409,-0.832791,-0.147502,-0.218673,-0.078031,0
5428,1.717914,-0.259368,-2.402221,-0.147502,3.863529,-0.078031,4
5429,-0.338257,-0.777716,-0.048076,-0.147502,-0.218673,0.756794,1
5430,-0.566720,0.683618,-0.440434,-0.147502,-0.218673,-0.078031,3


In [41]:
from sklearn.metrics import pairwise_distances

def find_nearest_neighbors(df: pd.DataFrame, centroids, k, columns_to_consider=NUMERICAL_FEATURES, group_by=['Cluster']):
    """Pick, for every cluster present in ``df``, the ``k`` rows closest to its centroid.

    Args:
        df: records to search; must contain ``columns_to_consider`` and the
            ``group_by`` column.
        centroids: sequence of centroid vectors indexed by the integer cluster
            label, i.e. ``centroids[c]`` is the centre of cluster ``c``.
        k: maximum number of rows kept per cluster (fewer if the cluster is
            smaller).
        columns_to_consider: feature columns used for the distance; their
            number must match the centroid dimension.
        group_by: name (or one-element list with the name) of the column
            holding the cluster label.

    Returns:
        The selected rows of ``df`` (all columns), ordered by cluster label and,
        within a cluster, by increasing distance from the centroid.

    Raises:
        ValueError: if ``group_by`` names more than one column.
        IndexError: if a cluster label has no corresponding centroid.
    """
    assert len(columns_to_consider) == len(centroids[0])
    group_cols = [group_by] if isinstance(group_by, str) else list(group_by)
    if len(group_cols) != 1:
        raise ValueError("group_by must name exactly one cluster-label column")
    label_col = group_cols[0]
    # indices of the elements to put in the final dataframe
    clustered_ind = []
    for label, cluster in df.groupby(group_cols):
        # Depending on the pandas version, grouping by a one-element list
        # yields either a scalar key or a 1-tuple.
        if isinstance(label, tuple):
            label = label[0]
        # Look the centroid up by label rather than by position, so a cluster
        # that has no rows in `df` cannot shift the pairing.
        cent = centroids[int(label)]
        print(cluster[label_col].value_counts())
        print(f"Centroid: {cent}")
        features = cluster[columns_to_consider].copy()
        # pairwise_distances gives one column (a single centroid); flatten it.
        features['Dist'] = pairwise_distances(features, [cent]).ravel()
        # sort by distance and get the first k rows
        sorted_f = features.sort_values(by=['Dist']).head(k)
        clustered_ind += sorted_f.index.to_list()
    return df.loc[clustered_ind]
        

In [42]:
tmp_0 = find_nearest_neighbors(test_zeroes, CENTROID_0, 5)

0    1770
Name: Cluster, dtype: int64
Centroid: [-0.91519071 -0.31545353 -0.44952021 -0.13175274 -0.21808602 -0.61974068]
1    1444
Name: Cluster, dtype: int64
Centroid: [-0.14038439 -0.36038229  0.57185097 -0.12207358 -0.21555057  0.60708917]
2    1214
Name: Cluster, dtype: int64
Centroid: [ 1.21783812 -0.22795048 -0.67368979 -0.12235867 -0.21570559 -0.32054865]
3    846
Name: Cluster, dtype: int64
Centroid: [-0.48620289  1.64817609 -0.29054581 -0.13075671 -0.21579687 -0.11932275]
4    158
Name: Cluster, dtype: int64
Centroid: [ 0.01207687 -0.07068077 -0.09399062 -0.14750209  4.25785802 -0.03590945]


In [43]:
assert len(tmp_0) == 5 * 5

## Class 1

In [44]:
CENTROID_1 = centroids_1

In [45]:
centroids_1

array([[ 4.44838975e-01, -1.48055068e-02,  7.43578160e-01,
        -1.47502095e-01,  4.67052352e+00,  4.57733408e-01],
       [ 4.48305861e-01, -6.51023184e-03, -2.94162470e-01,
         7.99845813e-02, -2.18166258e-01,  3.32223415e-01],
       [ 5.88975178e-01,  3.32644329e-02,  1.12369385e+00,
         1.33494835e+01, -2.18673351e-01,  7.78228649e-01],
       [ 3.81767107e-01, -2.63194291e-02,  1.33629855e+00,
         2.21335002e-01, -2.18203858e-01,  4.30656185e-01]])

In [46]:
len(CENTROID_1)

4

In [47]:
assert(len(CENTROID_1[0]) == len(NUMERICAL_FEATURES))

In [48]:
test_ones

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Cluster
5432,-0.033639,0.044852,1.521353,13.349484,-0.218673,1.591619,2
5433,-0.109793,-0.598409,1.521353,-0.147502,4.669585,-0.078031,0
5434,0.651751,-0.456989,2.306068,-0.147502,-0.218673,1.591619,3
5435,1.870222,1.829157,1.128996,-0.147502,4.350624,-0.078031,0
5436,-0.033639,0.151973,1.128996,-0.147502,-0.218673,-0.078031,3
...,...,...,...,...,...,...,...
7229,-0.719029,-0.842643,1.128996,-0.147502,-0.218673,0.339381,3
7230,0.880215,-0.583512,1.128996,-0.147502,-0.218673,0.756794,3
7231,0.651751,1.706694,1.913711,-0.147502,-0.218673,0.756794,3
7232,0.804060,3.351670,-0.440434,-0.147502,-0.218673,0.088934,1


In [49]:
tmp_1 = find_nearest_neighbors(test_ones, CENTROID_1, 5)

0    177
Name: Cluster, dtype: int64
Centroid: [ 0.44483898 -0.01480551  0.74357816 -0.14750209  4.67052352  0.45773341]
1    764
Name: Cluster, dtype: int64
Centroid: [ 0.44830586 -0.00651023 -0.29416247  0.07998458 -0.21816626  0.33222341]
2    35
Name: Cluster, dtype: int64
Centroid: [ 0.58897518  0.03326443  1.12369385 13.3494835  -0.21867335  0.77822865]
3    826
Name: Cluster, dtype: int64
Centroid: [ 0.38176711 -0.02631943  1.33629855  0.221335   -0.21820386  0.43065618]


In [50]:
final_dataset = df_test.loc[tmp_0.index.append(tmp_1.index)]

In [51]:
final_dataset.to_csv("adult_clustered_5_closest.csv")

In [52]:
final_dataset.shape

(45, 17)

# EMBEDDING THE NEW DATASET IN THE RANDOM FOREST DATA

In [53]:
df_read = pd.read_csv("adult_clustered_5_closest.csv")
df_read.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Target,Cluster
0,2642,10006,-0.719029,0.218918,-0.298864,0.164328,-0.440434,0.048399,0.116515,0.066334,0.263855,0.313955,-0.147502,-0.218673,-0.495444,0.254411,0,0
1,1495,10240,-0.719029,0.218918,-0.26305,0.164328,-0.440434,0.048399,0.041134,0.014343,0.263855,0.113783,-0.147502,-0.218673,-0.495444,0.254411,0,0
2,3039,26141,-0.795183,0.218918,-0.499826,0.164328,-0.440434,0.455011,0.041134,0.455728,0.263855,0.313955,-0.147502,-0.218673,-0.495444,0.254411,0,0
3,2936,13014,-0.795183,0.218918,-0.129268,0.164328,-0.440434,0.048399,0.041134,0.014343,0.263855,0.113783,-0.147502,-0.218673,-0.745892,0.254411,0,0
4,5232,17857,-1.099801,0.218918,-0.18473,0.164328,-0.440434,0.048399,0.041134,0.106689,0.263855,0.113783,-0.147502,-0.218673,-0.495444,0.254411,0,0


In [54]:
# Select by column name rather than by position: the re-read CSV carries two
# leading index columns, and positional offsets would silently pick the wrong
# columns if that ever changed.
features = df_read.loc[:, 'Age':'Native-country']
targ = df_read['Target']

In [55]:
features.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country
0,-0.719029,0.218918,-0.298864,0.164328,-0.440434,0.048399,0.116515,0.066334,0.263855,0.313955,-0.147502,-0.218673,-0.495444,0.254411
1,-0.719029,0.218918,-0.26305,0.164328,-0.440434,0.048399,0.041134,0.014343,0.263855,0.113783,-0.147502,-0.218673,-0.495444,0.254411
2,-0.795183,0.218918,-0.499826,0.164328,-0.440434,0.455011,0.041134,0.455728,0.263855,0.313955,-0.147502,-0.218673,-0.495444,0.254411
3,-0.795183,0.218918,-0.129268,0.164328,-0.440434,0.048399,0.041134,0.014343,0.263855,0.113783,-0.147502,-0.218673,-0.745892,0.254411
4,-1.099801,0.218918,-0.18473,0.164328,-0.440434,0.048399,0.041134,0.106689,0.263855,0.113783,-0.147502,-0.218673,-0.495444,0.254411


In [82]:
# np.asarray makes this cell safe to re-run: it converts a DataFrame/Series
# and passes an existing ndarray through unchanged, whereas .to_numpy() fails
# once the names already refer to arrays.
features = np.asarray(features)
targ = np.asarray(targ)

In [58]:
# Copy every array of the original archive into a plain dict and add the
# clustered test subset under two new keys.
new_dict = {}
loaded = np.load("./dataset_backups/adult_randfor.data.npz")
print(*loaded.keys())
new_dict.update((str(key), loaded[key]) for key in loaded.keys())
new_dict["x_test_clustered"] = features
new_dict["y_test_clustered"] = targ
new_dict

x_train x_test y_train y_test


{'x_train': array([[ 0.11866996,  0.28582866, -0.89505798, ..., -0.21867335,
          0.75679395,  0.25441118],
        [-0.49056568,  0.21891843,  1.97075657, ..., -0.21867335,
         -2.16509452,  0.05445545],
        [-0.33825677,  0.21891843, -0.18004533, ..., -0.21867335,
          0.75679395,  0.25441118],
        ...,
        [ 1.18483233,  0.55865922, -0.40906971, ..., -0.21867335,
         -0.07803133,  0.25441118],
        [ 1.03252342,  0.38706257,  4.04352285, ..., -0.21867335,
         -0.07803133,  0.25441118],
        [ 1.18483233,  0.28582866, -1.12109155, ..., -0.21867335,
          0.75679395,  0.25441118]]),
 'x_test': array([[-1.40441914,  0.26896013, -0.106742  , ..., -0.21867335,
         -0.07803133,  0.25441118],
        [ 0.80406006,  0.2946299 ,  0.80713136, ..., -0.21867335,
          1.59161922,  0.25441118],
        [-0.18594786,  0.21891843, -0.29131116, ..., -0.21867335,
         -0.24499639,  0.25441118],
        ...,
        [ 0.65175115,  0.21891843

In [59]:
# Write the original arrays plus the clustered test subset to a new
# compressed archive, one entry per dict key.
np.savez_compressed("./adult_randfor.data-extended", **new_dict)