In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy dataset: eight rows holding two numeric features (age, income),
# two categorical features (gender, country) and a 0/1 target.
data = dict(
    age=[25, 32, 47, 51, 28, 38, 45, 61],
    gender=['M', 'F', 'F', 'M', 'F', 'M', 'F', 'M'],
    income=[50000, 55000, 60000, 65000, 48000, 58000, 62000, 68000],
    country=['US', 'UK', 'UK', 'US', 'US', 'UK', 'UK', 'US'],
    target=[0, 1, 1, 0, 1, 1, 0, 0],
)

# One DataFrame column per key of `data`, in insertion order.
df = pd.DataFrame(data)

In [7]:
# Show the whole frame as the cell's output; it has only 8 rows, so no head()/sample() is needed.
df

Unnamed: 0,age,gender,income,country,target
0,25,M,50000,US,0
1,32,F,55000,UK,1
2,47,F,60000,UK,1
3,51,M,65000,US,0
4,28,F,48000,US,1
5,38,M,58000,UK,1
6,45,F,62000,UK,0
7,61,M,68000,US,0


In [24]:
# Print a structural summary: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   age      8 non-null      int64 
 1   gender   8 non-null      object
 2   income   8 non-null      int64 
 3   country  8 non-null      object
 4   target   8 non-null      int64 
dtypes: int64(3), object(2)
memory usage: 448.0+ bytes


In [25]:
# Per-column count of distinct values, printed beneath a heading line.
unique_values_per_column = df.nunique()

print(
    "Number of unique values in each column:",
    unique_values_per_column,
    sep="\n",
)

Number of unique values in each column:
age        8
gender     2
income     8
country    2
target     2
dtype: int64


In [27]:
# Initial memory footprint of the DataFrame, reported in megabytes.
bytes_per_mb = 1024 ** 2

# Per-column usage (computed with deep=True), then the grand total.
memory_usage = df.memory_usage(deep=True)
total_memory_usage = memory_usage.sum()

total_memory_mb = total_memory_usage / bytes_per_mb
print(f"\nTotal memory usage of the DataFrame: {total_memory_mb:.2f} MB")


Total memory usage of the DataFrame: 0.00 MB


In [17]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [18]:
# Column groups routed to each transformer.
numeric_features = ['age', 'income']
categorical_features = ['gender', 'country']

# Scale the numeric columns and one-hot encode the categorical ones.
# `target` is not listed in either group.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(), categorical_features),
)

In [19]:
# Display the transformer object as the cell's output to inspect how columns are paired with transformers.
preprocessor

In [20]:
# Fit the preprocessor on the full frame and transform it in one call.
# The displayed result has 6 columns: scaled age, scaled income,
# a two-column one-hot for gender and a two-column one-hot for country;
# `target` does not appear in the output.
X_transformed = preprocessor.fit_transform(df)
# Trailing expression: show the transformed array as the cell's output.
X_transformed

array([[-1.37468109, -1.26270991,  0.        ,  1.        ,  0.        ,
         1.        ],
       [-0.7685225 , -0.49743118,  1.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.53038877,  0.26784756,  1.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.8767651 ,  1.03312629,  0.        ,  1.        ,  0.        ,
         1.        ],
       [-1.11489884, -1.5688214 ,  1.        ,  0.        ,  0.        ,
         1.        ],
       [-0.24895799, -0.03826394,  0.        ,  1.        ,  1.        ,
         0.        ],
       [ 0.3572006 ,  0.57395905,  1.        ,  0.        ,  1.        ,
         0.        ],
       [ 1.74270595,  1.49229353,  0.        ,  1.        ,  0.        ,
         1.        ]])

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Same idea built with the explicit ColumnTransformer constructor, where
# every step is given a name by hand. Only `gender` is one-hot encoded
# here; `country` is not passed to any transformer.
scaled_columns = ['age', 'income']
encoded_columns = ['gender']

preprocessor2 = ColumnTransformer(
    transformers=[
        ('standardscaler', StandardScaler(), scaled_columns),
        ('onehotencoder', OneHotEncoder(), encoded_columns),
    ],
)

In [23]:
# Fit the second preprocessor on the full frame and transform it in one call.
# The displayed result has 4 columns: scaled age, scaled income and a
# two-column one-hot for gender.
X_transformed2 = preprocessor2.fit_transform(df)
# Trailing expression: show the transformed array as the cell's output.
X_transformed2

array([[-1.37468109, -1.26270991,  0.        ,  1.        ],
       [-0.7685225 , -0.49743118,  1.        ,  0.        ],
       [ 0.53038877,  0.26784756,  1.        ,  0.        ],
       [ 0.8767651 ,  1.03312629,  0.        ,  1.        ],
       [-1.11489884, -1.5688214 ,  1.        ,  0.        ],
       [-0.24895799, -0.03826394,  0.        ,  1.        ],
       [ 0.3572006 ,  0.57395905,  1.        ,  0.        ],
       [ 1.74270595,  1.49229353,  0.        ,  1.        ]])