## Prep features for training

> This notebook walks through the steps needed to prepare the features for training.

## Setup

In [1]:
# Enable autoreload so edits to imported modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from typing import Optional

import dill
import numpy as np
import pandas as pd
from datasets import load_dataset
from loguru import logger
from pydantic import BaseModel
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

sys.path.insert(0, "..")


In [3]:
class Args(BaseModel):
    """Configuration for this notebook run.

    Holds the run/experiment identifiers, column names, and a few numeric
    settings. Call ``init()`` after construction to resolve the directory
    used for persisting notebook artifacts.
    """

    run_name: str = "appendix"
    experiment_name: str = "feature-prep"
    testing: bool = True
    # Stays None until init() resolves it, so the type must allow None.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    tfm_chunk_size: int = 5000

    sequence_length: int = 10

    def init(self) -> "Args":
        """Resolve the persist directory and create it unless in testing mode.

        The directory is ``data/<run_name>/<experiment_name>`` made absolute
        against the current working directory. It is only created on disk
        when ``testing`` is False.

        Returns:
            This instance, so the call can be chained: ``Args().init()``.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}/{self.experiment_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the config and resolve the persist directory (init() returns the instance).
args = Args().init()

# Print the resolved settings as JSON so they are recorded in the notebook output.
print(args.model_dump_json(indent=2))

{
  "run_name": "appendix",
  "experiment_name": "feature-prep",
  "testing": true,
  "notebook_persist_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/appendix/feature-prep",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "tfm_chunk_size": 5000,
  "sequence_length": 10
}


## Load metadata

In [4]:
# Load the "raw_meta_Electronics" configuration of the Amazon Reviews 2023 dataset.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own loading
# script to run locally — confirm the source is trusted.
metadata_raw = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Electronics", trust_remote_code=True
)
# Convert the "full" split to a pandas DataFrame for inspection.
metadata_raw_df = metadata_raw["full"].to_pandas()
metadata_raw_df

Using the latest cached version of the module from /home/dinhln/.cache/huggingface/modules/datasets_modules/datasets/McAuley-Lab--Amazon-Reviews-2023/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8 (last modified on Sat Apr 12 00:18:05 2025) since it couldn't be found locally at McAuley-Lab/Amazon-Reviews-2023, or remotely on the Hugging Face Hub.


Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Electronics,FS-1051 FATSHARK TELEPORTER V3 HEADSET,3.5,6,[],[Teleporter V3 The “Teleporter V3” kit sets a ...,,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Fat Shark,"[Electronics, Television & Video, Video Glasses]","{""Date First Available"": ""August 2, 2014"", ""Ma...",B00MCW7G9M,,,
1,All Electronics,Ce-H22B12-S1 4Kx2K Hdmi 4Port,5.0,1,"[UPC: 662774021904, Weight: 0.600 lbs]",[HDMI In - HDMI Out],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",SIIG,"[Electronics, Television & Video, Accessories,...","{""Product Dimensions"": ""0.83 x 4.17 x 2.05 inc...",B00YT6XQSE,,,
2,Computers,Digi-Tatoo Decal Skin Compatible With MacBook ...,4.5,246,[WARNING: Please IDENTIFY MODEL NUMBER on the ...,[],19.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['AL 2Sides Video', 'MacBook Protect...",Digi-Tatoo,"[Electronics, Computers & Accessories, Laptop ...","{""Brand"": ""Digi-Tatoo"", ""Color"": ""Fresh Marble...",B07SM135LS,,,
3,AMAZON FASHION,NotoCity Compatible with Vivoactive 4 band 22m...,4.5,233,[☛NotoCity 22mm band is designed for Vivoactiv...,[],9.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",NotoCity,"[Electronics, Wearable Technology, Clips, Arm ...","{""Date First Available"": ""May 29, 2020"", ""Manu...",B089CNGZCW,,,
4,Cell Phones & Accessories,Motorola Droid X Essentials Combo Pack,3.8,64,"[New Droid X Essentials Combo Pack, Exclusive ...",[all Genuine High Quality Motorola Made Access...,14.99,"{'hi_res': [None, None, None, None, None], 'la...","{'title': [], 'url': [], 'user_id': []}",Verizon,"[Electronics, Computers & Accessories, Compute...","{""Product Dimensions"": ""11.6 x 6.9 x 3.1 inche...",B004E2Z88O,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1610007,Computers,"Wintec FileMate Pro USB Flash Drive, 3FMUSB32G...",5.0,1,"[32GB / 32 GB file storage, USB mass storage d...",[--New in retail packaging --Fast USB 2.0 data...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Wintec Industries,"[Electronics, Computers & Accessories, Data St...","{""Product Dimensions"": ""0.78 x 0.31 x 2.75 inc...",B003NUIU9M,,,
1610008,,Tsugar Noise Reduction Wireless Headphones Blu...,1.0,2,[High Fidelity Sound: Intelligent noise reduct...,[Description: 100% brand new high quality 1.Hi...,,"{'hi_res': [None, 'https://m.media-amazon.com/...","{'title': [], 'url': [], 'user_id': []}",Tsugar,"[Electronics, Headphones, Earbuds & Accessorie...","{""Best Sellers Rank"": {""Electronics"": 547760, ...",B0BHVY33TL,,,
1610009,,"Hardshell Case for MacBook Pro (16-inch, 2021)...",4.6,11,"[Compatible with MacBook Pro 16-inch (2021), I...",[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Incase Designs,"[Electronics, Computers & Accessories, Laptop ...","{""Product Dimensions"": ""9.88 x 0.94 x 14.13 in...",B09SQGRFFH,,,
1610010,Computers,"FYY 12-13.3"" Laptop Sleeve Case Bag, PU Leathe...",4.0,35,[【Compatibility】FYY laptop Bag sleeve perfect ...,[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",FYY,"[Electronics, Computers & Accessories, Laptop ...","{""Standing screen display size"": ""12.3 Inches""...",B091JWCSG5,,,


In [5]:
# Column dtypes of the raw metadata; note that `price` is stored as object, not as a numeric type.
metadata_raw_df.dtypes

main_category       object
title               object
average_rating     float64
rating_number        int64
features            object
description         object
price               object
images              object
videos              object
store               object
categories          object
details             object
parent_asin         object
bought_together     object
subtitle            object
author              object
dtype: object

In [6]:
# Show the rows at positions 4 and 5 for a hand-picked subset of columns,
# with column-width truncation disabled so the full cell contents are visible.
with pd.option_context("display.max_colwidth", None):
    display(
        metadata_raw_df.iloc[4:6].loc[
            :,
            [
                "title",
                "main_category",
                "categories",
                "features",
                "description",
                "price",
                "store",
                "details",
            ],
        ]
    )

Unnamed: 0,title,main_category,categories,features,description,price,store,details
4,Motorola Droid X Essentials Combo Pack,Cell Phones & Accessories,"[Electronics, Computers & Accessories, Computer Accessories & Peripherals, Memory Cards, Micro SD Cards]","[New Droid X Essentials Combo Pack, Exclusive Package Incredible Value Worth $145!!!, Includes all Genuine High Quality Motorola Made Accessories]","[all Genuine High Quality Motorola Made Accessories, including Multimedia Station with HDMI technology, HDMI Cable and AC Wall Charger, Motorola Navigation / Music Vehicle Charging Mount Car Dock and Motorola 12v Vehicle Power Adapter Charger!]",14.99,Verizon,"{""Product Dimensions"": ""11.6 x 6.9 x 3.1 inches"", ""Item Weight"": ""1.5 pounds"", ""Other display features"": ""Wireless"", ""Manufacturer"": ""Verizon"", ""Date First Available"": ""November 26, 2010""}"
5,Raymarine Wi-Fish DownVision Blackbox Sonar with Wi-Fi,Sports & Outdoors,"[Electronics, Car & Vehicle Electronics, Marine Electronics, Fish Finders & Depth Finders]","[Black box Wi-Fi CHIRP DownVision sonar module, Connect with the free Wi-Fish mobile app for iOS and Android devices, Return to the location of favorite fishing spots using the Wi-Fish augmented reality mode and your phone's GPS, Pause, zoom, and rewind the sonar right on your phone; Save you favorite catch and share with friends online, Award winning wide-spectrum CHIRP DownVision sonar technology for photo-like images, high-speed tracking, and better deep water resolution; Includes transom mount CHIRP transducer with temperature sensor]",[Transform your smartphone into a powerful CHIRP DownVision sonar with the Wi-Fish wireless sonar. Simply download the free Wi-Fish app and connect to the Wi-Fish module using standard Wi-Fi and you will be streaming real time sonar right to your phone or tablet. The Wi-Fish app also lets you rewind and save sonar imagery for sharing with friends online instantly. iOS7 or higher or Android 4 or higher compatible.],,Raymarine,"{""Item Package Dimensions L x W x H"": ""9.3 x 8 x 4.4 inches"", ""Package Weight"": ""2.6 Pounds"", ""Item Dimensions LxWxH"": ""15.75 x 15.75 x 15.75 inches"", ""Item Weight"": ""0.1 Kilograms"", ""Brand Name"": ""Raymarine"", ""Suggested Users"": ""unisex-adult"", ""Number of Items"": ""1"", ""Manufacturer"": ""Raymarine"", ""Part Number"": ""E70290"", ""Model Year"": ""2016"", ""Date First Available"": ""February 3, 2015""}"


## Necessary transformations

### TF-IDF

In [70]:
# # Tf-idf vectorization
# corpus = "Leo Messi is the goat"
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit(corpus)
# X_trans = X.transform(corpus)

# X_trans

# # Raises ValueError: TfidfVectorizer expects an iterable of documents (e.g. a list of strings), not a single string

In [None]:
# TF-IDF vectorization on a plain Python list of documents
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
vectorizer = TfidfVectorizer()
# fit() returns the fitted vectorizer itself, so X refers to the same object as `vectorizer`
X = vectorizer.fit(corpus)
X_trans = X.transform(corpus)

X_trans

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 21 stored elements and shape (4, 9)>

In [9]:
# TF-IDF vectorization (also works when the corpus is a NumPy array)
corpus = np.array(
    [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]
)
vectorizer = TfidfVectorizer()
# fit() returns the fitted vectorizer itself, so X refers to the same object as `vectorizer`
X = vectorizer.fit(corpus)
X_trans = X.transform(corpus)

X_trans

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 21 stored elements and shape (4, 9)>

In [10]:
# TF-IDF vectorization (also works when the corpus is a pandas Series)
corpus = pd.Series(
    [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]
)
vectorizer = TfidfVectorizer()
# fit() returns the fitted vectorizer itself, so X refers to the same object as `vectorizer`
X = vectorizer.fit(corpus)
X_trans = X.transform(corpus)

X_trans

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 21 stored elements and shape (4, 9)>

In [11]:
# TF-IDF vectorization returns a sparse matrix; convert it to a dense array to inspect the values.
X_trans.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [12]:
# Get the feature names: the vocabulary terms, one per column of the TF-IDF matrix
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

### Standard scaler

In [13]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Toy matrix: 5 samples x 2 features, the second column being 10x the first
X = np.array([[1, 10], [2, 20], [3, 30], [4, 40], [5, 50]])

# Learn the per-column statistics and standardize in one call
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(X_scaled)


[[-1.41421356 -1.41421356]
 [-0.70710678 -0.70710678]
 [ 0.          0.        ]
 [ 0.70710678  0.70710678]
 [ 1.41421356  1.41421356]]


In [14]:
# Per-column means learned by the scaler during fit
print(scaler.mean_)

[ 3. 30.]


In [15]:
# Reproduce the scaler's first output column by hand: z = (x - mean) / std,
# using NumPy's default (population) standard deviation.
x0_mean = np.mean(X[:, 0])
x0_std = np.std(X[:, 0])
x0_scaled = (X[:, 0] - x0_mean) / x0_std
x0_scaled

array([-1.41421356, -0.70710678,  0.        ,  0.70710678,  1.41421356])

In [16]:
# We can see that sklearn.preprocessing.StandardScaler scales each feature (column) independently.

In [17]:
# StandardScaler on a pandas DataFrame: column names are tracked through fit
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Sample DataFrame
df = pd.DataFrame({"height": [150, 160, 170], "weight": [60, 65, 70]})

# fit() returns the fitted scaler, so construction and fitting can be chained
scaler = StandardScaler().fit(df)

print("IN :", scaler.feature_names_in_)
print("OUT:", scaler.get_feature_names_out())
X = scaler.transform(df)
print(X)


IN : ['height' 'weight']
OUT: ['height' 'weight']
[[-1.22474487 -1.22474487]
 [ 0.          0.        ]
 [ 1.22474487  1.22474487]]


### One-hot encoding

In [18]:
X = np.array(["Leo Messi", "Ronaldo", "Pedri"])

encoder = OneHotEncoder()
# The encoder works on 2-D input, so the 1-D array is reshaped into a single column
X_encoded = encoder.fit_transform(X.reshape(-1, 1))
X_encoded.toarray()  # the result is sparse; convert to dense to inspect it

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [19]:
# One output name per category; the "x0" prefix stands in for the unnamed input column
encoder.get_feature_names_out()

array(['x0_Leo Messi', 'x0_Pedri', 'x0_Ronaldo'], dtype=object)

### Imputer

In [20]:
from sklearn.impute import SimpleImputer

# Single-column object array with one missing value (np.nan) to fill in
X = np.array(["Leo Messi", "Ronaldo", "Pedri", np.nan], dtype=object).reshape(-1, 1)

# Replace missing entries with a fixed placeholder string
imputer = SimpleImputer(strategy="constant", fill_value="Unknown hehe")
X_imputed = imputer.fit_transform(X)
print(X_imputed)


[['Leo Messi']
 ['Ronaldo']
 ['Pedri']
 ['Unknown hehe']]


### Regex to parse price

In [22]:
# Toy price strings: a number embedded in text, a bare number, and a value with no number
data = pd.Series(["from 15.6 to hehe", "1.5", "none"])
# NOTE(review): this only prints the repr of the .str accessor object — looks like leftover debugging; confirm it can be dropped
print(data.str)
# Capture the first standalone number: digits with an optional fractional part, delimited by word boundaries
price_pattern = r"\b((?:\d+\.\d*)|(?:\d+))\b"
# extract() returns one row per input string; rows without a match come back as NaN
display(data.str.extract(price_pattern))

<pandas.core.strings.accessor.StringMethods object at 0x7129a8f83ed0>


Unnamed: 0,0
0,15.6
1,1.5
2,


In [23]:
def extract_price(X):
    """Parse the first standalone number out of each string as a float.

    Args:
        X: pandas Series of strings (anything exposing the ``.str`` accessor).

    Returns:
        A single-column DataFrame (column label ``0``) of floats, with NaN
        for entries that contain no matching number.
    """
    # Digits with an optional fractional part, delimited by word boundaries
    number_regex = r"\b((?:\d+\.\d*)|(?:\d+))\b"
    captured = X.str.extract(number_regex)
    return captured.astype(float)

In [24]:
# Wrap the price parser as a scikit-learn transformer. validate=False skips
# input validation, so the pandas Series reaches extract_price unchanged.
func_tfm = FunctionTransformer(func=extract_price, validate=False)
func_tfm.fit(data)

In [25]:
# Apply the wrapped extract_price to the toy price strings
func_tfm.transform(data)

Unnamed: 0,0
0,15.6
1,1.5
2,


### Sklearn pipeline

In [28]:
# Same toy corpus as before, plus an empty document as an edge case
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "",
]

# Chain the steps: TF-IDF -> dense array -> standardization.
# The TF-IDF output is sparse, so it is densified before being scaled.
pp = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("to_dense", FunctionTransformer(lambda x: x.toarray(), validate=False)),
        ("scaler", StandardScaler()),
    ]
)
pp.fit(corpus)
pp.transform(corpus)

array([[-0.5       ,  0.51114841,  1.22474487,  0.84003261, -0.5       ,
        -0.5       ,  0.84003261, -0.5       ,  0.84003261],
       [-0.5       ,  1.31836891, -0.81649658,  0.13076072, -0.5       ,
         2.        ,  0.13076072, -0.5       ,  0.13076072],
       [ 2.        , -1.17033286, -0.81649658,  0.07745191,  2.        ,
        -0.5       ,  0.07745191,  2.        ,  0.07745191],
       [-0.5       ,  0.51114841,  1.22474487,  0.84003261, -0.5       ,
        -0.5       ,  0.84003261, -0.5       ,  0.84003261],
       [-0.5       , -1.17033286, -0.81649658, -1.88827786, -0.5       ,
        -0.5       , -1.88827786, -0.5       , -1.88827786]])

In [49]:
# Look the vectorizer up by its step name rather than by position, so the
# lookup keeps pointing at the TF-IDF step if steps are added or removed.
pp["tfidf"].get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

### Column transformer to work with pandas dataframe

In [67]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd

# Sample data: one categorical column and one numeric column
df = pd.DataFrame({"city": ["Hanoi", "Saigon", "Danang"], "age": [25, 32, 47]})

# Feature groups, routed to different sub-pipelines below
cat_features = ["city"]
num_features = ["age"]

# Categorical branch: fill gaps with the most frequent value, then one-hot encode
cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder()),
        # Identity step; feature_names_out="one-to-one" passes the feature names through unchanged
        ("vibe", FunctionTransformer(lambda x: x, validate=False, feature_names_out="one-to-one")),
    ]
)

# Numeric branch: standardize
num_pipe = Pipeline(steps=[("scaler", StandardScaler())])

# Full column transformer: each branch is applied to its own column group
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_features),
        ("num", num_pipe, num_features),
    ]
)

# Fit first; feature names and transform are only available afterwards
preprocessor.fit(df)



In [64]:
# Output column names carry the branch name as a prefix ("cat__", "num__")
preprocessor.get_feature_names_out()

array(['cat__city_Danang', 'cat__city_Hanoi', 'cat__city_Saigon',
       'num__age'], dtype=object)

In [68]:
# Apply the fitted preprocessor: three one-hot city columns followed by the scaled age column
preprocessor.transform(df)

array([[ 0.        ,  1.        ,  0.        , -1.05332743],
       [ 0.        ,  0.        ,  1.        , -0.29057308],
       [ 1.        ,  0.        ,  0.        ,  1.34390052]])