In [3]:
import os
import mlflow

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, TargetEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

AttributeError: Module 'numpy.core' has no attribute 'numerictypes'

In [None]:
df = pd.read_pickle('../data/clean_data.pkl')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1437 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   battery_power  1437 non-null   int64   
 1   blue           1437 non-null   category
 2   clock_speed    1437 non-null   float16 
 3   dual_sim       1437 non-null   category
 4   fc             1437 non-null   int8    
 5   four_g         1437 non-null   category
 6   int_memory     1437 non-null   int8    
 7   m_dep          1437 non-null   float16 
 8   mobile_wt      1437 non-null   int64   
 9   n_cores        1437 non-null   int8    
 10  pc             1437 non-null   int8    
 11  px_height      1437 non-null   int64   
 12  px_width       1437 non-null   int64   
 13  ram            1437 non-null   int64   
 14  sc_h           1437 non-null   int8    
 15  sc_w           1437 non-null   int8    
 16  talk_time      1437 non-null   int8    
 17  three_g        1437 non-null   categor

In [None]:
df = df.rename(columns={'price_range': 'target'})

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)

In [None]:
cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()
cat_features

['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']

In [None]:
num_features = X_train.select_dtypes(include=['number']).columns.to_list()
num_features

['battery_power',
 'clock_speed',
 'fc',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time']

In [None]:
s_scaler = StandardScaler()
l_encoder = TargetEncoder() 
classifier = RandomForestClassifier()

In [None]:
# –î–ª—è —É–¥–æ–±–Ω–æ–π —Ä–∞–±–æ—Ç—ã —Å–æ —Å—Ç–æ–ª–±—Ü–∞–º–∏
preprocessor = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –¥–ª—è —á–∏—Å–ª–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
        ('cat', l_encoder, cat_features), # –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –¥–ª—è –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
    ],
    remainder='drop' ) # –£–¥–∞–ª—è–µ–º —Å—Ç–æ–ª–±—Ü—ã, –∫–æ—Ç–æ—Ä—ã–µ –Ω–µ –∑–∞—Ç—Ä–æ–Ω—É—Ç—ã –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è

In [None]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', classifier)])

pipeline.fit(X_train, y_train)

In [None]:
predictions = pipeline.predict(X_test) 

metrics = {}
metrics["precision"] = precision_score(y_test, predictions, average='weighted')   
metrics["recall"] = recall_score(y_test, predictions, average='weighted')
#metrics["roc_auc"] = roc_auc_score(y_test, predictions, average='ovo')
metrics["f1"] = f1_score(y_test, predictions, average='weighted')

metrics

{'precision': np.float64(0.8425744117188717),
 'recall': np.float64(0.8388888888888889),
 'f1': np.float64(0.8390611541774332)}

In [None]:
# –†–∞–±–æ—Ç–∞–µ–º —Å MLflow –ª–æ–∫–∞–ª—å–Ω–æ
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)   
mlflow.set_registry_uri(registry_uri) 

In [None]:
# –Ω–∞–∑–≤–∞–Ω–∏–µ —Ç–µ—Å—Ç–æ–≤–æ–≥–æ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞, –∑–∞–ø—É—Å–∫–∞ (run) –≤–Ω—É—Ç—Ä–∏ –Ω–µ–≥–æ, –∏–º–µ–Ω–∏, –ø–æ–¥ –∫–æ—Ç–æ—Ä—ã–º –º–æ–¥–µ–ª—å –±—É–¥–µ—Ç —Ä–µ–≥–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞—Ç—å—Å—è
EXPERIMENT_NAME = "estate_project"
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"

In [None]:
# –û–±—è–∑–∞—Ç–µ–ª—å–Ω–æ –ª–æ–≥–∏—Ä—É–µ–º —Å–∏–≥–Ω–∞—Ç—É—Ä—É –º–æ–¥–µ–ª–∏ –∏ –ø—Ä–∏–º–µ—Ä –≤—Ö–æ–¥–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö. –ü–æ–¥–≥–æ—Ç–æ–≤–∏–º –∏—Ö
from mlflow.models import infer_signature

signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)



In [None]:
# –ë—É–¥–µ–º –ª–æ–≥–∏—Ä–æ–≤–∞—Ç—å requirements –∏ –∞—Ä—Ç–µ—Ñ–∞–∫—Ç - —Ç–µ–∫—Å—Ç–æ–≤—ã–π —Ñ–∞–π–ª
req_file = '../requirements.txt'

In [None]:
# –ü–∞—Ä–∞–º–µ—Ç—Ä—ã, –∫–æ—Ç–æ—Ä–æ—ã–µ –±—É–¥—É—Ç –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω—ã, –º–æ–∂–µ–º –∑–∞–¥–∞–≤–∞—Ç—å –≤—Ä—É—á–Ω—É—é –∏–ª–∏ –ø–æ–ª–Ω–æ—Å—Ç—å—é –≤–∑—è—Ç—å –∏–∑ –º–æ–¥–µ–ª–∏
#params_dict = {'n_estimators': 10, 'max_depth': 10}
params_dict = pipeline.get_params()

In [None]:
# –ö–æ–≥–¥–∞ —Å–æ–∑–¥–∞–µ–º –Ω–æ–≤—ã–π —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç, —Ç–æ: 
#experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

# –í–ø–æ—Å–ª–µ–¥—Å—Ç–≤–∏–∏. —á—Ç–æ–±—ã –¥–æ–±–∞–≤–ª—è—Ç—å –∑–∞–ø—É—Å–∫–∏ –≤ —ç—Ç–æ—Ç –∂–µ —ç–∫—Å–µ–ø—Ä–∏–º–µ–Ω—Ç –º—ã –¥–æ–ª–∂–Ω—ã –ø–æ–ª—É—á–∏—Ç—å –µ–≥–æ id:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # –ø–æ–ª—É—á–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä –∑–∞–ø—É—Å–∫–∞ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - numpy (current: 2.1.3, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 18.00it/s]
 - numpy (current: 2.1.3, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/11/14 13:18:19 INFO mlflow.tracking._tracking_service.client: üèÉ View run baseline model at: http://127.0.0.1:5000/#/experiments/1/runs/c45c94c37cb345fd9bd82a6d21b56482.
2024/11/14 13:18:19 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [None]:
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:
    pipeline.fit(X_train, y_train)

2024/11/14 13:18:32 INFO mlflow.tracking._tracking_service.client: üèÉ View run auto at: http://127.0.0.1:5000/#/experiments/1/runs/a20b8ecc172449c9b006918c51e896a4.
2024/11/14 13:18:32 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [None]:
mlflow.sklearn.autolog(disable=True)

In [None]:
classifier2 = RandomForestClassifier(n_estimators=10, max_depth=6)

In [None]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', classifier2)])

pipeline.fit(X_train, y_train)

In [None]:
predictions = pipeline.predict(X_test) 

metrics = {}
metrics["precision"] = precision_score(y_test, predictions, average='weighted')   
metrics["recall"] = recall_score(y_test, predictions, average='weighted')
#metrics["roc_auc"] = roc_auc_score(y_test, predictions, average='ovo')
metrics["f1"] = f1_score(y_test, predictions, average='weighted')

metrics

{'precision': np.float64(0.6604324594476533),
 'recall': np.float64(0.6472222222222223),
 'f1': np.float64(0.6324089962886389)}

In [None]:
# !!! –ü—Ä–æ–≤–µ—Ä–∏—Ç—å –Ω–∞–∑–≤–∞–Ω–∏–µ –ø—Ä–æ–≥–æ–Ω–∞ –∞ —Ç–∞–∫–∂–µ –≤—Å–µ –ª–æ–≥–∏—Ä—É–µ–º—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –∏ –∞—Ä—Ç–µ—Ñ–∞–∫—Ç—ã, —á—Ç–æ –æ–Ω–∏ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É—é—Ç –≤—Ç–æ—Ä–æ–π "–º–∞–ª–µ–Ω—å–∫–æ–π" –º–æ–¥–µ–ª–∏. 

RUN_NAME = 'smaller_model'

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # –ø–æ–ª—É—á–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä –∑–∞–ø—É—Å–∫–∞ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_params(pipeline.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - numpy (current: 2.1.3, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 30.02it/s]
 - numpy (current: 2.1.3, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/11/14 13:18:34 INFO mlflow.tracking._tracking_service.client: üèÉ View run smaller_model at: http://127.0.0.1:5000/#/experiments/1/runs/6c30459f197d426999dc386a4c5187da.
2024/11/14 13:18:34 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [None]:
# No model
# –õ–æ–≥–∏—Ä–æ–≤–∞—Ç—å –º–æ–∂–Ω–æ —Ç–æ–ª—å–∫–æ –∞—Ä—Ç–µ—Ñ–∞–∫—Ç—ã, –±–µ–∑ –º–æ–¥–µ–ª–∏. –ù–∞–ø—Ä–∏–º–µ—Ä, –∑–∞–ª–æ–≥–∏—Ä–æ–∞–≤—Ç—å –≥—Ä–∞—Ñ–∏–∫–∏ –ø–æ—Å–ª–µ —ç—Ç–∞–ø–∞ EDA

RUN_NAME = 'no_model'
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id 


run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/11/14 13:18:34 INFO mlflow.tracking._tracking_service.client: üèÉ View run no_model at: http://127.0.0.1:5000/#/experiments/1/runs/0b756fd3fd1c434690337d0591650513.
2024/11/14 13:18:34 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [None]:
from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler

In [None]:
X_train_sklearn = X_train.copy()

In [None]:
pf = PolynomialFeatures(degree=2)

In [None]:
X_train_sklearn

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
1741,946,1,1.400391,0,9,0,26,0.099976,186,6,10,273,891,1637,7,3,6,1,0,1
232,1715,0,1.000000,1,4,1,31,0.500000,83,8,17,638,1615,625,17,6,13,1,0,0
1675,1630,1,2.800781,1,0,1,32,0.899902,80,6,1,712,1726,1751,12,10,20,1,0,1
470,752,0,0.500000,1,1,0,48,0.700195,87,7,13,164,728,3860,7,5,16,0,0,0
1915,936,0,1.000000,0,1,0,18,0.199951,153,3,18,1330,1686,2391,12,6,3,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654,666,1,0.500000,1,7,1,54,0.399902,81,3,13,58,1353,1254,15,9,9,1,1,1
426,1190,1,2.199219,1,9,0,47,0.300049,186,6,10,1417,1441,624,9,3,19,1,1,1
690,1403,0,2.699219,0,2,1,26,0.099976,164,5,3,461,1251,3371,13,9,9,1,0,1
736,503,0,2.500000,0,3,0,57,0.600098,185,6,11,778,1291,305,11,8,16,0,0,1


In [None]:
pf.fit_transform(X_train_sklearn[['m_dep','battery_power']])

array([[1.00000000e+00, 9.99755859e-02, 9.46000000e+02, 9.99511778e-03,
        9.45769043e+01, 8.94916000e+05],
       [1.00000000e+00, 5.00000000e-01, 1.71500000e+03, 2.50000000e-01,
        8.57500000e+02, 2.94122500e+06],
       [1.00000000e+00, 8.99902344e-01, 1.63000000e+03, 8.09824228e-01,
        1.46684082e+03, 2.65690000e+06],
       ...,
       [1.00000000e+00, 9.99755859e-02, 1.40300000e+03, 9.99511778e-03,
        1.40265747e+02, 1.96840900e+06],
       [1.00000000e+00, 6.00097656e-01, 5.03000000e+02, 3.60117197e-01,
        3.01849121e+02, 2.53009000e+05],
       [1.00000000e+00, 1.99951172e-01, 7.70000000e+02, 3.99804711e-02,
        1.53962402e+02, 5.92900000e+05]])

In [None]:
sp = SplineTransformer(n_knots=3, degree=3)

In [None]:
sp.fit_transform(X_train_sklearn[['px_height']])

array([[0.06690626, 0.60688358, 0.32320198, 0.00300818, 0.        ],
       [0.00772773, 0.38764177, 0.56078677, 0.04384373, 0.        ],
       [0.00375873, 0.33656692, 0.59811978, 0.06155457, 0.        ],
       ...,
       [0.02664969, 0.50540044, 0.45201833, 0.01593154, 0.        ],
       [0.00163569, 0.2917294 , 0.6257347 , 0.08090021, 0.        ],
       [0.01031397, 0.41172227, 0.5411553 , 0.03680847, 0.        ]])

In [None]:
qt = QuantileTransformer()

In [None]:
qt.fit_transform(X_train_sklearn[['px_height']])

array([[0.22872873],
       [0.54554555],
       [0.60047141],
       ...,
       [0.4034034 ],
       [0.64314314],
       [0.51588829]])

In [None]:
pf = PolynomialFeatures(degree=2)
qt = QuantileTransformer()
sp = SplineTransformer(n_knots=3, degree=3)

In [None]:
pf_pipeline = Pipeline(steps=[
    ('poly', pf),
    ('scale', StandardScaler())
])

In [None]:
preprocessor_sklearn = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –¥–ª—è —á–∏—Å–ª–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
        ('cat', l_encoder, cat_features), # –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –¥–ª—è –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
        ('quantile', qt,num_features),
        ('poly', pf_pipeline, ['m_dep', 'battery_power']), # –í –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –¥–æ–±–∞–≤–ª—è–µ–º —Å–æ–∑–¥–∞–Ω–Ω—ã–π —Ä–∞–Ω–µ–µ pipeline
        ('spline', sp, ['px_height'])
    ],
    remainder='drop',
    ) # –£–¥–∞–ª—è–µ–º —Å—Ç–æ–ª–±—Ü—ã, –∫–æ—Ç–æ—Ä—ã–µ –Ω–µ –∑–∞—Ç—Ä–æ–Ω—É—Ç—ã –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è

In [None]:
X_train_sklearn[['m_dep', 'battery_power']] = X_train_sklearn[['m_dep', 'battery_power']].astype('float64')
X_train_sklearn[['m_dep', 'battery_power']] = X_train_sklearn[['m_dep', 'battery_power']].astype('float64')

In [None]:
X_train_sklearn

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
1741,946.0,1,1.400391,0,9,0,26,0.099976,186,6,10,273,891,1637,7,3,6,1,0,1
232,1715.0,0,1.000000,1,4,1,31,0.500000,83,8,17,638,1615,625,17,6,13,1,0,0
1675,1630.0,1,2.800781,1,0,1,32,0.899902,80,6,1,712,1726,1751,12,10,20,1,0,1
470,752.0,0,0.500000,1,1,0,48,0.700195,87,7,13,164,728,3860,7,5,16,0,0,0
1915,936.0,0,1.000000,0,1,0,18,0.199951,153,3,18,1330,1686,2391,12,6,3,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654,666.0,1,0.500000,1,7,1,54,0.399902,81,3,13,58,1353,1254,15,9,9,1,1,1
426,1190.0,1,2.199219,1,9,0,47,0.300049,186,6,10,1417,1441,624,9,3,19,1,1,1
690,1403.0,0,2.699219,0,2,1,26,0.099976,164,5,3,461,1251,3371,13,9,9,1,0,1
736,503.0,0,2.500000,0,3,0,57,0.600098,185,6,11,778,1291,305,11,8,16,0,0,1


In [None]:
X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn, y_train)


In [None]:
X_train_sklearn = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())

In [None]:
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display (X_train_sklearn)

Unnamed: 0,num__battery_power,num__clock_speed,num__fc,num__int_memory,num__m_dep,num__mobile_wt,num__n_cores,num__pc,num__px_height,num__px_width,num__ram,num__sc_h,num__sc_w,num__talk_time,cat__blue_0,cat__blue_1,cat__blue_2,cat__blue_3,cat__dual_sim_0,cat__dual_sim_1,cat__dual_sim_2,cat__dual_sim_3,cat__four_g_0,cat__four_g_1,cat__four_g_2,cat__four_g_3,cat__three_g_0,cat__three_g_1,cat__three_g_2,cat__three_g_3,cat__touch_screen_0,cat__touch_screen_1,cat__touch_screen_2,cat__touch_screen_3,cat__wifi_0,cat__wifi_1,cat__wifi_2,cat__wifi_3,quantile__battery_power,quantile__clock_speed,quantile__fc,quantile__int_memory,quantile__m_dep,quantile__mobile_wt,quantile__n_cores,quantile__pc,quantile__px_height,quantile__px_width,quantile__ram,quantile__sc_h,quantile__sc_w,quantile__talk_time,poly__1,poly__m_dep,poly__battery_power,poly__m_dep^2,poly__m_dep battery_power,poly__battery_power^2,spline__px_height_sp_0,spline__px_height_sp_1,spline__px_height_sp_2,spline__px_height_sp_3,spline__px_height_sp_4
0,-0.656113,-0.138190,1.093728,-0.372868,-1.375675,1.246637,0.620719,0.046466,-0.870069,-0.861281,-0.454690,-1.458894,-1.201724,-0.912773,0.250583,0.229707,0.252922,0.266786,0.270543,0.223552,0.270569,0.235332,0.261478,0.225201,0.278410,0.234906,0.245861,0.232279,0.265456,0.256404,0.257598,0.231854,0.269305,0.241242,0.275804,0.236774,0.239131,0.248284,0.318318,0.462462,0.839840,0.389890,0.000000,0.868869,0.674174,0.526026,0.228729,0.258258,0.362454,0.094595,0.000000,0.234735,0.0,-1.375675,-0.656113,-1.052531,-1.184632,-0.742832,0.066906,0.606884,0.323202,0.003008,0.0
1,1.107839,-0.631921,-0.062934,-0.099164,0.000693,-1.615754,1.496885,1.218069,-0.051504,0.806629,-1.381706,1.020950,-0.392308,0.359910,0.238231,0.238193,0.269646,0.253928,0.240502,0.242836,0.261905,0.254755,0.247757,0.243223,0.243284,0.265733,0.245810,0.232065,0.270217,0.251908,0.247606,0.233176,0.271611,0.247606,0.251186,0.225140,0.281942,0.241729,0.824324,0.359359,0.580080,0.473974,0.507007,0.031031,1.000000,0.856857,0.545546,0.724725,0.110595,0.804805,0.432432,0.600601,0.0,0.000693,1.107839,-0.273520,0.539330,1.126755,0.007728,0.387642,0.560787,0.043844,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,-1.672278,1.217764,-0.294267,1.324097,0.345101,1.218847,0.620719,0.213838,0.262466,0.060216,-1.674833,-0.466957,0.147303,0.905345,0.243638,0.241283,0.269128,0.245950,0.241957,0.237320,0.271869,0.248852,0.244569,0.227619,0.275997,0.251813,0.228674,0.257028,0.257172,0.257111,0.229364,0.236402,0.278922,0.255308,0.253009,0.252965,0.241015,0.253006,0.002002,0.847347,0.507007,0.880380,0.601101,0.855856,0.674174,0.579580,0.643143,0.516016,0.011191,0.340841,0.607608,0.756757,0.0,0.345101,-1.672278,0.083899,-0.716263,-1.329303,0.001636,0.291729,0.625735,0.080900,0.0
1076,-1.059827,-0.015360,0.399731,1.543060,-1.031688,1.524539,-0.255447,-0.120906,-0.129996,0.502535,1.227130,-1.458894,-1.201724,-1.276396,0.259934,0.231856,0.241264,0.266942,0.242302,0.232783,0.258915,0.265998,0.245837,0.241034,0.267291,0.245837,0.251144,0.278314,0.210249,0.260234,0.253452,0.241918,0.258074,0.246556,0.242173,0.237661,0.257857,0.262307,0.187732,0.498999,0.700200,0.943944,0.215716,0.948448,0.408909,0.464965,0.515888,0.648854,0.850203,0.094595,0.000000,0.123624,0.0,-1.031688,-1.059827,-0.955204,-1.050439,-1.018765,0.010314,0.411722,0.541155,0.036808,0.0


In [None]:
pipeline_sklearn = Pipeline(steps=[
    ('transform', preprocessor_sklearn),
    ('model', classifier)
])

model_sklearn = pipeline_sklearn.fit(X_train, y_train)

In [None]:
model_sklearn


In [None]:
predictions = model_sklearn.predict(X_test) 
metrics = {}
metrics["precision"] = precision_score(y_test, predictions, average='weighted')   
metrics["recall"] = recall_score(y_test, predictions, average='weighted')
#metrics["roc_auc"] = roc_auc_score(y_test, predictions, average='ovo')
metrics["f1"] = f1_score(y_test, predictions, average='weighted')

metrics

{'precision': np.float64(0.8625300564572075),
 'recall': np.float64(0.8555555555555555),
 'f1': np.float64(0.8563639081171636)}

In [None]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
RUN_NAME = 'fe_sklearn'

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # –ø–æ–ª—É—á–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä –∑–∞–ø—É—Å–∫–∞ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(model_sklearn, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_params(model_sklearn.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - numpy (current: 2.1.3, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 26.74it/s]
 - numpy (current: 2.1.3, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/11/14 13:18:39 INFO mlflow.tracking._tracking_service.client: üèÉ View run fe_sklearn at: http://127.0.0.1:5000/#/experiments/1/runs/c8c88b2b848e47879631b1984b548db5.
2024/11/14 13:18:39 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [None]:
import numpy as np

In [None]:
import sys
sys.modules.keys()



In [None]:
import construct
print(construct.__version__)

2.10.70


In [None]:
import numpy as np
np.bool = np.bool_

In [None]:
from autofeat import AutoFeatRegressor
transformations = ["1/", "exp", "log", "abs", "sqrt", "^2", "^3", "1+", "1-", "sin", "cos", "exp-", "2^"]

AttributeError: module 'numba.core.types' has no attribute 'bool'

In [None]:
afreg = AutoFeatRegressor(verbose=1, feateng_steps=2, max_gb=8, transformations=["log", "sqrt"],feateng_cols=num_features)
X_train_arf = afreg.fit_transform(X_train,y_train)
X_train_arf

2024-11-14 11:03:49,099 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 406 features.
2024-11-14 11:03:49,101 INFO: [AutoFeat] With 1077 data points this new feature matrix would use about 0.00 gb of space.
2024-11-14 11:03:49,105 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             14 features transformed

2024-11-14 11:03:54,626 INFO: [feateng] Generated 26 transformed features from 14 original features - done.
2024-11-14 11:03:54,649 INFO: [feateng] Step 2: first combination of features


[feateng]             700/            780 feature tuples combined

2024-11-14 11:03:57,164 INFO: [feateng] Generated 780 feature combinations from 780 original feature tuples - done.
2024-11-14 11:03:57,183 INFO: [feateng] Generated altogether 808 new features in 2 steps
2024-11-14 11:03:57,188 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2024-11-14 11:03:57,340 INFO: [feateng] Generated a total of 559 additional features


[featsel] Scaling data...

2024-11-14 11:04:10,480 INFO: [featsel] Feature selection run 1/5


done.


2024-11-14 11:04:32,418 INFO: [featsel] Feature selection run 2/5
2024-11-14 11:04:47,961 INFO: [featsel] Feature selection run 3/5
2024-11-14 11:05:03,344 INFO: [featsel] Feature selection run 4/5
2024-11-14 11:05:14,687 INFO: [featsel] Feature selection run 5/5
2024-11-14 11:05:32,942 INFO: [featsel] 156 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2024-11-14 11:05:33,167 INFO: [featsel] 97 features after correlation filtering
2024-11-14 11:05:35,009 INFO: [featsel] 10 features after noise filtering
2024-11-14 11:05:35,013 INFO: [AutoFeat] Computing 7 new features.


[AutoFeat]     6/    7 new features

2024-11-14 11:05:39,793 INFO: [AutoFeat]     7/    7 new features ...done.
2024-11-14 11:05:39,799 INFO: [AutoFeat] Final dataframe with 27 feature columns (7 new).
2024-11-14 11:05:39,803 INFO: [AutoFeat] Training final regression model.
2024-11-14 11:05:39,868 INFO: [AutoFeat] Trained model: largest coefficients:
2024-11-14 11:05:39,878 INFO: -1.1259730337830902
2024-11-14 11:05:39,892 INFO: -0.000893 * mobile_wt
2024-11-14 11:05:39,900 INFO: 0.000788 * ram
2024-11-14 11:05:39,911 INFO: 0.000215 * px_height
2024-11-14 11:05:39,927 INFO: 0.000146 * sqrt(px_width)*sqrt(ram)
2024-11-14 11:05:39,941 INFO: 0.000028 * px_height*log(n_cores)
2024-11-14 11:05:39,958 INFO: [AutoFeat] Final score: 0.9161


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,three_g,touch_screen,wifi,battery_power*sqrt(ram),sqrt(px_width)*sqrt(ram),sqrt(battery_power)*log(px_height),battery_power*sqrt(px_width),ram*sc_h,ram*log(int_memory),px_height*log(n_cores)
0,946.0,1.0,1.400391,0.0,9.0,0.0,26.0,0.099976,186.0,6.0,...,1.0,0.0,1.0,38275.024389,1207.711472,172.531158,28237.743465,11459.0,5333.504033,489.150335
1,1715.0,0.0,1.000000,1.0,4.0,1.0,31.0,0.500000,83.0,8.0,...,1.0,0.0,0.0,42875.000000,1004.676565,267.456312,68920.812350,10625.0,2146.242003,1326.683704
2,1630.0,1.0,2.800781,1.0,0.0,1.0,32.0,0.899902,80.0,6.0,...,1.0,0.0,1.0,68207.271606,1738.455061,265.174707,67718.604534,21012.0,6068.503566,1275.732742
3,752.0,0.0,0.500000,1.0,1.0,0.0,48.0,0.700195,87.0,7.0,...,0.0,0.0,0.0,46720.931498,1676.329323,139.851691,20290.069295,27020.0,14942.835902,319.129264
4,936.0,0.0,1.000000,0.0,1.0,0.0,18.0,0.199951,153.0,3.0,...,1.0,1.0,1.0,45768.390140,2007.791324,220.061472,38433.030794,28692.0,6910.878873,1461.154344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072,666.0,1.0,0.500000,1.0,7.0,1.0,54.0,0.399902,81.0,3.0,...,1.0,1.0,1.0,23584.300371,1302.559787,104.787755,24497.576778,18810.0,5002.185994,63.719513
1073,1190.0,1.0,2.199219,1.0,9.0,0.0,47.0,0.300049,186.0,6.0,...,1.0,1.0,1.0,29726.190472,948.253131,250.315962,45173.001893,5616.0,2402.492103,2538.923168
1074,1403.0,0.0,2.699219,0.0,2.0,1.0,26.0,0.099976,164.0,5.0,...,1.0,0.0,1.0,81458.619796,2053.563001,229.736492,49623.378150,43823.0,10983.043430,741.950878
1075,503.0,0.0,2.500000,0.0,3.0,0.0,57.0,0.600098,185.0,6.0,...,0.0,0.0,1.0,8784.517346,627.499004,149.294809,18073.035689,3355.0,1233.130637,1393.988867


In [None]:
class AutoFeatWrapper():
    def __init__(self, feateng_cols, feateng_steps=1, max_gb=16, transformations=["1/", "exp", "log"], n_jobs=-1, verbose=1):
        self.feateng_cols = feateng_cols
        self.feateng_steps = feateng_steps
        self.max_gb = max_gb
        self.transformations = transformations
        self.n_jobs = n_jobs
        self.afreg = AutoFeatRegressor(feateng_cols=self.feateng_cols,
                                     feateng_steps=self.feateng_steps,
                                     max_gb=self.max_gb,
                                     transformations=self.transformations,
                                     n_jobs=self.n_jobs)
        
    def fit(self, X, y=None):
        self.afreg.fit(X, y)
        return self
    
    def transform(self, X):
        return self.afreg.transform(X)
    
    def get_feature_names_out(self, input_features=None):
        # –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º –¥–∞–Ω–Ω—ã–µ –∏ –≤–æ–∑–≤—Ä–∞—â–∞–µ–º –∏–º–µ–Ω–∞ —Ñ–∏—á–µ–π –∏–∑ DataFrame
        transformed_X = self.afreg.transform(pd.DataFrame(np.zeros((1, len(self.feateng_cols))), columns=self.feateng_cols))
        return transformed_X.columns.tolist()

In [None]:
afreg_pipeline = Pipeline(steps=[
    ('autofeat', AutoFeatWrapper( feateng_steps=2, max_gb=16, transformations=["log", "sqrt"],feateng_cols=num_features)),
    ('scaler', StandardScaler()),
])

In [None]:
preprocessor_afr = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –¥–ª—è —á–∏—Å–ª–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
        ('cat', l_encoder, cat_features), # –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –¥–ª—è –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
        ('afr', afreg_pipeline, num_features), # –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è autofeat
    ],
    remainder='drop', # –£–¥–∞–ª—è–µ–º —Å—Ç–æ–ª–±—Ü—ã, –∫–æ—Ç–æ—Ä—ã–µ –Ω–µ –∑–∞—Ç—Ä–æ–Ω—É—Ç—ã –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è–º–∏
    ) 

In [None]:
X_train_afr_raw =  preprocessor_afr.fit_transform(X_train,y_train)
X_train_afr = pd.DataFrame(X_train_afr_raw, columns=preprocessor_afr.get_feature_names_out())

  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:


In [None]:
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display (X_train_afr)

    

Unnamed: 0,num__battery_power,num__clock_speed,num__fc,num__int_memory,num__m_dep,num__mobile_wt,num__n_cores,num__pc,num__px_height,num__px_width,num__ram,num__sc_h,num__sc_w,num__talk_time,cat__blue_0,cat__blue_1,cat__blue_2,cat__blue_3,cat__dual_sim_0,cat__dual_sim_1,cat__dual_sim_2,cat__dual_sim_3,cat__four_g_0,cat__four_g_1,cat__four_g_2,cat__four_g_3,cat__three_g_0,cat__three_g_1,cat__three_g_2,cat__three_g_3,cat__touch_screen_0,cat__touch_screen_1,cat__touch_screen_2,cat__touch_screen_3,cat__wifi_0,cat__wifi_1,cat__wifi_2,cat__wifi_3,afr__battery_power,afr__clock_speed,afr__fc,afr__int_memory,afr__m_dep,afr__mobile_wt,afr__n_cores,afr__pc,afr__px_height,afr__px_width,afr__ram,afr__sc_h,afr__sc_w,afr__talk_time,afr__battery_power*sqrt(ram),afr__sqrt(px_width)*sqrt(ram),afr__log(battery_power)*log(px_width),afr__sqrt(battery_power)*log(px_height),afr__sqrt(clock_speed)*mobile_wt
0,-0.656113,-0.138190,1.093728,-0.372868,-1.375675,1.246637,0.620719,0.046466,-0.870069,-0.861281,-0.454690,-1.458894,-1.201724,-0.912773,0.262884,0.237083,0.246517,0.253514,0.254666,0.233646,0.261686,0.250001,0.254755,0.238086,0.273784,0.233372,0.249620,0.231245,0.274103,0.245031,0.243139,0.231657,0.275201,0.250001,0.241557,0.246022,0.252845,0.259573,-0.656113,-0.138190,1.093728,-0.372868,-1.375675,1.246637,0.620719,0.046466,-0.870069,-0.861281,-0.454690,-1.458894,-1.201724,-0.912773,-0.632932,-0.636390,-0.860205,-0.815446,0.798702
1,1.107839,-0.631921,-0.062934,-0.099164,0.000693,-1.615754,1.496885,1.218069,-0.051504,0.806629,-1.381706,1.020950,-0.392308,0.359910,0.244913,0.242611,0.258512,0.253963,0.239008,0.243597,0.245976,0.271414,0.245589,0.241135,0.252234,0.261041,0.248103,0.234399,0.266356,0.251142,0.249412,0.239892,0.263654,0.247042,0.232983,0.232944,0.279956,0.254112,1.107839,-0.631921,-0.062934,-0.099164,0.000693,-1.615754,1.496885,1.218069,-0.051504,0.806629,-1.381706,1.020950,-0.392308,0.359910,-0.454647,-1.015053,1.337005,1.064001,-1.255439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,-1.672278,1.217764,-0.294267,1.324097,0.345101,1.218847,0.620719,0.213838,0.262466,0.060216,-1.674833,-0.466957,0.147303,0.905345,0.250586,0.236527,0.278652,0.234232,0.251141,0.228322,0.280781,0.239753,0.237196,0.234716,0.293321,0.234759,0.252374,0.247549,0.242950,0.257118,0.248216,0.233889,0.260151,0.257743,0.268307,0.236232,0.247741,0.247716,-1.672278,1.217764,-0.294267,1.324097,0.345101,1.218847,0.620719,0.213838,0.262466,0.060216,-1.674833,-0.466957,0.147303,0.905345,-1.775917,-1.718494,-1.374774,-1.275508,1.883414
1076,-1.059827,-0.015360,0.399731,1.543060,-1.031688,1.524539,-0.255447,-0.120906,-0.129996,0.502535,1.227130,-1.458894,-1.201724,-1.276396,0.251749,0.230775,0.249447,0.268026,0.250606,0.224118,0.257842,0.267431,0.263006,0.227502,0.270124,0.239365,0.252425,0.261986,0.233141,0.252425,0.250600,0.236270,0.269672,0.243457,0.249439,0.251657,0.233621,0.265275,-1.059827,-0.015360,0.399731,1.543060,-1.031688,1.524539,-0.255447,-0.120906,-0.129996,0.502535,1.227130,-1.458894,-1.201724,-1.276396,-0.357646,1.343789,-0.344511,-0.714176,1.097457


In [None]:
pipeline_afr = Pipeline(steps=[('preprocessor', preprocessor_afr), 
                               ('model', classifier)])

pipeline_afr.fit(X_train, y_train)

  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:


In [None]:
X_test

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
1791,1203,1,0.500000,1,0,1,11,0.899902,109,2,12,35,510,1672,17,13,19,1,1,0
192,1490,1,0.500000,1,4,1,64,0.300049,150,8,8,1417,1464,3600,17,9,7,1,1,1
1515,595,0,2.599609,0,0,1,20,0.399902,122,2,18,623,816,1593,19,11,2,1,0,1
1776,1072,1,2.199219,1,6,0,19,0.399902,114,8,16,380,877,2715,15,9,16,0,1,0
781,1368,0,3.000000,1,10,0,50,0.199951,97,7,18,698,829,2496,13,6,5,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1137,1,1.000000,0,18,0,7,1.000000,196,3,19,942,1179,3616,13,5,12,1,1,1
984,1261,1,0.500000,1,0,1,11,0.199951,90,4,0,858,1591,348,14,9,14,1,0,1
517,1059,0,2.400391,0,8,0,61,0.099976,134,3,18,1613,1916,3716,15,13,11,1,1,0
1567,940,0,2.900391,0,0,0,16,0.700195,115,4,1,499,1090,2192,15,13,9,1,1,1


In [None]:
predictions = pipeline_afr.predict(X_test) 
metrics = {}
metrics["precision"] = precision_score(y_test, predictions, average='weighted')   
metrics["recall"] = recall_score(y_test, predictions, average='weighted')
#metrics["roc_auc"] = roc_auc_score(y_test, predictions, average='ovo')
metrics["f1"] = f1_score(y_test, predictions, average='weighted')

metrics

{'precision': np.float64(0.8778334654956693),
 'recall': np.float64(0.875),
 'f1': np.float64(0.8754912894398643)}

In [None]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name='autofeat', experiment_id=experiment_id) as run:
    # –ø–æ–ª—É—á–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä –∑–∞–ø—É—Å–∫–∞ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline_afr, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_params(pipeline_afr.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - numpy (current: 2.0.0, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 20.22it/s]
 - numpy (current: 2.0.0, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/11/14 11:20:18 INFO mlflow.tracking._tracking_service.client: üèÉ View run autofeat at: http://127.0.0.1:5000/#/experiments/1/runs/fb35cf1e60a14e3b9de67ef3fb74af97.
2024/11/14 11:20:18 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [None]:
from sklearn.feature_selection import RFE
X_train_afr

Unnamed: 0,num__battery_power,num__clock_speed,num__fc,num__int_memory,num__m_dep,num__mobile_wt,num__n_cores,num__pc,num__px_height,num__px_width,...,afr__px_width,afr__ram,afr__sc_h,afr__sc_w,afr__talk_time,afr__battery_power*sqrt(ram),afr__sqrt(px_width)*sqrt(ram),afr__log(battery_power)*log(px_width),afr__sqrt(battery_power)*log(px_height),afr__sqrt(clock_speed)*mobile_wt
0,-0.656113,-0.138190,1.093728,-0.372868,-1.375675,1.246637,0.620719,0.046466,-0.870069,-0.861281,...,-0.861281,-0.454690,-1.458894,-1.201724,-0.912773,-0.632932,-0.636390,-0.860205,-0.815446,0.798702
1,1.107839,-0.631921,-0.062934,-0.099164,0.000693,-1.615754,1.496885,1.218069,-0.051504,0.806629,...,0.806629,-1.381706,1.020950,-0.392308,0.359910,-0.454647,-1.015053,1.337005,1.064001,-1.255439
2,0.912864,1.588664,-0.988265,-0.044423,1.376642,-1.699124,0.620719,-1.459880,0.114452,1.062344,...,1.062344,-0.350263,-0.218972,0.686914,1.632592,0.527174,0.353453,1.367132,1.018827,-0.493099
3,-1.101115,-1.248483,-0.756932,0.831430,0.689508,-1.504593,1.058802,0.548582,-1.114517,-1.236791,...,-1.236791,1.581631,-1.458894,-0.662113,0.905345,-0.305587,0.237588,-1.611700,-1.462475,-1.577274
4,-0.679051,-0.631921,-0.756932,-0.810794,-1.031688,0.329560,-0.693530,1.385441,1.500406,0.970194,...,0.970194,0.235992,-0.218972,-0.392308,-1.458208,-0.342506,0.855769,0.252962,0.125619,-0.206711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072,-1.298384,-1.248483,0.631063,1.159875,-0.343714,-1.671334,-0.693530,0.548582,-1.352238,0.203048,...,0.203048,-0.805527,0.524981,0.417109,-0.367337,-1.202311,-0.459497,-0.774076,-2.156714,-1.640836
1073,-0.096420,0.846863,1.093728,0.776689,-0.687281,1.246637,0.620719,0.046466,1.695517,0.405777,...,0.405777,-1.382622,-0.962926,-1.201724,1.450780,-0.964265,-1.120283,0.427268,0.724635,1.633560
1074,0.392165,1.463425,-0.525600,-0.372868,-1.375675,0.635252,0.182636,-1.125136,-0.448452,-0.031934,...,-0.031934,1.133695,0.029012,0.417109,-0.367337,1.040767,0.941134,0.472143,0.317177,1.537780
1075,-1.672278,1.217764,-0.294267,1.324097,0.345101,1.218847,0.620719,0.213838,0.262466,0.060216,...,0.060216,-1.674833,-0.466957,0.147303,0.905345,-1.775917,-1.718494,-1.374774,-1.275508,1.883414


In [None]:
rfe_selector = RFE(estimator=classifier, n_features_to_select=12, step = 0.2) #drop 20% of features each iteration
X_train_rfe = rfe_selector.fit_transform(X_train_afr,y_train)

In [None]:
X_train_afr_rfe = pd.DataFrame(X_train_rfe, columns=rfe_selector.get_feature_names_out())
X_train_afr_rfe

Unnamed: 0,num__battery_power,num__px_height,num__px_width,num__ram,afr__battery_power,afr__px_height,afr__px_width,afr__ram,afr__battery_power*sqrt(ram),afr__sqrt(px_width)*sqrt(ram),afr__log(battery_power)*log(px_width),afr__sqrt(battery_power)*log(px_height)
0,-0.656113,-0.870069,-0.861281,-0.454690,-0.656113,-0.870069,-0.861281,-0.454690,-0.632932,-0.636390,-0.860205,-0.815446
1,1.107839,-0.051504,0.806629,-1.381706,1.107839,-0.051504,0.806629,-1.381706,-0.454647,-1.015053,1.337005,1.064001
2,0.912864,0.114452,1.062344,-0.350263,0.912864,0.114452,1.062344,-0.350263,0.527174,0.353453,1.367132,1.018827
3,-1.101115,-1.114517,-1.236791,1.581631,-1.101115,-1.114517,-1.236791,1.581631,-0.305587,0.237588,-1.611700,-1.462475
4,-0.679051,1.500406,0.970194,0.235992,-0.679051,1.500406,0.970194,0.235992,-0.342506,0.855769,0.252962,0.125619
...,...,...,...,...,...,...,...,...,...,...,...,...
1072,-1.298384,-1.352238,0.203048,-0.805527,-1.298384,-1.352238,0.203048,-0.805527,-1.202311,-0.459497,-0.774076,-2.156714
1073,-0.096420,1.695517,0.405777,-1.382622,-0.096420,1.695517,0.405777,-1.382622,-0.964265,-1.120283,0.427268,0.724635
1074,0.392165,-0.448452,-0.031934,1.133695,0.392165,-0.448452,-0.031934,1.133695,1.040767,0.941134,0.472143,0.317177
1075,-1.672278,0.262466,0.060216,-1.674833,-1.672278,0.262466,0.060216,-1.674833,-1.775917,-1.718494,-1.374774,-1.275508


In [None]:
rfe_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_afr), 
    ('rfe_extractor', RFE(estimator=classifier, n_features_to_select=12, step = 0.2)),
    ('model', classifier)
])

rfe_pipeline.fit(X_train, y_train)

  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:


In [None]:
predictions_rfe = rfe_pipeline.predict(X_test)

metrics = {}
metrics["precision"] = precision_score(y_test, predictions, average='weighted')   
metrics["recall"] = recall_score(y_test, predictions, average='weighted')
#metrics["roc_auc"] = roc_auc_score(y_test, predictions, average='ovo')
metrics["f1"] = f1_score(y_test, predictions, average='weighted')

metrics

{'precision': np.float64(0.8778334654956693),
 'recall': np.float64(0.875),
 'f1': np.float64(0.8754912894398643)}

In [None]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
RUN_NAME = 'rfe_feature_selection'

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # –ø–æ–ª—É—á–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä –∑–∞–ø—É—Å–∫–∞ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(rfe_pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_params(model_sklearn.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - numpy (current: 2.0.0, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 19.38it/s]
 - numpy (current: 2.0.0, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/11/14 11:25:24 INFO mlflow.tracking._tracking_service.client: üèÉ View run rfe_feature_selection at: http://127.0.0.1:5000/#/experiments/1/runs/8721788e29394f7e86546a3df3fe008d.
2024/11/14 11:25:24 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [None]:
rfe_skl_selector = RFE(estimator=classifier, n_features_to_select=12, step = 0.2) #drop 20% of features each iteration
X_train_skl_rfe = rfe_skl_selector.fit_transform(X_train_sklearn,y_train)

In [None]:
X_train_skl_rfe = pd.DataFrame(X_train_skl_rfe, columns=rfe_skl_selector.get_feature_names_out())
X_train_skl_rfe

Unnamed: 0,num__battery_power,num__px_height,num__px_width,num__ram,quantile__battery_power,quantile__px_height,quantile__px_width,quantile__ram,poly__battery_power,poly__battery_power^2,spline__px_height_sp_1,spline__px_height_sp_3
0,-0.656113,-0.870069,-0.861281,-0.454690,0.318318,0.228729,0.258258,0.362454,-0.656113,-0.742832,0.606884,0.003008
1,1.107839,-0.051504,0.806629,-1.381706,0.824324,0.545546,0.724725,0.110595,1.107839,1.126755,0.387642,0.043844
2,0.912864,0.114452,1.062344,-0.350263,0.767710,0.600471,0.806306,0.388571,0.912864,0.866985,0.336567,0.061555
3,-1.101115,-1.114517,-1.236791,1.581631,0.178592,0.120818,0.146146,0.959039,-1.101115,-1.043795,0.646040,0.000555
4,-0.679051,1.500406,0.970194,0.235992,0.315315,0.907177,0.778278,0.572491,-0.679051,-0.760026,0.044057,0.386952
...,...,...,...,...,...,...,...,...,...,...,...,...
1072,-1.298384,-1.352238,0.203048,-0.805527,0.119989,0.025025,0.552553,0.269517,-1.298384,-1.155211,0.665145,0.000010
1073,-0.096420,1.695517,0.405777,-1.382622,0.484484,0.931629,0.622623,0.109109,-0.096420,-0.266658,0.027969,0.446332
1074,0.392165,-0.448452,-0.031934,1.133695,0.602603,0.403403,0.489936,0.822732,0.392165,0.237953,0.505400,0.015932
1075,-1.672278,0.262466,0.060216,-1.674833,0.002002,0.643143,0.516016,0.011191,-1.672278,-1.329303,0.291729,0.080900


In [None]:
rfe_cols = X_train_skl_rfe.columns.tolist()
rfe_cols

['num__battery_power',
 'num__px_height',
 'num__px_width',
 'num__ram',
 'quantile__battery_power',
 'quantile__px_height',
 'quantile__px_width',
 'quantile__ram',
 'poly__battery_power',
 'poly__battery_power^2',
 'spline__px_height_sp_1',
 'spline__px_height_sp_3']

In [None]:
rfe_idx = rfe_skl_selector.support_
rfe_idx

array([ True, False, False, False, False, False, False, False,  True,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False,  True,  True,  True, False, False, False, False, False,
        True, False, False,  True, False,  True, False,  True, False])

In [None]:
with open('rfe_skl_idx.txt', 'w+') as f:
    f.write(str(rfe_idx))
with open('rfe_skl_cols.txt', 'w+') as f:
    f.write(str(rfe_cols))

In [None]:
class ColumnExtractor(object):

    def __init__(self, cols):
        self.cols = cols

    def transform(self, X):
        return X[:,self.cols]
    
    def fit(self, X, y=None):
        return self

In [None]:
rfe_skl_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_sklearn), 
    ('rfe_extractor', ColumnExtractor(rfe_idx)),
    ('model', classifier)
])

rfe_skl_pipeline.fit(X_train, y_train)

In [None]:
predictions_rfe_skl = rfe_skl_pipeline.predict(X_test)

metrics = {}
metrics["precision"] = precision_score(y_test, predictions, average='weighted')   
metrics["recall"] = recall_score(y_test, predictions, average='weighted')
#metrics["roc_auc"] = roc_auc_score(y_test, predictions, average='ovo')
metrics["f1"] = f1_score(y_test, predictions, average='weighted')

metrics
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
RUN_NAME = 'rfe_skl_feature_selection'

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # –ø–æ–ª—É—á–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä –∑–∞–ø—É—Å–∫–∞ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(rfe_pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact('rfe_skl_cols.txt')
    mlflow.log_artifact('rfe_skl_idx.txt')
    mlflow.log_params(model_sklearn.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - numpy (current: 2.0.0, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 18.55it/s]
 - numpy (current: 2.0.0, required: numpy==2.1.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/11/14 11:36:04 INFO mlflow.tracking._tracking_service.client: üèÉ View run rfe_skl_feature_selection at: http://127.0.0.1:5000/#/experiments/1/runs/adf7ba80d53543e2ae0dfd74273eac5e.
2024/11/14 11:36:04 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector 

In [None]:
import sklearn

In [None]:
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'd2_absolute_error_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'neg_root_mean_squared_log_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall

In [None]:
import numpy

In [None]:
sfs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=3), 
                                k_features=3,
                                forward=True,
                                floating=False, # True to drop selected features
                                scoring='f1_weighted',
                                cv=2)

sfs.fit(X_train_sklearn,y_train)



AttributeError: `np.NINF` was removed in the NumPy 2.0 release. Use `-np.inf` instead.