https://machinelearningmastery.com/feature-extraction-on-tabular-data/

In [1]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import pandas as pd

pd.set_option( 'display.max_columns' , None )

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
def load_dataset() :
    
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/wine.csv'
    
    df = pd.read_csv( url , header = None )
    
    dataset = df.values
    
    X = dataset[ : , : -1 ]
    
    y = dataset[ : , -1 ]
    
    y = LabelEncoder().fit_transform( y.astype( 'str' ) )
    
    return X , y

In [5]:
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.model_selection import cross_val_score

In [6]:
def evaluate_model( X , y , model ) :
    
    cv = RepeatedStratifiedKFold( n_splits = 10 , n_repeats = 3 , random_state = 999 )
    
    scores = cross_val_score( model , X , y , scoring = 'accuracy' , cv = cv , n_jobs = -1 )
    
    return scores

# Baseline Model Performance

In [7]:
X , y = load_dataset()

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
model = LogisticRegression( solver = 'liblinear' )

In [10]:
scores = evaluate_model( X , y , model )

In [11]:
from numpy import mean

from numpy import std

In [12]:
print( f'ACCURACY : {mean( scores ):.3f} ( {std( scores ):.3f} )' )

ACCURACY : 0.951 ( 0.043 )


# Feature Extraction Approach to Data Preparation

In [22]:
X , y = load_dataset()

model = LogisticRegression( solver = 'liblinear' )

In [23]:
from sklearn.preprocessing import MinMaxScaler

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import RobustScaler

from sklearn.preprocessing import QuantileTransformer

from sklearn.preprocessing import KBinsDiscretizer

from sklearn.decomposition import PCA

from sklearn.decomposition import TruncatedSVD

In [24]:
transforms = list()


transforms.append( ('mms', MinMaxScaler() ) )

transforms.append( ( 'ss' , StandardScaler() ) )

transforms.append( ( 'rs' , RobustScaler() ) )

transforms.append( ( 'qt', QuantileTransformer( n_quantiles = 100 , output_distribution = 'normal' ) ) )

transforms.append( ( 'kbd' , KBinsDiscretizer( n_bins = 10 , encode = 'ordinal' , strategy = 'uniform' ) ) )

transforms.append( ( 'pca' , PCA( n_components = 7 ) ) )

transforms.append( ( 'svd' , TruncatedSVD( n_components = 7 ) ) )

In [25]:
from sklearn.pipeline import FeatureUnion

In [26]:
feature_union = FeatureUnion( transforms )

In [27]:
from sklearn.pipeline import Pipeline

In [28]:
steps = list()

steps.append( ( 'fu' , feature_union ) )

steps.append( ( 'm' , model ) )

pipeline = Pipeline( steps = steps )

In [29]:
scores = evaluate_model( X , y , model )

In [30]:
print( f'ACCURACY : {mean( scores ):.3f} ( {std( scores ):.3f} )')

0.951 ( 0.043 )
