In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lazytransform as lt
from autofeat import AutoFeatClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [56]:
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None)
df.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]
X = df.iloc[0:1000, :-1]
X["WorkClass"] = LabelEncoder().fit_transform(X["WorkClass"])
X["Education"] = LabelEncoder().fit_transform(X["Education"])
X["MaritalStatus"] = LabelEncoder().fit_transform(X["MaritalStatus"])
X["Occupation"] = LabelEncoder().fit_transform(X["Occupation"])
X["Relationship"] = LabelEncoder().fit_transform(X["Relationship"])
X["Race"] = LabelEncoder().fit_transform(X["Race"])
X["Gender"] = LabelEncoder().fit_transform(X["Gender"])
X["NativeCountry"] = LabelEncoder().fit_transform(X["NativeCountry"])
X.astype(float)
X.drop(["CapitalLoss"], axis=1, inplace=True)
y = (df.iloc[0:1000, -1] == " >50K").astype(int)

seed = 42
np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)


In [57]:
print("Before transformation")
X_train

Before transformation


Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,HoursPerWeek,NativeCountry
29,49,3,193366,11,9,2,3,0,4,1,0,40,28
535,39,2,207853,2,8,2,13,0,4,1,0,50,28
695,25,0,202480,7,12,4,0,2,4,1,0,45,28
557,31,3,323069,11,9,5,1,4,4,0,0,20,0
836,25,3,499233,11,9,0,1,1,4,1,0,40,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,17,0,304873,0,6,4,0,3,4,0,34095,32,28
270,61,3,69867,11,9,2,4,0,4,1,0,40,28
860,18,3,216284,1,7,4,1,3,4,0,0,20,28
435,33,3,208405,12,14,2,10,0,4,1,0,50,28


In [58]:
# LazyTransformer
lazy = lt.LazyTransformer(model=None, encoders="auto", scalers="std", verbose=2)
X_train, y_train = lazy.fit_transform(X_train, y_train)
X_test = lazy.transform(X_test)

    Single_Label Binary_Classification problem 
Shape of dataset: (800, 13). Now we classify variables into different types...
    Returning dictionary for variable types with following keys:
                        continuous_vars = 0, int_vars = 13, 
                        discrete_string_vars = 0, nlp_vars = 0,
                        date_vars = 0, time_deltas = 0,
                        categorical_vars = 0, date_zones = 0
    no date time variables detected in this dataset
    Beware! onehot encoding can create hundreds if not 1000s of variables...
label encoder selected for transforming all categorical variables
Using OneHotEncoder() and My_LabelEncoder() as encoders
Caution: ### When you have categorical or date-time vars in data, scaling may not be helpful. ##
Check the pipeline creation statement for errors (if any):
	make_column_transformer((imp, intvars),(imp, floatvars),    remainder=remainder)
    no other vars left in dataset to transform...
Time taken to define data p

In [59]:
print("After transformation")
X_train

After transformation


Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,HoursPerWeek,NativeCountry
29,3,2,1,2,3,1,0,0,4,2,0,3,4
535,2,1,1,0,3,1,3,0,4,2,0,4,4
695,1,0,1,1,4,2,0,1,4,2,0,3,4
557,2,2,3,2,3,3,0,2,4,0,0,1,0
836,1,2,4,2,3,0,0,0,4,2,0,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1,0,2,0,2,2,0,1,4,0,12,2,4
270,4,2,0,2,3,1,0,0,4,2,0,3,4
860,1,2,2,0,2,2,0,1,4,0,0,1,4
435,2,2,1,3,5,1,2,0,4,2,0,4,4


In [60]:
model = AutoFeatClassifier()
X_train_with_feature_creation = model.fit_transform(X_train, y_train)
X_train_with_feature_creation.head()

KeyboardInterrupt: 

In [None]:
X_test_with_feature_creation = model.transform(X_test)

In [None]:
model_1 = LogisticRegression().fit(X_train, y_train)
model_2 = LogisticRegression().fit(X_train_with_feature_creation, y_train)

In [None]:
print(accuracy_score(y_test, model_1.predict(X_test)))
print(accuracy_score(y_test, model_2.predict(X_test_with_feature_creation)))

0.95
0.95
