In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from autofeat import FeatureSelector, AutoFeatRegressor, AutoFeatClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes, fetch_california_housing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [60]:
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None)
df.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]
X = df.iloc[:, :-1]
X["WorkClass"] = LabelEncoder().fit_transform(X["WorkClass"])
X["Education"] = LabelEncoder().fit_transform(X["Education"])
X["MaritalStatus"] = LabelEncoder().fit_transform(X["MaritalStatus"])
X["Occupation"] = LabelEncoder().fit_transform(X["Occupation"])
X["Relationship"] = LabelEncoder().fit_transform(X["Relationship"])
X["Race"] = LabelEncoder().fit_transform(X["Race"])
X["Gender"] = LabelEncoder().fit_transform(X["Gender"])
X["NativeCountry"] = LabelEncoder().fit_transform(X["NativeCountry"])
X.astype(float)
X.drop(["CapitalLoss"], axis=1, inplace=True)
y = (df.iloc[:, -1] == " >50K").astype(int)

seed = 42
np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

In [61]:
print("Before transformation")
X_train

Before transformation


Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,HoursPerWeek,NativeCountry
5514,33,2,198183,9,13,4,10,1,4,0,0,50,39
19777,36,4,86459,8,11,2,4,0,4,1,0,50,39
10781,58,6,203039,6,5,5,3,1,4,1,0,40,39
32240,21,4,180190,8,11,2,5,0,4,1,0,46,39
9876,27,4,279872,15,10,0,8,1,4,1,0,40,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,47,4,359461,9,13,2,3,0,4,1,0,40,39
5390,31,4,147215,2,8,0,8,4,4,0,0,21,39
860,18,4,216284,1,7,4,1,3,4,0,0,20,39
15795,50,6,54261,11,9,2,5,0,4,1,0,84,39


In [62]:
ssScaler = StandardScaler()
X_train = ssScaler.fit_transform(X_train)
X_test = ssScaler.transform(X_test)

In [63]:
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
print("After transformation")
X_train

After transformation


Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,HoursPerWeek,NativeCountry
0,-0.408756,-1.286589,0.080051,-0.332671,1.133702,0.923471,0.810488,-0.275318,0.395813,-1.418827,-0.145715,0.779460,0.292818
1,-0.188857,0.088475,-0.981653,-0.590985,0.357049,-0.407828,-0.608779,-0.898166,0.395813,0.704807,-0.145715,0.779460,0.292818
2,1.423734,1.463540,0.126197,-1.107613,-1.972910,1.589120,-0.845324,-0.275318,0.395813,0.704807,-0.145715,-0.031510,0.292818
3,-1.288351,0.088475,-0.090935,-0.590985,0.357049,-0.407828,-0.372234,-0.898166,0.395813,0.704807,-0.145715,0.455072,0.292818
4,-0.848554,0.088475,0.856334,1.217214,-0.031277,-1.739126,0.337399,-0.275318,0.395813,0.704807,-0.145715,-0.031510,0.292818
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0.617438,0.088475,1.612662,-0.332671,1.133702,-0.407828,-0.845324,-0.898166,0.395813,0.704807,-0.145715,-0.031510,0.292818
26044,-0.555355,0.088475,-0.404294,-2.140869,-0.807930,-1.739126,0.337399,1.593226,0.395813,-1.418827,-0.145715,-1.572355,0.292818
26045,-1.508250,0.088475,0.252063,-2.399183,-1.196257,0.923471,-1.318413,0.970378,0.395813,-1.418827,-0.145715,-1.653452,0.292818
26046,0.837337,1.463540,-1.287628,0.183958,-0.419604,-0.407828,-0.372234,-0.898166,0.395813,0.704807,-0.145715,3.536760,0.292818


In [64]:
model = AutoFeatClassifier(feateng_steps=1)
X_train_with_feature_creation = model.fit_transform(X_train, y_train)
X_train_with_feature_creation.head()

KeyboardInterrupt: 

In [None]:
X_test_with_feature_creation = model.transform(X_test)

In [None]:
model_1 = LogisticRegression().fit(X_train, y_train)
model_2 = LogisticRegression().fit(X_train_with_feature_creation, y_train)

In [None]:
print(accuracy_score(y_test, model_1.predict(X_test)), end="")
accuracy_score(y_test, model_2.predict(X_test_with_feature_creation))

0.7950.8
