In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from autofeat import FeatureSelector, AutoFeatRegressor, AutoFeatClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes, fetch_california_housing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [40]:
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None)
df.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]
X = df.iloc[0:1000, :-1]
X["WorkClass"] = LabelEncoder().fit_transform(X["WorkClass"])
X["Education"] = LabelEncoder().fit_transform(X["Education"])
X["MaritalStatus"] = LabelEncoder().fit_transform(X["MaritalStatus"])
X["Occupation"] = LabelEncoder().fit_transform(X["Occupation"])
X["Relationship"] = LabelEncoder().fit_transform(X["Relationship"])
X["Race"] = LabelEncoder().fit_transform(X["Race"])
X["Gender"] = LabelEncoder().fit_transform(X["Gender"])
X["NativeCountry"] = LabelEncoder().fit_transform(X["NativeCountry"])
X.astype(float)
X.drop(["CapitalLoss"], axis=1, inplace=True)
y = (df.iloc[0:1000, -1] == " >50K").astype(int)

seed = 42
np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

In [41]:
print("Before transformation")
X_train

Before transformation


Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,HoursPerWeek,NativeCountry
29,49,3,193366,11,9,2,3,0,4,1,0,40,28
535,39,2,207853,2,8,2,13,0,4,1,0,50,28
695,25,0,202480,7,12,4,0,2,4,1,0,45,28
557,31,3,323069,11,9,5,1,4,4,0,0,20,0
836,25,3,499233,11,9,0,1,1,4,1,0,40,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,17,0,304873,0,6,4,0,3,4,0,34095,32,28
270,61,3,69867,11,9,2,4,0,4,1,0,40,28
860,18,3,216284,1,7,4,1,3,4,0,0,20,28
435,33,3,208405,12,14,2,10,0,4,1,0,50,28


In [42]:
ssScaler = StandardScaler()
X_train = ssScaler.fit_transform(X_train)
X_test = ssScaler.transform(X_test)

In [43]:
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
print("After transformation")
X_train

After transformation


Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,HoursPerWeek,NativeCountry
0,0.818420,-0.005252,0.021487,0.190805,-0.408627,-0.413888,-0.865199,-0.941302,0.417511,0.717741,-0.223783,0.020831,0.296806
1,0.066887,-0.845555,0.157161,-2.141616,-0.793670,-0.413888,1.481104,-0.941302,0.417511,0.717741,-0.223783,0.871095,0.296806
2,-0.985261,-2.526162,0.106842,-0.845827,0.746502,0.892267,-1.569090,0.270350,0.417511,0.717741,-0.223783,0.445963,0.296806
3,-0.534340,-0.005252,1.236183,0.190805,-0.408627,1.545344,-1.334460,1.482001,0.417511,-1.393261,-0.223783,-1.679696,-4.675863
4,-0.985261,-0.005252,2.885996,0.190805,-0.408627,-1.720042,-1.334460,-0.335476,0.417511,0.717741,-0.223783,0.020831,0.296806
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,-1.586488,-2.526162,1.065774,-2.659932,-1.563755,0.892267,-1.569090,0.876176,0.417511,-1.393261,12.498325,-0.659379,0.296806
796,1.720261,-0.005252,-1.135107,0.190805,-0.408627,-0.413888,-0.630569,-0.941302,0.417511,0.717741,-0.223783,0.020831,0.296806
797,-1.511334,-0.005252,0.236119,-2.400774,-1.178712,0.892267,-1.334460,0.876176,0.417511,-1.393261,-0.223783,-1.679696,0.296806
798,-0.384034,-0.005252,0.162331,0.449963,1.516588,-0.413888,0.777213,-0.941302,0.417511,0.717741,-0.223783,0.871095,0.296806


In [44]:
model = AutoFeatClassifier(feateng_steps=1)
X_train_with_feature_creation = model.fit_transform(X_train, y_train)
X_train_with_feature_creation.head()

  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:


Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,HoursPerWeek,NativeCountry,Age**2,1/MaritalStatus,1/Education,Relationship**3
0,0.81842,-0.005252,0.021487,0.190805,-0.408627,-0.413888,-0.865199,-0.941302,0.417511,0.717741,-0.223783,0.020831,0.296806,0.669812,-2.416114,5.240952,-0.83404
1,0.066887,-0.845555,0.157161,-2.141616,-0.79367,-0.413888,1.481104,-0.941302,0.417511,0.717741,-0.223783,0.871095,0.296806,0.004474,-2.416114,-0.466937,-0.83404
2,-0.985261,-2.526162,0.106842,-0.845827,0.746502,0.892267,-1.56909,0.27035,0.417511,0.717741,-0.223783,0.445963,0.296806,0.970739,1.120741,-1.182275,0.01976
3,-0.53434,-0.005252,1.236183,0.190805,-0.408627,1.545344,-1.33446,1.482001,0.417511,-1.393261,-0.223783,-1.679696,-4.675863,0.28552,0.647105,5.240952,3.254961
4,-0.985261,-0.005252,2.885996,0.190805,-0.408627,-1.720042,-1.33446,-0.335476,0.417511,0.717741,-0.223783,0.020831,0.296806,0.970739,-0.581381,5.240952,-0.037756


In [45]:
X_test_with_feature_creation = model.transform(X_test)

In [46]:
model_1 = LogisticRegression().fit(X_train, y_train)
model_2 = LogisticRegression().fit(X_train_with_feature_creation, y_train)

In [47]:
print(accuracy_score(y_test, model_1.predict(X_test)), end="")
accuracy_score(y_test, model_2.predict(X_test_with_feature_creation))

0.7950.8
