In [1]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.metrics import f1_score

sklearn.__version__

'0.23.1'

In [2]:
X,y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)

def clf_score(X,y):
    """Baseline model fit"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf = RandomForestClassifier()
    clf.fit(X_train,y_train)
    preds = clf.predict(X_test)
    f1 = f1_score(y_test, preds, average='weighted')
    
    return f1

print(f"f1 score:",clf_score(X,y))

f1 score: 0.9643539227816376


In [3]:
# Get a list of numerical columns
numerical_cols = X.select_dtypes([np.number]).columns

In [4]:
# Check if all columns are numerical
len(numerical_cols)/len(X.columns)*100.00

100.0

In [5]:
class generate_features:
    "A class to append automatically generated features to a dataframe."
    def __init__(self, df, col_list):
        self.df = df
        self.col_list = col_list
        self.generate_cols()
        
    def generate_cols(self):
        
        def square_root(x):
            return np.sqrt(x)
        
        def log10(x):
            return np.log10(x) if x else 0
        
        def inverse_transform(x):
            return -1 / x if x else 0
    
        for col in self.col_list:
            self.df[f"{col}_sqrt"]= self.df[col].apply(square_root)
            self.df[f"{col}_log10"]= self.df[col].apply(log10)
            self.df[f"{col}_inverse"]= self.df[col].apply(inverse_transform)

In [6]:
new_X = generate_features(X, numerical_cols)

In [7]:
X2 = new_X.df

In [8]:
X2

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst concavity_inverse,worst concave points_sqrt,worst concave points_log10,worst concave points_inverse,worst symmetry_sqrt,worst symmetry_log10,worst symmetry_inverse,worst fractal dimension_sqrt,worst fractal dimension_log10,worst fractal dimension_inverse
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,-1.404692,0.515170,-0.576099,-3.767898,0.678307,-0.337148,-2.173441,0.344819,-0.924818,-8.410429
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,-4.139073,0.431277,-0.730487,-5.376344,0.524404,-0.560667,-3.636364,0.298362,-1.050512,-11.233431
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,-2.220249,0.492950,-0.614394,-4.115226,0.601082,-0.442132,-2.767783,0.295939,-1.057595,-11.418132
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,-1.455816,0.507445,-0.589223,-3.883495,0.814739,-0.177963,-1.506478,0.415933,-0.761954,-5.780347
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,-2.500000,0.403113,-0.789147,-6.153846,0.486210,-0.626353,-4.230118,0.277092,-1.114752,-13.024225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,-2.434867,0.470744,-0.654430,-4.512635,0.453872,-0.686133,-4.854369,0.266740,-1.147825,-14.054814
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,-3.110420,0.403485,-0.788346,-6.142506,0.507149,-0.589729,-3.888025,0.257624,-1.178028,-15.067048
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,-2.938584,0.376563,-0.848324,-7.052186,0.470956,-0.654038,-4.508566,0.279643,-1.106793,-12.787724
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,-1.065303,0.514782,-0.576754,-3.773585,0.639296,-0.388595,-2.446782,0.352136,-0.906578,-8.064516


In [9]:
clf_score(X2,y)

0.9647766759894907

In [10]:
scores_without = []
scores_with = []
for i in range(100):
    score_without = clf_score(X,y)
    score_with = clf_score(X2,y)
    scores_without.append(score_without)
    scores_with.append(score_with)
    
print("100 tests: with vs without added features")
print(f"without:{np.mean(scores_without)}")
print(f"with:{np.mean(scores_with)}")

100 tests: with vs without added features
without:0.9588831828465681
with:0.9590909842991309
