In [1]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from typing import List, Optional
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/subashgandyer/datasets/main/great_customers.csv')

In [3]:
df['workclass'].unique()

array(['private', 'self_employed', 'government', nan], dtype=object)

In [4]:
df['occupation'].unique()

array(['sales', 'clerical', 'professional', 'farm', 'craft', 'factory',
       'tech', nan, 'service', 'executive', 'trucker', 'cleaner',
       'lawenf', 'estate_agent', 'soldier'], dtype=object)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13599 entries, 0 to 13598
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 13599 non-null  int64  
 1   age                     13178 non-null  float64
 2   workclass               13056 non-null  object 
 3   salary                  13177 non-null  float64
 4   education_rank          13599 non-null  int64  
 5   marital-status          13599 non-null  object 
 6   occupation              13056 non-null  object 
 7   race                    13599 non-null  object 
 8   sex                     13599 non-null  object 
 9   mins_beerdrinking_year  13175 non-null  float64
 10  mins_exercising_year    13178 non-null  float64
 11  works_hours             13599 non-null  int64  
 12  tea_per_year            11170 non-null  float64
 13  coffee_per_year         11188 non-null  float64
 14  great_customer_class    13599 non-null

In [6]:
df.isna().sum()

user_id                      0
age                        421
workclass                  543
salary                     422
education_rank               0
marital-status               0
occupation                 543
race                         0
sex                          0
mins_beerdrinking_year     424
mins_exercising_year       421
works_hours                  0
tea_per_year              2429
coffee_per_year           2411
great_customer_class         0
dtype: int64

In [7]:
X = df.iloc[:,:-1]
y = pd.DataFrame(df.iloc[:,-1])

X.drop('user_id', inplace=True, axis=1)

In [8]:
mean_imputer = SimpleImputer(strategy='mean')
workclass_imputer = SimpleImputer(strategy='constant', fill_value='no_workclass')
occupation_imputer = SimpleImputer(strategy='constant', fill_value='no_occupation')
iterative_imputer = IterativeImputer(max_iter=10, random_state=42)

In [9]:
X['age'] = mean_imputer.fit_transform(X['age'].to_numpy().reshape(-1, 1))

In [10]:
X['workclass'] = workclass_imputer.fit_transform(X['workclass'].to_numpy().reshape(-1, 1))

In [11]:
X['occupation'] = occupation_imputer.fit_transform(X['occupation'].to_numpy().reshape(-1, 1))

In [12]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13599 entries, 0 to 13598
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     13599 non-null  float64
 1   workclass               13599 non-null  object 
 2   salary                  13177 non-null  float64
 3   education_rank          13599 non-null  int64  
 4   marital-status          13599 non-null  object 
 5   occupation              13599 non-null  object 
 6   race                    13599 non-null  object 
 7   sex                     13599 non-null  object 
 8   mins_beerdrinking_year  13175 non-null  float64
 9   mins_exercising_year    13178 non-null  float64
 10  works_hours             13599 non-null  int64  
 11  tea_per_year            11170 non-null  float64
 12  coffee_per_year         11188 non-null  float64
dtypes: float64(6), int64(2), object(5)
memory usage: 1.3+ MB


In [13]:
categorical_features = ['workclass', 'marital-status', 'occupation', 'race', 'sex']
for feature in categorical_features:
    X = pd.concat([X, pd.get_dummies(X[feature])], axis=1)
    X.drop(feature, inplace=True, axis=1)

In [14]:
X = pd.DataFrame(iterative_imputer.fit_transform(X), columns=X.columns)

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13599 entries, 0 to 13598
Data columns (total 35 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     13599 non-null  float64
 1   salary                  13599 non-null  float64
 2   education_rank          13599 non-null  float64
 3   mins_beerdrinking_year  13599 non-null  float64
 4   mins_exercising_year    13599 non-null  float64
 5   works_hours             13599 non-null  float64
 6   tea_per_year            13599 non-null  float64
 7   coffee_per_year         13599 non-null  float64
 8   government              13599 non-null  float64
 9   no_workclass            13599 non-null  float64
 10  private                 13599 non-null  float64
 11  self_employed           13599 non-null  float64
 12  Divorced                13599 non-null  float64
 13  Married                 13599 non-null  float64
 14  Never-married           13599 non-null

In [16]:
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [21]:
class ClassificationFeatureSelector:
    __methods: List[str] = ['pearson', 'mutual_info', 'rfe', 'lin-reg', 'rf', 'lgbm']
    __n_jobs: int

    feature_names: List[str]
    feature_support_: pd.DataFrame
    sorted_features_: List[str]

    def __init__(self,
                 methods='__all__',
                 n_jobs: Optional[int] = None):
        self.__n_jobs = n_jobs
        if methods != '__all__' \
                and isinstance(methods, List) \
                and all(isinstance(m, str) for m in methods):
            self.__methods = methods

    def __cor_selector(self,
                       X: pd.DataFrame,
                       y: pd.DataFrame,
                       number_of_features: int) -> List[bool]:
        feature_names = X.columns.to_list()
        coefficients = [np.corrcoef(pd.DataFrame(X[name], columns=[name]), y)[0, 1] for name in feature_names]
        coefficients = [0 if np.isnan(coef) else coef for coef in coefficients]
        feature_indexes = np.argsort(np.abs(coefficients))[-number_of_features:]
        support = [index in feature_indexes for index, name in enumerate(feature_names)]
        return support

    def __chi2_selector(self,
                               X: pd.DataFrame,
                               y: pd.DataFrame,
                               number_of_features: int) -> List[bool]:
        selector = SelectKBest(score_func=chi2,
                               k=number_of_features)
        selector = selector.fit(X, y)
        return selector.get_support()

    def __rfe_selector(self,
                       X: np.ndarray,
                       y: pd.DataFrame,
                       number_of_features: int):
        model = LogisticRegression()
        selector = RFE(estimator=model,
                       n_features_to_select=number_of_features,
                       step=1,
                       verbose=5)
        selector = selector.fit(X, y)
        return selector.get_support()

    def __embedded_log_reg_selector(self,
                                    X: np.ndarray,
                                    y: pd.DataFrame,
                                    number_of_features: int):
        model = LogisticRegression(n_jobs=self.__n_jobs)
        selector = SelectFromModel(model,
                                   max_features=number_of_features)
        selector = selector.fit(X, y)
        return selector.get_support()

    def __embedded_rf_selector(self,
                               X: np.ndarray,
                               y: pd.DataFrame,
                               number_of_features: int):
        model = RandomForestClassifier(n_estimators=50,
                                       n_jobs=self.__n_jobs,
                                       random_state=42,
                                       max_features=number_of_features)
        selector = SelectFromModel(model,
                                   max_features=number_of_features)
        embedded_selector = selector.fit(X, y)
        return embedded_selector.get_support()

    def __embedded_lgbm_selector(self,
                                 X: np.ndarray,
                                 y: pd.DataFrame,
                                 number_of_features: int):
        model = LGBMClassifier(n_estimators=500,
                               learning_rate=0.05,
                               num_leaves=32,
                               colsample_bytree=0.2,
                               reg_alpha=3,
                               reg_lambda=1,
                               min_split_gain=0.01,
                               min_child_weight=40,
                               n_jobs=self.__n_jobs,
                               random_state=42)
        selector = SelectFromModel(model,
                                   max_features=number_of_features)
        selector = selector.fit(X, y)
        return selector.get_support()

    def sort_features(self,
                      X: pd.DataFrame,
                      y: pd.DataFrame,
                      number_of_features: int):
        feature_names = X.columns.to_list()
        methods_support = {'Feature': feature_names}

        for method in self.__methods:
            print(f'Calculating {method}')
            if method == 'pearson' or self.__methods == '__all__':
                methods_support[method] = self.__cor_selector(X, y, number_of_features)
            if method == 'mutual_info' or self.__methods == '__all__':
                methods_support[method] = self.__chi2_selector(X, y, number_of_features)
            if method == 'rfe' or self.__methods == '__all__':
                methods_support[method] = self.__rfe_selector(X.to_numpy(), y, number_of_features)
            if method == 'lin-reg' or self.__methods == '__all__':
                methods_support[method] = self.__embedded_log_reg_selector(X.to_numpy(), y, number_of_features)
            if method == 'rf' or self.__methods == '__all__':
                methods_support[method] = self.__embedded_rf_selector(X.to_numpy(), y, number_of_features)
            if method == 'lgbm' or self.__methods == '__all__':
                methods_support[method] = self.__embedded_lgbm_selector(X.to_numpy(), y, number_of_features)

        pd.set_option('display.max_rows', None)

        feature_selection_df = pd.DataFrame(methods_support)
        feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
        feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'],
                                                                ascending=False)

        feature_selection_df.index = range(1, len(feature_selection_df)+1)
        self.feature_support_ = feature_selection_df
        self.sorted_features_ = feature_selection_df['Feature'].tolist()

In [22]:
selector = ClassificationFeatureSelector(n_jobs=-1)

In [23]:
selector.sort_features(X, y, number_of_features=10)

Calculating pearson
Calculating mutual_info
Calculating rfe
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitt

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number o

ValueError: All arrays must be of the same length

In [24]:
selector.sorted_features_

AttributeError: 'ClassificationFeatureSelector' object has no attribute 'sorted_features_'

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
rfc = RandomForestClassifier(n_estimators=100,
                             n_jobs=-1)

rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
rfc_accuracy = accuracy_score(y_test, rfc_pred)
print(f'RandomForestClassifier accuracy: {rfc_accuracy}')

  rfc.fit(X_train, y_train)


RandomForestClassifier accuracy: 0.9375


In [27]:
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f'SVC accuracy: {svm_accuracy}')

  return f(*args, **kwargs)


SVC accuracy: 0.9279411764705883


In [28]:
lr = LogisticRegression(random_state=42,
                        n_jobs=-1)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print(f'LogisticRegression accuracy: {lr_accuracy}')

  return f(*args, **kwargs)


LogisticRegression accuracy: 0.9286764705882353


In [29]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
print(f'GaussianNB accuracy: {nb_accuracy}')

GaussianNB accuracy: 0.6746323529411765


  return f(*args, **kwargs)


In [30]:
lgbm = LGBMClassifier(random_state=42,
                      n_jobs=-1)
lgbm.fit(X_train, y_train)
lgbm_pred = lgbm.predict(X_test)
lgbm_accuracy = accuracy_score(y_test, lgbm_pred)
print(f'LGBMClassifier accuracy: {lgbm_accuracy}')

LGBMClassifier accuracy: 0.9419117647058823


  return f(*args, **kwargs)


In [35]:
ys = pd.concat([pd.DataFrame(rfc_pred, columns=['rfc']),
                pd.DataFrame(svm_pred, columns=['svm']),
                pd.DataFrame(lr_pred, columns=['lr']),
                pd.DataFrame(nb_pred, columns=['nb']),
                pd.DataFrame(lr_pred, columns=['lr'])],
               axis=1)
ys.head()

Unnamed: 0,rfc,svm,lr,nb,lr.1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,1,0


In [36]:
ys['bagged'] = np.median(ys, axis=1)
ys.head()

Unnamed: 0,rfc,svm,lr,nb,lr.1,bagged
0,0,0,0,0,0,0.0
1,0,0,0,0,0,0.0
2,0,0,0,0,0,0.0
3,0,0,0,0,0,0.0
4,0,0,0,1,0,0.0


In [37]:
bagged_pred = ys['bagged']
bagged_accuracy = accuracy_score(y_test, bagged_pred)
print(f'Bagged accuracy: {bagged_accuracy}')

Bagged accuracy: 0.9290441176470589


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
