# Feature Engineering and advanced preprocessing
author: Elvira Dzhuraeva

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour

import category_encoders as ce
%matplotlib inline

In [2]:
columns = ['age',
'workclass',
'fnlwgt',
'education',
'education-num',
'marital-status',
'occupation',
'relationship',
'race',
'sex',
'capital-gain',
'capital-loss',
'hours-per-week',
'native-country',
'salary-class']

data = pd.read_csv('adult.csv',skipinitialspace=True, names=columns)
print(data.shape)
data.head(30)

(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


### Явных пропущенных значений нет, но есть значения заполненные нулями или вопросами

In [3]:
def check_missing(data, output_path=None):    
    result = pd.concat([(data[columns] == '?').sum(), (data[columns] == '?').mean(), (data[columns] == 0).sum(), (data[columns] == 0).mean()], axis=1)
    result = result.rename(index=str, columns={0:'total missing', 1:'proportion', 2: 'total nulls', 3: 'proportion' })
    if output_path is not None:
        result.to_csv(output_path+'missing.csv')
        print(output_path, 'missing.csv')
    return result

In [4]:
print (data.shape)
check_missing(data)

(32561, 15)


  result = method(y)


Unnamed: 0,total missing,proportion,total nulls,proportion.1
age,0,0.0,0,0.0
workclass,1836,0.056386,0,0.0
fnlwgt,0,0.0,0,0.0
education,0,0.0,0,0.0
education-num,0,0.0,0,0.0
marital-status,0,0.0,0,0.0
occupation,1843,0.056601,0,0.0
relationship,0,0.0,0,0.0
race,0,0.0,0,0.0
sex,0,0.0,0,0.0


#### Удаление пропущенных значений 
#### Нули трогать не будем, так как они затрагивают 95% данных, дропнем скорее всего в разделе feature importance

In [5]:
def drop_missing(data,axis=0):
    data_copy = data.copy(deep=True)
    data_copy = data_copy[data_copy[columns] != '?']
    return data_copy

In [6]:
data_dropped = drop_missing(data)
print (data_dropped.shape)
check_missing(data_dropped)

(32561, 15)


Unnamed: 0,total missing,proportion,total nulls,proportion.1
age,0,0.0,0,0.0
workclass,0,0.0,0,0.0
fnlwgt,0,0.0,0,0.0
education,0,0.0,0,0.0
education-num,0,0.0,0,0.0
marital-status,0,0.0,0,0.0
occupation,0,0.0,0,0.0
relationship,0,0.0,0,0.0
race,0,0.0,0,0.0
sex,0,0.0,0,0.0


#### Добавление переменной оценки пропущенных значений

In [7]:
def add_var_denote_NA(data,NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if (data_copy[i] == '?').sum()>0:
            data_copy[i+'_is_NA'] = np.where(data_copy[i] == '?',1,0)
        else:
            warn("Нет пропущенных значений" % i)          
    return data_copy

In [8]:
data3 = add_var_denote_NA(data=data,NA_col=['native-country'])
print(data3['native-country_is_NA'].value_counts())
data3.head(15) # пропущенное значение пример под индексом 14

0    31978
1      583
Name: native-country_is_NA, dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class,native-country_is_NA
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,0
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,0
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,0
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K,0
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K,0


#### Заполнение пропусков выборочным значением

In [9]:
def impute_NA_with_arbitrary(data,impute_value,NA_col=[]):  
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if (data_copy[i] == 0).sum()>0:
            data_copy[i+'_'+str(impute_value)] = data_copy[i].replace(0, impute_value)
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy

In [10]:
data4 = impute_NA_with_arbitrary(data=data,impute_value=-999,NA_col=['capital-gain'])
data4.head(15)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class,capital-gain_-999
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,2174
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,-999
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,-999
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,-999
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,-999
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,-999
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,-999
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,-999
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K,14084
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K,5178


#### Заполнение пропущенных значений средним/медианой/модой

In [11]:
def impute_NA_with_avg(data,strategy='mean',NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if (data_copy[i] == 0).sum()>0:
            if strategy=='mean':
                data_copy[i+'_impute_mean'] = data_copy[i].replace(0, data[i].mean())
            elif strategy=='median':
                data_copy[i+'_impute_median'] = data_copy[i].replace(0, data[i].median())
            elif strategy=='mode':
                data_copy[i+'_impute_mode'] = data_copy[i].replace(0, data[i].mode()[0])
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy  

In [12]:
print(data['capital-gain'].mean())
data5 = impute_NA_with_avg(data=data,strategy='mean',NA_col=['capital-gain'])
data5.head(15)

1077.6488437087312


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class,capital-gain_impute_mean
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,2174.0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1077.648844
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,1077.648844
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,1077.648844
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,1077.648844
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,1077.648844
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,1077.648844
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,1077.648844
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K,14084.0
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K,5178.0


#### Заполнение пропусков значением из "хвоста" распределения

In [13]:
def impute_NA_with_end_of_distribution(data,NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if (data_copy[i] == 0).sum()>0:
            data_copy[i+'_impute_end_of_distri'] = data_copy[i].replace(0, data[i].mean()+3*data[i].std())
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy     

In [14]:
data6 = impute_NA_with_end_of_distribution(data=data, NA_col=['capital-gain'])
data6.head(8)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class,capital-gain_impute_end_of_distri
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,2174.0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,23233.525098
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,23233.525098
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,23233.525098
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,23233.525098
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,23233.525098
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,23233.525098
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,23233.525098


#### Заполнение пропусков случайными значениями

In [15]:
def impute_NA_with_random(data,NA_col=[],random_state=0):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if (data_copy[i] == '?').sum()>0:
            data_copy[i+'_random'] = data_copy[i]
            random_sample = data_copy[i].dropna().sample((data_copy[i] == '?').sum(), random_state=random_state)
            random_sample.index = data_copy[data_copy[i] == '?'].index
            data_copy.loc[data_copy[i] == '?', str(i)+'_random'] = random_sample
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy 

In [16]:
data7 = impute_NA_with_random(data=data,NA_col=['native-country'])
data7.head(15)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class,native-country_random
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,Cuba
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,United-States
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,Jamaica
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,United-States
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K,United-States
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K,United-States


## Выбросы

Детекция с помощью выборочных значений

In [17]:
def outlier_detect_arbitrary(data, col,upper_fence, lower_fence):
    para = (upper_fence, lower_fence)
    tmp = pd.concat([data[col]>upper_fence, data[col]<lower_fence], axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:', outlier_index.value_counts()[1])
    print('Доля выбросов:', outlier_index.value_counts()[1]/len(outlier_index))    
    return outlier_index, para

In [18]:
# больше максимального кол-ва дней в неделю = 280 часов и меньше 1го дня в неделю < 8 часов
index,para = outlier_detect_arbitrary(data=data, col='hours-per-week', upper_fence=280, lower_fence=8)
print('Верхняя граница:', para[0], '\nНижняя граница:', para[1])

Количество выбросов в данных: 295
Доля выбросов: 0.009059918307177298
Верхняя граница: 280 
Нижняя граница: 8


In [19]:
data.loc[index, 'hours-per-week'].sort_values().head()

32525    1
5632     1
5766     1
5808     1
8447     1
Name: hours-per-week, dtype: int64

Интерквартильное расстояние

In [20]:
def outlier_detect_IQR(data,col,threshold=3):    
    IQR = data[col].quantile(0.75) - data[col].quantile(0.25)
    Lower_fence = data[col].quantile(0.25) - (IQR * threshold)
    Upper_fence = data[col].quantile(0.75) + (IQR * threshold)
    para = (Upper_fence, Lower_fence)
    tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [21]:
index,para = outlier_detect_IQR(data=data,col='hours-per-week',threshold=5)
print('Верхняя граница:',para[0],'\nНижняя граница:',para[1])

Количество выбросов в данных: 1475
Доля выбросов: 0.04529959153588649
Верхняя граница: 70.0 
Нижняя граница: 15.0


In [22]:
data.loc[index,'hours-per-week'].sort_values().head()

24284    1
22960    1
25078    1
5766     1
5590     1
Name: hours-per-week, dtype: int64

Среднее-среднеквадратичное отклонение

In [23]:
def outlier_detect_mean_std(data,col,threshold=3):
    Upper_fence = data[col].mean() + threshold * data[col].std()
    Lower_fence = data[col].mean() - threshold * data[col].std()   
    para = (Upper_fence, Lower_fence)   
    tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [24]:
index,para = outlier_detect_mean_std(data=data,col='hours-per-week',threshold=3)
print('Верхняя граница:',para[0],'\nНижняя граница:',para[1])

Количество выбросов в данных: 440
Доля выбросов: 0.013513098492061055
Верхняя граница: 77.47974189728542 
Нижняя граница: 3.3951698069005687


In [25]:
data.loc[index,'hours-per-week'].sort_values().head()

25078    1
5590     1
20909    1
19337    1
1262     1
Name: hours-per-week, dtype: int64

Медианы абсолютного отклонения (MAD)

In [26]:
def outlier_detect_MAD(data,col,threshold=3.5):
    median = data[col].median()
    median_absolute_deviation = np.median([np.abs(y - median) for y in data[col]])
    modified_z_scores = pd.Series([0.6745 * (y - median) / median_absolute_deviation for y in data[col]])
    outlier_index = np.abs(modified_z_scores) > threshold
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index

In [27]:
index = outlier_detect_MAD(data=data,col='hours-per-week',threshold=3.5)

Количество выбросов в данных: 6001
Доля выбросов: 0.1843002364792236


In [28]:
data.loc[index,'hours-per-week'].sort_values().head()

1262     1
1036     1
189      1
9147     1
19337    1
Name: hours-per-week, dtype: int64

Замена выброса выборочным значением

In [29]:
def impute_outlier_with_arbitrary(data,outlier_index,value,col=[]):
    data_copy = data.copy(deep=True)
    for i in col:
        data_copy.loc[outlier_index,i] = value
    return data_copy

In [30]:
data2 = impute_outlier_with_arbitrary(data=data,outlier_index=index,
                                         value=40,col=['hours-per-week'])
data2[25:35]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
25,56,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,>50K
26,19,Private,168294,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,40,South,>50K
28,39,Private,367260,HS-grad,9,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K
29,49,Private,193366,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
30,23,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52,United-States,<=50K
31,20,Private,266015,Some-college,10,Never-married,Sales,Own-child,Black,Male,0,0,44,United-States,<=50K
32,45,Private,386940,Bachelors,13,Divorced,Exec-managerial,Own-child,White,Male,0,1408,40,United-States,<=50K
33,30,Federal-gov,59951,Some-college,10,Married-civ-spouse,Adm-clerical,Own-child,White,Male,0,0,40,United-States,<=50K
34,22,State-gov,311512,Some-college,10,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,40,United-States,<=50K


Виндзоризация

In [31]:
def windsorization(data,col,para,strategy='both'):
    data_copy = data.copy(deep=True)  
    if strategy == 'both':
        data_copy.loc[data_copy[col]>para[0],col] = para[0]
        data_copy.loc[data_copy[col]<para[1],col] = para[1]
    elif strategy == 'top':
        data_copy.loc[data_copy[col]>para[0],col] = para[0]
    elif strategy == 'bottom':
        data_copy.loc[data_copy[col]<para[1],col] = para[1]  
    return data_copy

In [32]:
data3 = windsorization(data=data,col='hours-per-week',para=para,strategy='both')
data3[25:35]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
25,56,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40.0,United-States,>50K
26,19,Private,168294,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40.0,United-States,<=50K
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60.0,South,>50K
28,39,Private,367260,HS-grad,9,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,77.479742,United-States,<=50K
29,49,Private,193366,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40.0,United-States,<=50K
30,23,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52.0,United-States,<=50K
31,20,Private,266015,Some-college,10,Never-married,Sales,Own-child,Black,Male,0,0,44.0,United-States,<=50K
32,45,Private,386940,Bachelors,13,Divorced,Exec-managerial,Own-child,White,Male,0,1408,40.0,United-States,<=50K
33,30,Federal-gov,59951,Some-college,10,Married-civ-spouse,Adm-clerical,Own-child,White,Male,0,0,40.0,United-States,<=50K
34,22,State-gov,311512,Some-college,10,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,15.0,United-States,<=50K


Удаление выбросов

In [33]:
def drop_outlier(data,outlier_index):
    data_copy = data[~outlier_index]
    return data_copy

In [34]:
data4 = drop_outlier(data=data,outlier_index=index)
print(data4['hours-per-week'].max())
print(data4['hours-per-week'].min())

55
25


Замена выбросов средним/медианой/модой

In [35]:
def impute_outlier_with_avg(data,col,outlier_index,strategy='mean'):
    data_copy = data.copy(deep=True)
    if strategy=='mean':
        data_copy.loc[outlier_index,col] = data_copy[col].mean()
    elif strategy=='median':
        data_copy.loc[outlier_index,col] = data_copy[col].median()
    elif strategy=='mode':
        data_copy.loc[outlier_index,col] = data_copy[col].mode()[0]           
    return data_copy

In [36]:
data5 = impute_outlier_with_avg(data=data,col='hours-per-week',
                                   outlier_index=index,strategy='mean')
data5[25:35]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
25,56,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40.0,United-States,>50K
26,19,Private,168294,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40.0,United-States,<=50K
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,40.437456,South,>50K
28,39,Private,367260,HS-grad,9,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,40.437456,United-States,<=50K
29,49,Private,193366,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40.0,United-States,<=50K
30,23,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52.0,United-States,<=50K
31,20,Private,266015,Some-college,10,Never-married,Sales,Own-child,Black,Male,0,0,44.0,United-States,<=50K
32,45,Private,386940,Bachelors,13,Divorced,Exec-managerial,Own-child,White,Male,0,1408,40.0,United-States,<=50K
33,30,Federal-gov,59951,Some-college,10,Married-civ-spouse,Adm-clerical,Own-child,White,Male,0,0,40.0,United-States,<=50K
34,22,State-gov,311512,Some-college,10,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,40.437456,United-States,<=50K


##  Шкалирование данных

In [37]:
from sklearn.preprocessing import LabelEncoder
X_train, X_test, y_train, y_test = train_test_split(data, data['salary-class'], test_size=0.2, random_state=0)
labelencoder_Y = LabelEncoder()
y_train = labelencoder_Y.fit_transform(y_train)
y_test = labelencoder_Y.fit_transform(y_test)
X_train.shape, X_test.shape

((26048, 15), (6513, 15))

Нормализация данных

In [38]:
ss = StandardScaler().fit(X_train[['hours-per-week']])
X_train_copy = X_train.copy(deep=True)
X_train_copy['hours-per-week_zscore'] = ss.transform(X_train_copy[['hours-per-week']])
print(X_train_copy.head(6))

       age         workclass  fnlwgt     education  education-num  \
15282   36           Private  174308          11th              7   
24870   35           Private  198202       HS-grad              9   
18822   38           Private   52963     Bachelors             13   
26404   50           Private  138270       HS-grad              9   
7842    68  Self-emp-not-inc  116903     Assoc-voc             11   
4890    51  Self-emp-not-inc  149220  Some-college             10   

           marital-status        occupation   relationship   race     sex  \
15282            Divorced  Transport-moving  Not-in-family  White    Male   
24870       Never-married   Exec-managerial  Not-in-family  White  Female   
18822       Never-married      Adm-clerical  Not-in-family  White  Female   
26404  Married-civ-spouse             Sales           Wife  Black  Female   
7842   Married-civ-spouse    Prof-specialty        Husband  White    Male   
4890   Married-civ-spouse   Farming-fishing        Hus

In [39]:
print(X_train_copy['hours-per-week_zscore'].mean())
print(X_train_copy['hours-per-week_zscore'].std())

3.599657768147942e-16
1.0000191958843228


Мин-Макс шкалирование

In [40]:
mms = MinMaxScaler().fit(X_train[['hours-per-week']])
X_train_copy = X_train.copy(deep=True)
X_train_copy['hours-per-week_minmax'] = mms.transform(X_train_copy[['hours-per-week']])
print(X_train_copy.head(6))

       age         workclass  fnlwgt     education  education-num  \
15282   36           Private  174308          11th              7   
24870   35           Private  198202       HS-grad              9   
18822   38           Private   52963     Bachelors             13   
26404   50           Private  138270       HS-grad              9   
7842    68  Self-emp-not-inc  116903     Assoc-voc             11   
4890    51  Self-emp-not-inc  149220  Some-college             10   

           marital-status        occupation   relationship   race     sex  \
15282            Divorced  Transport-moving  Not-in-family  White    Male   
24870       Never-married   Exec-managerial  Not-in-family  White  Female   
18822       Never-married      Adm-clerical  Not-in-family  White  Female   
26404  Married-civ-spouse             Sales           Wife  Black  Female   
7842   Married-civ-spouse    Prof-specialty        Husband  White    Male   
4890   Married-civ-spouse   Farming-fishing        Hus

In [41]:
print(X_train_copy['hours-per-week_minmax'].max())
print(X_train_copy['hours-per-week_minmax'].min())

0.9999999999999999
0.0


Робустное шкалирование

In [42]:
rs = RobustScaler().fit(X_train[['hours-per-week']])
X_train_copy = X_train.copy(deep=True)
X_train_copy['hours-per-week_robust'] = rs.transform(X_train_copy[['hours-per-week']])
print(X_train_copy.head(6))

       age         workclass  fnlwgt     education  education-num  \
15282   36           Private  174308          11th              7   
24870   35           Private  198202       HS-grad              9   
18822   38           Private   52963     Bachelors             13   
26404   50           Private  138270       HS-grad              9   
7842    68  Self-emp-not-inc  116903     Assoc-voc             11   
4890    51  Self-emp-not-inc  149220  Some-college             10   

           marital-status        occupation   relationship   race     sex  \
15282            Divorced  Transport-moving  Not-in-family  White    Male   
24870       Never-married   Exec-managerial  Not-in-family  White  Female   
18822       Never-married      Adm-clerical  Not-in-family  White  Female   
26404  Married-civ-spouse             Sales           Wife  Black  Female   
7842   Married-civ-spouse    Prof-specialty        Husband  White    Male   
4890   Married-civ-spouse   Farming-fishing        Hus

## Кодирование переменных

One-Hot encoding

In [43]:
data1 = pd.get_dummies(data,drop_first=True)
data1.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,salary-class_>50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Кодирование значением

In [44]:
ord_enc = ce.OrdinalEncoder(cols=['race']).fit(X_train,y_train)

In [45]:
data4 = ord_enc.transform(data)
print(data4.head(5))

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship  race     sex  \
0       Never-married       Adm-clerical  Not-in-family     1    Male   
1  Married-civ-spouse    Exec-managerial        Husband     1    Male   
2            Divorced  Handlers-cleaners  Not-in-family     1    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband     2    Male   
4  Married-civ-spouse     Prof-specialty           Wife     2  Female   

   capital-gain  capital-loss  hours-per-week native-country salary-class  
0          2174             0              40  United-States        <=50K  
1             0             0       

WOE кодирование

In [46]:
woe_enc = ce.WOEEncoder(cols=['race']).fit(X_train,y_train)

In [47]:
data3 = woe_enc.transform(data)
data3.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,0.080237,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,0.080237,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,0.080237,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,-0.802247,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,-0.802247,Female,0,0,40,Cuba,<=50K


Целевое кодирование

In [48]:
target_enc = ce.TargetEncoder(cols=['race']).fit(X_train,y_train)

In [49]:
data2 = target_enc.transform(data)
data2.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,0.254738,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,0.254738,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,0.254738,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,0.123618,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,0.123618,Female,0,0,40,Cuba,<=50K


## Feature Selection

In [50]:
continuous = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
categorical = ['workclass', 'education', 'marital-status',  'occupation', 'relationship', 'sex', 'native-country', 'salary-class']

In [51]:
df = drop_missing(data)

df.head(15)

  result = method(y)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [52]:
Y = df['salary-class']
X = df.drop('salary-class', axis=1)

labelencoder = LabelEncoder()
Y = labelencoder.fit_transform(Y)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=42)
X_train.shape, X_test.shape

((22792, 14), (9769, 14))

In [53]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)
X_train.head()


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
19749,34,56460,9,0,2179,12,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1216,48,243631,10,7688,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
27962,23,56402,10,0,0,30,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
23077,56,255406,9,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10180,17,297246,7,0,0,9,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


#### Константные значения

In [54]:
def constant_feature_detect(data,threshold=0.98):    
    data_copy = data.copy(deep=True)
    quasi_constant_feature = []
    for feature in data_copy.columns:
        predominant = (data_copy[feature].value_counts() / np.float(
                      len(data_copy))).sort_values(ascending=False).values[0]
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature),'константные переменные')    
    return quasi_constant_feature

In [55]:
quasi_constant_feature = constant_feature_detect(data=X_train,threshold=0.9)
quasi_constant_feature

77 константные переменные


['capital-gain',
 'capital-loss',
 'workclass_Local-gov',
 'workclass_Never-worked',
 'workclass_Self-emp-inc',
 'workclass_Self-emp-not-inc',
 'workclass_State-gov',
 'workclass_Without-pay',
 'education_11th',
 'education_12th',
 'education_1st-4th',
 'education_5th-6th',
 'education_7th-8th',
 'education_9th',
 'education_Assoc-acdm',
 'education_Assoc-voc',
 'education_Doctorate',
 'education_Masters',
 'education_Preschool',
 'education_Prof-school',
 'marital-status_Married-AF-spouse',
 'marital-status_Married-spouse-absent',
 'marital-status_Separated',
 'marital-status_Widowed',
 'occupation_Armed-Forces',
 'occupation_Farming-fishing',
 'occupation_Handlers-cleaners',
 'occupation_Machine-op-inspct',
 'occupation_Other-service',
 'occupation_Priv-house-serv',
 'occupation_Protective-serv',
 'occupation_Tech-support',
 'occupation_Transport-moving',
 'relationship_Other-relative',
 'relationship_Wife',
 'race_Asian-Pac-Islander',
 'race_Black',
 'race_Other',
 'native-country_C

In [56]:
X_train.drop(labels=quasi_constant_feature,axis=1,inplace=True)
print(X_train.shape)
X_train.head()

(22792, 20)


Unnamed: 0,age,fnlwgt,education-num,hours-per-week,workclass_Private,education_Bachelors,education_HS-grad,education_Some-college,marital-status_Married-civ-spouse,marital-status_Never-married,occupation_Craft-repair,occupation_Exec-managerial,occupation_Prof-specialty,occupation_Sales,relationship_Not-in-family,relationship_Own-child,relationship_Unmarried,race_White,sex_Male,native-country_United-States
19749,34,56460,9,12,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1
1216,48,243631,10,40,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1
27962,23,56402,10,30,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1,1
23077,56,255406,9,40,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1
10180,17,297246,7,9,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1


#### Корреляционная фильтрация

In [57]:
def corr_feature_detect(data,threshold=0.8):
    
    corrmat = data.corr()
    corrmat = corrmat.abs().unstack() 
    corrmat = corrmat.sort_values(ascending=False)
    corrmat = corrmat[corrmat >= threshold]
    corrmat = corrmat[corrmat < 1] 
    corrmat = pd.DataFrame(corrmat).reset_index()
    corrmat.columns = ['feature1', 'feature2', 'corr']
   
    grouped_feature_ls = []
    correlated_groups = []
    
    for feature in corrmat.feature1.unique():
        if feature not in grouped_feature_ls:
    
            correlated_block = corrmat[corrmat.feature1 == feature]
            grouped_feature_ls = grouped_feature_ls + list(
                correlated_block.feature2.unique()) + [feature]
    
            correlated_groups.append(correlated_block)
    return correlated_groups

In [58]:
corr = corr_feature_detect(data=X_train,threshold=0.9)
for i in corr:
    print(i,'\n')

#### Нет высоко коррелирующих значений

In [59]:
X_train.corr()

Unnamed: 0,age,fnlwgt,education-num,hours-per-week,workclass_Private,education_Bachelors,education_HS-grad,education_Some-college,marital-status_Married-civ-spouse,marital-status_Never-married,occupation_Craft-repair,occupation_Exec-managerial,occupation_Prof-specialty,occupation_Sales,relationship_Not-in-family,relationship_Own-child,relationship_Unmarried,race_White,sex_Male,native-country_United-States
age,1.0,-0.076223,0.041348,0.069102,-0.200311,0.010879,0.019409,-0.110868,0.31964,-0.535578,0.010106,0.105868,0.052972,-0.03033,-0.00681,-0.43472,0.036951,0.0326,0.089279,0.01815
fnlwgt,-0.076223,1.0,-0.041946,-0.016227,0.04169,-0.002551,-0.006977,-0.00917,-0.027356,0.038151,0.005404,-0.014741,-0.013334,0.007487,0.008657,0.012416,0.005209,-0.055826,0.024053,-0.069799
education-num,0.041348,-0.041946,1.0,0.14767,-0.118127,0.502396,-0.291074,-0.018982,0.092946,-0.038773,-0.140391,0.208927,0.417513,0.026665,0.051077,-0.099482,-0.062405,0.050875,0.02095,0.108119
hours-per-week,0.069102,-0.016227,0.14767,1.0,-0.022191,0.080601,0.010716,-0.068117,0.219831,-0.197496,0.053295,0.140203,0.058263,0.007317,-0.001481,-0.248128,-0.036129,0.049681,0.231592,0.003415
workclass_Private,-0.200311,0.04169,-0.118127,-0.022191,1.0,-0.034527,0.072977,0.002906,-0.095043,0.111789,0.0676,-0.033679,-0.120304,0.077567,0.030493,0.064231,0.017989,0.002441,-0.037507,-0.043748
education_Bachelors,0.010879,-0.002551,0.502396,0.080601,-0.034527,1.0,-0.30628,-0.238948,0.054305,0.001072,-0.114976,0.185472,0.199303,0.048379,0.054233,-0.069469,-0.061807,0.024729,0.028282,-0.003064
education_HS-grad,0.019409,-0.006977,-0.291074,0.010716,0.072977,-0.30628,1.0,-0.369029,-0.00273,-0.049717,0.124465,-0.103144,-0.216004,-0.017441,-0.025102,-0.020127,0.049122,-0.016547,0.014585,0.064867
education_Some-college,-0.110868,-0.00917,-0.018982,-0.068117,0.002906,-0.238948,-0.369029,1.0,-0.078703,0.087535,-0.005886,-0.013569,-0.110054,0.040224,-0.007876,0.116948,0.001725,-0.003192,-0.059786,0.047776
marital-status_Married-civ-spouse,0.31964,-0.027356,0.092946,0.219831,-0.095043,0.054305,-0.00273,-0.078703,1.0,-0.648057,0.124477,0.109455,0.03799,0.002846,-0.535673,-0.381011,-0.318559,0.101088,0.435982,-0.002975
marital-status_Never-married,-0.535578,0.038151,-0.038773,-0.197496,0.111789,0.001072,-0.049717,0.087535,-0.648057,1.0,-0.090469,-0.10917,-0.024173,0.025558,0.297237,0.512531,-0.052667,-0.064458,-0.171789,-0.003531


#### Взаимная информация

In [60]:
def mutual_info(X,y,select_k=10):

    if select_k >= 1:
        sel_ = SelectKBest(mutual_info_classif, k=select_k).fit(X,y)
        col = X.columns[sel_.get_support()]
        
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(mutual_info_classif, percentile=select_k*100).fit(X,y)
        col = X.columns[sel_.get_support()]   
        
    else:
        raise ValueError("select_k должно быть положительным значением")
    
    return col

In [61]:
mi = mutual_info(X=X_train,y=y_train,select_k=3)
print(mi)

Index(['age', 'education-num', 'marital-status_Married-civ-spouse'], dtype='object')


In [62]:
mi = mutual_info(X=X_train,y=y_train,select_k=0.2)
print(mi)

Index(['age', 'education-num', 'marital-status_Married-civ-spouse',
       'marital-status_Never-married'],
      dtype='object')


#### Хи-квадрат тест

In [63]:
def khi_square_test(X,y,select_k=10):

    if select_k >= 1:
        sel_ = SelectKBest(chi2, k=select_k).fit(X,y)
        col = X.columns[sel_.get_support()]
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(chi2, percentile=select_k*100).fit(X,y)
        col = X.columns[sel_.get_support()]   
    else:
        raise ValueError("select_k должно быть положительным значением")  
    
    return col

In [64]:
khi = khi_square_test(X=X_train,y=y_train,select_k=3)
print(khi)

Index(['age', 'fnlwgt', 'hours-per-week'], dtype='object')


#### Одномерный ROC-AUC или MSE анализ

In [65]:
def univariate_roc_auc(X_train,y_train,X_test,y_test,threshold):

    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values)
    roc_values.index = X_train.columns
    print(roc_values.sort_values(ascending=False))
    print(len(roc_values[roc_values > threshold]), len(X_train.columns))
    keep_col = roc_values[roc_values > threshold]
    return keep_col

In [66]:
uni_roc_auc = univariate_roc_auc(X_train=X_train,y_train=y_train,
                                   X_test=X_test,y_test=y_test,threshold=0.8)
print(uni_roc_auc)

marital-status_Married-civ-spouse    0.754957
education-num                        0.713471
age                                  0.705835
marital-status_Never-married         0.671330
hours-per-week                       0.670869
sex_Male                             0.620900
relationship_Own-child               0.595649
relationship_Not-in-family           0.592595
occupation_Exec-managerial           0.579457
education_Bachelors                  0.577874
occupation_Prof-specialty            0.577825
fnlwgt                               0.571964
education_HS-grad                    0.565895
relationship_Unmarried               0.554127
workclass_Private                    0.542201
race_White                           0.535040
education_Some-college               0.529778
native-country_United-States         0.507042
occupation_Sales                     0.506734
occupation_Craft-repair              0.503922
dtype: float64
0 20
Series([], dtype: float64)


In [67]:
def univariate_mse(X_train,y_train,X_test,y_test,threshold):

    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values)
    mse_values.index = X_train.columns
    print(mse_values.sort_values(ascending=False))
    print(len(mse_values[mse_values > threshold]), len(X_train.columns))
    keep_col = mse_values[mse_values > threshold]
    return keep_col   

In [68]:
uni_mse = univariate_mse(X_train=X_train,y_train=y_train,
                            X_test=X_test,y_test=y_test,threshold=0.4)
print(uni_mse)

fnlwgt                               0.300211
native-country_United-States         0.180795
occupation_Craft-repair              0.180779
occupation_Sales                     0.180743
education_Some-college               0.180133
workclass_Private                    0.179692
race_White                           0.179475
education_HS-grad                    0.178265
relationship_Unmarried               0.176765
relationship_Not-in-family           0.175021
education_Bachelors                  0.174950
occupation_Prof-specialty            0.173729
occupation_Exec-managerial           0.173218
sex_Male                             0.172172
relationship_Own-child               0.171700
hours-per-week                       0.168099
marital-status_Never-married         0.163393
age                                  0.163156
education-num                        0.156779
marital-status_Married-civ-spouse    0.146571
dtype: float64
0 20
Series([], dtype: float64)


## Oversampling/undersampling

In [69]:
len(y_train)

22792

In [70]:
sm = SMOTE(random_state=42)

In [71]:
X_res, y_res = sm.fit_resample(X_train, y_train)

In [72]:
len(y_res)

34530

In [73]:
cn = CondensedNearestNeighbour(random_state=42)

In [None]:
X_res, y_res = cn.fit_resample(X_train, y_train)

In [None]:
len(y_res)