In [1]:
import pandas as pd
import numpy as np

In [356]:
data = pd.read_csv( 'adult.csv' )
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


https://www.rdocumentation.org/packages/arules/versions/1.6-6/topics/Adult

http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html

In [357]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age                48842 non-null int64
workclass          48842 non-null object
fnlwgt             48842 non-null int64
education          48842 non-null object
educational-num    48842 non-null int64
marital-status     48842 non-null object
occupation         48842 non-null object
relationship       48842 non-null object
race               48842 non-null object
gender             48842 non-null object
capital-gain       48842 non-null int64
capital-loss       48842 non-null int64
hours-per-week     48842 non-null int64
native-country     48842 non-null object
income             48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [358]:
data.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


### workclass & occupation

In [359]:
data.workclass.value_counts()
data.occupation.value_counts()
# data[data.workclass =='?'] # 2799 значений  = ?
# data[data.occupation=='?'] # 2809 значений  = ?

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

In [360]:
data[(data['occupation']=='?')&(data['workclass']!='?')]
data[(data['workclass']=='Never-worked')].occupation.value_counts() 

?    10
Name: occupation, dtype: int64

В отношении признаков считаю достаточным руководствоваться следующими критериями:
- если 'workclass' не определен и 'occupation' не определен, то workclass  присваиаем значение Other'
- если 'workclass' == 'Never-worked' и 'occupation' не определен, то workclass  оставляем  значение 'Never-worked'
(если человек никогда не работал, то и признака отраслевой принадлежности работы не будет)


Исключим признак occupation = ? при отсутствии 'workclass' == '?'  или 'workclass' == 'Never-worked' на уровне OneHotEncoding как отсутствие признаков отраслевой принадлежности

In [361]:
data.loc[data.workclass =='?', 'workclass'] = 'Other'

### Education & educational-num

In [362]:
data.education.value_counts() #educational-num то же самое только в коде

HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: education, dtype: int64

In [363]:
data['educational-num'].value_counts() 
# распределение значений по полю educational-num в точности совпадает с распределением значений по полю education
# поле educational-num- код вида образования -  не несет никакой смысловой нагрузки, избыточно , к удалению
data = data.drop(['educational-num'], axis = 1)
data.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K


In [364]:
pt1 = pd.pivot_table(data, 
                     values     = ['age'],  
                     index      = ['education'],
                     columns    = ['income'], 
                     aggfunc    = ['count'],
                     margins    = True)
pt1.columns = ['<=50K', '>50K', 'All']
pt1['percent_>50'] = 100* pt1['>50K'] / pt1['All']
pt1.sort_values('percent_>50', ascending = False)

Unnamed: 0_level_0,<=50K,>50K,All,percent_>50
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Prof-school,217,617,834,73.980815
Doctorate,163,431,594,72.558923
Masters,1198,1459,2657,54.911554
Bachelors,4712,3313,8025,41.283489
Assoc-acdm,1188,413,1601,25.796377
Assoc-voc,1539,522,2061,25.327511
All,37155,11687,48842,23.928177
Some-college,8815,2063,10878,18.964883
HS-grad,13281,2503,15784,15.857831
12th,609,48,657,7.305936


Объединим наиболее очевидные схожие признаки

In [365]:
data.loc[data.education == '1st-4th',           'education']      = 'elementary_school'
data.loc[data.education.str.contains('th'),     'education']      = 'middle_school'
data.loc[data.education.str.contains('Assoc'),  'education']      = 'Assoc_pr'
data.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,middle_school,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K


### marital-status

In [366]:
pt2 = pd.pivot_table(data, 
                     values     = ['age'],  
                     index      = ['marital-status'],
                     columns    = ['income'], 
                     aggfunc    = ['count'],
                     margins    = True)
pt2.columns = ['<=50K', '>50K', 'All']
pt2['percent_>50'] = 100* pt2['>50K'] / pt2['All']
pt2.sort_values('percent_>50', ascending = False)

Unnamed: 0_level_0,<=50K,>50K,All,percent_>50
marital-status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Married-civ-spouse,12395,9984,22379,44.613253
Married-AF-spouse,23,14,37,37.837838
All,37155,11687,48842,23.928177
Divorced,5962,671,6633,10.116086
Married-spouse-absent,570,58,628,9.235669
Widowed,1390,128,1518,8.432148
Separated,1431,99,1530,6.470588
Never-married,15384,733,16117,4.547993


In [367]:
# data[data['relationship']=='Own-child'].age.value_counts() не зависит от возраста
# data[data['relationship']=='Not-in-family'].age.value_counts() не зависит от возраста

pt3 = pd.pivot_table(data, 
                     values     = ['age'],  
                     index      = ['relationship'],
                     columns    = ['income'], 
                     aggfunc    = ['count'],
                     margins    = True)
pt3.columns = ['<=50K', '>50K', 'All']
pt3['percent_>50'] = 100* pt3['>50K'] / pt3['All']
pt3.sort_values('percent_>50', ascending = False)

Unnamed: 0_level_0,<=50K,>50K,All,percent_>50
relationship,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Wife,1238,1093,2331,46.889747
Husband,10870,8846,19716,44.867113
All,37155,11687,48842,23.928177
Not-in-family,11307,1276,12583,10.140666
Unmarried,4816,309,5125,6.029268
Other-relative,1454,52,1506,3.452855
Own-child,7470,111,7581,1.464187


Признак relationship не несет содержательной нагрузки и полностью раскрывается либо по полу либо по marital-status

In [368]:
data = data.drop(['relationship'], axis = 1)
data.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,middle_school,Never-married,Machine-op-inspct,Black,Male,0,0,40,United-States,<=50K


### Race, gender, hours-per-week etc

In [369]:
data.race.value_counts()                    # не вижу очевидных аномалий
data.gender.value_counts()                  # не вижу очевидных аномалий
data['hours-per-week'].value_counts()       # не вижу очевидных аномалий
data['hours-per-week'].describe()           # не вижу очевидных аномалий

count    48842.000000
mean        40.422382
std         12.391444
min          1.000000
25%         40.000000
50%         40.000000
75%         45.000000
max         99.000000
Name: hours-per-week, dtype: float64

In [370]:
data.loc[data.gender == 'Male', 'gender'] = 1
data.loc[data.gender != 'Male', 'gender'] = 0

### native-country

In [371]:
data['native-country'].value_counts()  
pt4 = pd.pivot_table(data, 
                     values     = ['age'],  
                     index      = ['native-country'],
                     columns    = ['income'], 
                     aggfunc    = ['count'],
                     margins    = True)
pt4.columns = ['<=50K', '>50K', 'All']
pt4['percent_>50'] = 100* pt4['>50K'] / pt4['All']
pt4.sort_values('percent_>50', ascending = False).head(10) 

Unnamed: 0_level_0,<=50K,>50K,All,percent_>50
native-country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
France,22.0,16.0,38,42.105263
India,89.0,62.0,151,41.059603
Taiwan,39.0,26.0,65,40.0
Iran,37.0,22.0,59,37.288136
England,80.0,47.0,127,37.007874
Greece,31.0,18.0,49,36.734694
Yugoslavia,15.0,8.0,23,34.782609
Japan,60.0,32.0,92,34.782609
Canada,119.0,63.0,182,34.615385
Italy,71.0,34.0,105,32.380952


> 95%  наблюдений - США
Не вижу закономерностей по группам стран
Преобразуем в бинарный признак

In [372]:
data.loc[data['native-country'] == 'United-States', 'native-country'] = 1
data.loc[data['native-country'] != 'United-States', 'native-country'] = 0
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,middle_school,Never-married,Machine-op-inspct,Black,0,0,0,40,0,<=50K
1,38,Private,89814,HS-grad,Married-civ-spouse,Farming-fishing,White,0,0,0,50,0,<=50K
2,28,Local-gov,336951,Assoc_pr,Married-civ-spouse,Protective-serv,White,0,0,0,40,0,>50K
3,44,Private,160323,Some-college,Married-civ-spouse,Machine-op-inspct,Black,0,7688,0,40,0,>50K
4,18,Other,103497,Some-college,Never-married,?,White,0,0,0,30,0,<=50K


### capital-gain & capital-loss

In [373]:
data.iloc[:,[8,9]].describe()

Unnamed: 0,capital-gain,capital-loss
count,48842.0,48842.0
mean,1079.067626,87.502314
std,7452.019058,403.004552
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,99999.0,4356.0


In [374]:
data[data['capital-gain']>30000]['capital-gain'].value_counts() 

# неочевидный выброс, оставим

99999    244
34095      6
41310      3
Name: capital-gain, dtype: int64

### income

In [375]:
data['income'].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [376]:
data.loc[data['income'] == '<=50K', 'income'] = 0
data.loc[data['income'] == '>50K',  'income'] = 1

In [377]:
data.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,middle_school,Never-married,Machine-op-inspct,Black,0,0,0,40,0,0


### Encoding

In [378]:
data = pd.get_dummies(data=data, columns=['workclass', 'education','marital-status','occupation','race'])
data = data.drop(['occupation_?','fnlwgt'], axis = 1)
data.columns

Index(['age', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week',
       'native-country', 'income', 'workclass_Federal-gov',
       'workclass_Local-gov', 'workclass_Never-worked', 'workclass_Other',
       'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov',
       'workclass_Without-pay', 'education_Assoc_pr', 'education_Bachelors',
       'education_Doctorate', 'education_HS-grad', 'education_Masters',
       'education_Preschool', 'education_Prof-school',
       'education_Some-college', 'education_elementary_school',
       'education_middle_school', 'marital-status_Divorced',
       'marital-status_Married-AF-spouse', 'marital-status_Married-civ-spouse',
       'marital-status_Married-spouse-absent', 'marital-status_Never-married',
       'marital-status_Separated', 'marital-status_Widowed',
       'occupation_Adm-clerical', 'occupation_Armed-Forces',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
   

In [379]:
data = data.iloc[:, [6,0,1,2,3,4,5] + list(range(7, data.shape[1]))]
data.columns

Index(['income', 'age', 'gender', 'capital-gain', 'capital-loss',
       'hours-per-week', 'native-country', 'workclass_Federal-gov',
       'workclass_Local-gov', 'workclass_Never-worked', 'workclass_Other',
       'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov',
       'workclass_Without-pay', 'education_Assoc_pr', 'education_Bachelors',
       'education_Doctorate', 'education_HS-grad', 'education_Masters',
       'education_Preschool', 'education_Prof-school',
       'education_Some-college', 'education_elementary_school',
       'education_middle_school', 'marital-status_Divorced',
       'marital-status_Married-AF-spouse', 'marital-status_Married-civ-spouse',
       'marital-status_Married-spouse-absent', 'marital-status_Never-married',
       'marital-status_Separated', 'marital-status_Widowed',
       'occupation_Adm-clerical', 'occupation_Armed-Forces',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
   

In [380]:
from sklearn import preprocessing
headers = list(data.iloc[:,[1,3,4,5]].columns.values)
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
data_scaled = min_max_scaler.fit_transform(data.iloc[:,[0,2,3,4]])
data_norm = pd.DataFrame(data_scaled)
data_norm.columns = headers
data_norm.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,-1.0,-1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0
2,1.0,-1.0,-1.0,-1.0
3,1.0,-1.0,-0.846238,-1.0
4,-1.0,-1.0,-1.0,-1.0


In [381]:
data_a = data.merge(data_norm, how = 'left', left_on = data.index, right_on=data_norm.index)
data_a.head(1)

Unnamed: 0,key_0,income,age_x,gender,capital-gain_x,capital-loss_x,hours-per-week_x,native-country,workclass_Federal-gov,workclass_Local-gov,...,occupation_Transport-moving,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,age_y,capital-gain_y,capital-loss_y,hours-per-week_y
0,0,0,25,0,0,0,40,0,0,0,...,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0


In [382]:
data_a = data_a.iloc[:, [1,3] + list(range(7, data_a.shape[1]))]
data_a.head(1)

Unnamed: 0,income,gender,native-country,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Other,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,occupation_Transport-moving,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,age_y,capital-gain_y,capital-loss_y,hours-per-week_y
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0


In [389]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

model = LogisticRegression()
le=LabelEncoder()

y =  data_a.iloc[:,0]
X  = data_a.iloc[:,1:]

targets=(output_y==1)
Y= le.fit_transform(targets)

model.fit(X, y)
model.score(X, y)

1.0

In [390]:
# если бы не было нормализации

y1 =  data.iloc[:,0]
X1  = data.iloc[:,1:]

targets=(output_y==1)
Y= le.fit_transform(targets)

model.fit(X1, y1)
model.score(X1, y1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8470988084026043

In [388]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

output_y = data_a.iloc[:,0]
input_x  = data_a.iloc[:,1:]

le=LabelEncoder()
targets=(output_y==1)
Y= le.fit_transform(targets)

x_train, x_test, y_train, y_test = train_test_split(input_x, Y, test_size = 0.2)

model.fit(x_train, y_train)
model_pred = model.predict(x_test)
print(classification_report(y_test, model_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7414
           1       1.00      1.00      1.00      2355

    accuracy                           1.00      9769
   macro avg       1.00      1.00      1.00      9769
weighted avg       1.00      1.00      1.00      9769

