In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [2]:
# Column names for adult.data, in file order; 'result' is the income label.
cols = [
    'age', 'workclass', 'fnlwgt',
    'education', 'education_num',
    'marital_status', 'occupation', 'relationship',
    'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country',
    'result',
]


In [3]:
# adult.data has no header row, so the column names are supplied explicitly;
# index_col=False keeps pandas from using the first column as the index.
df = pd.read_csv(
    'adult.data',
    sep=",",
    names=cols,
    index_col=False,
)

In [4]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,result
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Discard incomplete records in one pass: first rows containing NaN, then rows
# in which any attribute equals ' ?'. The index is renumbered once at the end.
df = (
    df.dropna()
    .loc[lambda frame: ~(frame == ' ?').any(axis=1)]
    .reset_index(drop=True)
)

In [6]:
# List the distinct values present in every column
for column in df.columns:
    distinct = df[column].unique()
    # header line, the values, then an empty line as separator
    print(f"Unique values of {column}:", distinct, "", sep="\n")

Unique values of age:
[39 50 38 53 28 37 49 52 31 42 30 23 32 34 25 43 40 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 47 46 36 79 27 18 33 76 55 61 70 64 71 66 51 58
 26 17 60 90 75 65 77 62 63 67 74 72 69 68 73 81 78 88 80 84 83 85 82 86]

Unique values of workclass:
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay']

Unique values of fnlwgt:
[ 77516  83311 215646 ...  84661 257302 201490]

Unique values of education:
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' 7th-8th' ' Doctorate' ' Assoc-voc' ' Prof-school'
 ' 5th-6th' ' 10th' ' Preschool' ' 12th' ' 1st-4th']

Unique values of education_num:
[13  9  7 14  5 10 12  4 16 11 15  3  6  1  8  2]

Unique values of marital_status:
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']

Unique values of occupation:
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Pro

In [7]:
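# NOTE: disabled code, kept for reference only. If re-enabled it would
# label-encode every column listed in `cols`, including the numeric ones.
# # Create a label encoder object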
# label_encoder = LabelEncoder()

# # Apply label encoding to each categorical column
# for column in cols:
#     df[column] = label_encoder.fit_transform(df[column])

# # Print the updated DataFrame
# print(df.head())

In [8]:
# Integer code for every category of each categorical column. The keys carry a
# leading space because that is how the values appear in the loaded frame.
_ADULT_CATEGORY_CODES = {
    'workclass': {
        ' Private': 0, ' Self-emp-not-inc': 1, ' Self-emp-inc': 2,
        ' Federal-gov': 3, ' Local-gov': 4, ' State-gov': 5,
        ' Without-pay': 6, ' Never-worked': 7,
    },
    'education': {
        ' Preschool': 0, ' 1st-4th': 1, ' 5th-6th': 2, ' 7th-8th': 3,
        ' 9th': 4, ' 10th': 5, ' 11th': 6, ' 12th': 7, ' HS-grad': 8,
        ' Some-college': 9, ' Assoc-acdm': 10, ' Assoc-voc': 11,
        ' Bachelors': 12, ' Masters': 13, ' Prof-school': 14,
        ' Doctorate': 15,
    },
    'marital_status': {
        ' Never-married': 1, ' Married-civ-spouse': 2, ' Divorced': 3,
        ' Married-spouse-absent': 4, ' Separated': 5,
        ' Married-AF-spouse': 6, ' Widowed': 7,
    },
    'occupation': {
        ' Adm-clerical': 1, ' Exec-managerial': 2, ' Handlers-cleaners': 3,
        ' Prof-specialty': 4, ' Other-service': 5, ' Sales': 6,
        ' Craft-repair': 7, ' Transport-moving': 8, ' Farming-fishing': 9,
        ' Machine-op-inspct': 10, ' Tech-support': 11,
        ' Protective-serv': 12, ' Armed-Forces': 13, ' Priv-house-serv': 14,
    },
    'relationship': {
        ' Not-in-family': 1, ' Husband': 2, ' Wife': 3, ' Own-child': 4,
        ' Unmarried': 5, ' Other-relative': 6,
    },
    'race': {
        ' White': 1, ' Black': 2, ' Asian-Pac-Islander': 3,
        ' Amer-Indian-Eskimo': 4, ' Other': 5,
    },
    'sex': {' Male': 1, ' Female': 2},
    'native_country': {
        ' Cambodia': 0, ' Canada': 1, ' China': 2, ' Columbia': 3,
        ' Cuba': 4, ' Dominican-Republic': 5, ' Ecuador': 6,
        ' El-Salvador': 7, ' England': 8, ' France': 9, ' Germany': 10,
        ' Greece': 11, ' Guatemala': 12, ' Haiti': 13,
        ' Holand-Netherlands': 14, ' Honduras': 15, ' Hong': 16,
        ' Hungary': 17, ' India': 18, ' Iran': 19, ' Ireland': 20,
        ' Italy': 21, ' Jamaica': 22, ' Japan': 23, ' Laos': 24,
        ' Mexico': 25, ' Nicaragua': 26,
        ' Outlying-US(Guam-USVI-etc)': 27, ' Peru': 28,
        ' Philippines': 29, ' Poland': 30, ' Portugal': 31,
        ' Puerto-Rico': 32, ' Scotland': 33, ' South': 34, ' Taiwan': 35,
        ' Thailand': 36, ' Trinadad&Tobago': 37, ' United-States': 38,
        ' Vietnam': 39, ' Yugoslavia': 40,
    },
    'result': {' <=50K': 0, ' >50K': 1},
}


def preprocess_adult(df):
    """Encode the Adult columns of ``df`` as integers, in place.

    ``age`` becomes a binary flag (1 if age >= 45, else 0). Every column
    listed in ``_ADULT_CATEGORY_CODES`` is replaced by its integer code.
    Columns not mentioned there are left untouched.

    All encoded columns are computed and validated before anything is
    written back, so a failure leaves ``df`` exactly as it was passed in.
    In particular, calling the function a second time on an already encoded
    frame raises instead of silently resetting ``age`` to all zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``age`` and every categorical column, with the
        original string categories.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after modification.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a categorical column holds a value (or NaN) with no code; the
        message names the column and the offending values.
    """
    # Stage the new columns first; `df` is only touched once all are valid.
    encoded = {'age': (df['age'] >= 45).astype(int)}  # 1 if old, 0 if young

    for column, codes in _ADULT_CATEGORY_CODES.items():
        mapped = df[column].map(codes)
        unmapped = mapped.isna()  # map() yields NaN for values without a code
        if unmapped.any():
            bad_values = df.loc[unmapped, column].unique().tolist()
            raise ValueError(
                f"Column '{column}' contains values with no integer code: "
                f"{bad_values!r}. Was the frame already preprocessed?"
            )
        encoded[column] = mapped.astype(int)

    for column, values in encoded.items():
        df[column] = values

    return df

In [9]:
# preprocess_adult modifies the frame it receives and returns that same frame,
# so binding the result keeps `df` pointing at the encoded data.
df = preprocess_adult(df)
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,result
0,0,5,77516,12,13,1,1,1,1,1,2174,0,40,38,0
1,1,1,83311,12,13,2,2,2,1,1,0,0,13,38,0
2,0,0,215646,8,9,3,3,1,1,1,0,0,40,38,0
3,1,0,234721,6,7,2,3,2,2,1,0,0,40,38,0
4,0,0,338409,12,13,2,4,3,2,2,0,0,40,4,0


In [10]:
# List the distinct values of every column again, now that they are encoded
for column in df.columns:
    distinct = df[column].unique()
    # header line, the values, then an empty line as separator
    print(f"Unique values of {column}:", distinct, "", sep="\n")

Unique values of age:
[0 1]

Unique values of workclass:
[5 1 0 3 4 2 6]

Unique values of fnlwgt:
[ 77516  83311 215646 ...  84661 257302 201490]

Unique values of education:
[12  8  6 13  4  9 10  3 15 11 14  2  5  0  7  1]

Unique values of education_num:
[13  9  7 14  5 10 12  4 16 11 15  3  6  1  8  2]

Unique values of marital_status:
[1 2 3 4 5 6 7]

Unique values of occupation:
[ 1  2  3  4  5  6  8  9 10 11  7 12 13 14]

Unique values of relationship:
[1 2 3 4 5 6]

Unique values of race:
[1 2 3 4 5]

Unique values of sex:
[1 2]

Unique values of capital_gain:
[ 2174     0 14084  5178  5013  2407 14344 15024  7688  4064  4386  7298
  1409  3674  1055  2050  2176   594 20051  6849  4101  8614  3411  2597
 25236  4650  2463  3103 10605  2964  3325  2580  3471  4865 99999  6514
  1471  2329  2105  2885 10520  2202  2961 27828  6767  2228  1506 13550
  2635  5556  4787  3781  3137  3818   914   401  2829  2977  4934  2062
  2354  3464  5455 15020  1424  3273 22040  4416 10566  493