In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# Read the raw dataset from the working directory and preview the first rows.
DATA_PATH = "adult.csv"
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Print each column's name, non-null count and dtype to check what was loaded.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [4]:
# The file marks missing values with a literal "?" (see the `workclass` and
# `occupation` values in the preview above), so `info()` reports no nulls.
# Convert those markers to NaN and drop every row that contains one.
# The chain rebinds `df` instead of mutating it in place, so the cell reads
# as a single transformation. The original row labels are kept (no index reset).
df = df.replace("?", np.nan).dropna()


In [5]:
# Split the columns by kind so later cells can treat them differently.
# Text-valued (object dtype) columns: the categorical features and, at this
# point in the notebook, the still-unencoded `income` target.
categorical_cols = df.select_dtypes(include=['object']).columns

# Select every numeric dtype rather than only int64, and explicitly leave out
# the target: once `income` has been label-encoded it is numeric too, and
# re-running this cell at that point must not mark it for scaling.
numerical_cols = (
    df.select_dtypes(include='number')
    .columns
    .drop('income', errors='ignore')
)

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)


Categorical Columns: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country', 'income'],
      dtype='object')
Numerical Columns: Index(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


In [6]:
# Encode the target column as integers. In the preview of the encoded frame,
# rows that were "<=50K" show 0 and rows that were ">50K" show 1.
# `LabelEncoder` is imported in the notebook's top import cell.
# `le` is kept so the integer codes can be mapped back to the original labels.
le = LabelEncoder()
df['income'] = le.fit_transform(df['income'])


In [7]:
# One-hot encode the remaining text columns. `drop_first=True` omits the first
# level of each feature, so that level is represented by all of its dummy
# columns being False.
encoded = pd.get_dummies(data=df, drop_first=True)
df = encoded
df.head()


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,0,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1,38,89814,9,0,0,50,0,False,True,False,...,False,False,False,False,False,False,False,True,False,False
2,28,336951,12,0,0,40,1,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,44,160323,10,7688,0,40,1,False,True,False,...,False,False,False,False,False,False,False,True,False,False
5,34,198693,6,0,0,30,0,False,True,False,...,False,False,False,False,False,False,False,True,False,False


In [8]:
# Standardize the numeric feature columns; the summary in the next cell shows
# each with mean ~0 and standard deviation ~1.
# `StandardScaler` is imported in the notebook's top import cell.
# NOTE(review): the scaler is fitted on the full dataset. If a train/test split
# happens downstream, fit it on the training portion only to avoid leaking
# test-set statistics — confirm against the modelling notebook.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values


In [9]:
# Sanity-check the scaling: every numeric column should report mean ~0, std ~1.
scaled_summary = df[numerical_cols].describe()
scaled_summary


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0
mean,-2.5453970000000003e-17,5.357903e-17,1.693789e-16,-1.4769580000000002e-17,2.6475270000000002e-17,2.165158e-16
std,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011
min,-1.630231,-1.668365,-3.57187,-0.1467332,-0.2187803,-3.326124
25%,-0.7980149,-0.6848527,-0.4381216,-0.1467332,-0.2187803,-0.07812006
50%,-0.117111,-0.108093,-0.046403,-0.1467332,-0.2187803,-0.07812006
75%,0.6394489,0.4561924,1.128753,-0.1467332,-0.2187803,0.3382907
max,3.892656,12.31247,2.303909,13.17519,10.53806,4.835527


In [10]:
# Persist the fully preprocessed frame; the row index is not written out.
OUTPUT_PATH = "adult_preprocessed.csv"
df.to_csv(OUTPUT_PATH, index=False)


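Feature scaling puts all numerical features on a comparable scale, so that no feature influences the model more than another simply because of its units or magnitude. Without scaling, features with large values (here, for example, `fnlwgt` or `capital-gain`) dominate the learning process. Distance- and margin-based algorithms such as KNN and SVM, as well as models trained with gradient descent, typically perform or converge significantly better after scaling.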