In [1]:
import pandas as pd

## Importing the dataset

In [2]:
# Load the raw census data from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='census.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(32561, 15)

# Dataset overview

## Checking for NaNs

In [5]:
# True when at least one cell anywhere in the frame is missing.
# Only real missing values (NaN/None/NaT) are counted by `isna`; placeholder
# strings in the data, if any, would not be detected by this check.
has_nan = df.isna().to_numpy().any()

print(
    "The DataFrame has NaN values."
    if has_nan
    else "The DataFrame does not have any NaN values."
)

The DataFrame does not have any NaN values.


## Checking datatypes

In [6]:
# Print a per-column summary: column name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlgt           32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Checking for whitespace in column names

In [7]:
# Report which column names contain whitespace characters.
# This loop only prints: rebinding `col` changes the loop variable alone, so
# `df.columns` is NOT modified here. The "Fixed column" line shows what the
# name looks like with every whitespace character removed.
for col in df.columns:
    kept_chars = [ch for ch in col if not ch.isspace()]
    if len(kept_chars) != len(col):
        # At least one character was dropped, i.e. the name has whitespace.
        print("WHITE SPACE FOUND in ", col)
        col = "".join(kept_chars)
    print("Fixed column: ", col)

Fixed column:  age
WHITE SPACE FOUND in   workclass
Fixed column:  workclass
WHITE SPACE FOUND in   fnlgt
Fixed column:  fnlgt
WHITE SPACE FOUND in   education
Fixed column:  education
WHITE SPACE FOUND in   education-num
Fixed column:  education-num
WHITE SPACE FOUND in   marital-status
Fixed column:  marital-status
WHITE SPACE FOUND in   occupation
Fixed column:  occupation
WHITE SPACE FOUND in   relationship
Fixed column:  relationship
WHITE SPACE FOUND in   race
Fixed column:  race
WHITE SPACE FOUND in   sex
Fixed column:  sex
WHITE SPACE FOUND in   capital-gain
Fixed column:  capital-gain
WHITE SPACE FOUND in   capital-loss
Fixed column:  capital-loss
WHITE SPACE FOUND in   hours-per-week
Fixed column:  hours-per-week
WHITE SPACE FOUND in   native-country
Fixed column:  native-country
WHITE SPACE FOUND in   salary
Fixed column:  salary


## Removing whitespace

In [8]:
def _strip_if_str(value):
    """Return ``value`` without leading/trailing whitespace if it is a string.

    Non-string values (ints, floats, NaN, ...) are returned unchanged.
    """
    return value.strip() if isinstance(value, str) else value


# Remove whitespace from column names.
# `rename` returns a new frame, so no in-place mutation is needed and
# re-running this cell is harmless; non-string labels are left untouched
# instead of raising AttributeError.
df = df.rename(columns=_strip_if_str)

# Remove whitespace from elements in the DataFrame.
# `DataFrame.applymap` is deprecated since pandas 2.1; a per-column
# `Series.map` gives the same element-wise result on every pandas version.
# Numeric columns cannot hold strings, so they are passed through as-is.
df = df.apply(
    lambda column: column
    if pd.api.types.is_numeric_dtype(column)
    else column.map(_strip_if_str)
)

# Display the modified DataFrame
print(df)

       age         workclass   fnlgt   education  education-num  \
0       39         State-gov   77516   Bachelors             13   
1       50  Self-emp-not-inc   83311   Bachelors             13   
2       38           Private  215646     HS-grad              9   
3       53           Private  234721        11th              7   
4       28           Private  338409   Bachelors             13   
...    ...               ...     ...         ...            ...   
32556   27           Private  257302  Assoc-acdm             12   
32557   40           Private  154374     HS-grad              9   
32558   58           Private  151910     HS-grad              9   
32559   22           Private  201490     HS-grad              9   
32560   52      Self-emp-inc  287927     HS-grad              9   

           marital-status         occupation   relationship   race     sex  \
0           Never-married       Adm-clerical  Not-in-family  White    Male   
1      Married-civ-spouse    Exec-manag

## Checking if whitespace has been removed

In [9]:
# Re-run the column-name whitespace report to confirm the clean-up worked.
# This loop only prints: rebinding `col` changes the loop variable alone, so
# `df.columns` is NOT modified here. A clean name produces just the
# "Fixed column" line with no "WHITE SPACE FOUND" line before it.
for col in df.columns:
    kept_chars = [ch for ch in col if not ch.isspace()]
    if len(kept_chars) != len(col):
        # At least one character was dropped, i.e. the name has whitespace.
        print("WHITE SPACE FOUND in ", col)
        col = "".join(kept_chars)
    print("Fixed column: ", col)

Fixed column:  age
Fixed column:  workclass
Fixed column:  fnlgt
Fixed column:  education
Fixed column:  education-num
Fixed column:  marital-status
Fixed column:  occupation
Fixed column:  relationship
Fixed column:  race
Fixed column:  sex
Fixed column:  capital-gain
Fixed column:  capital-loss
Fixed column:  hours-per-week
Fixed column:  native-country
Fixed column:  salary


## Checking cardinality of the target variable

In [10]:
# Number of distinct values in the target column (missing values excluded).
df["salary"].nunique(dropna=True)

2

In [11]:
# Row count per target class, most frequent class first.
df["salary"].value_counts(sort=True, ascending=False)

<=50K    24720
>50K      7841
Name: salary, dtype: int64

## Converting "-" to "_" in column names

In [14]:
# Swap every hyphen in the column names for an underscore.
# `regex=False` makes the match a plain literal "-" on every pandas version.
df.columns = df.columns.str.replace("-", "_", regex=False)

In [15]:
# Show the column labels to confirm the hyphen -> underscore rename.
df.columns

Index(['age', 'workclass', 'fnlgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'salary'],
      dtype='object')

In [16]:
# Display the cleaned frame (the notebook truncates the view to the first and last rows).
df

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


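The target variable `salary` is imbalanced: 24,720 rows are `<=50K` versus 7,841 rows `>50K` (roughly 76% / 24%). I'll account for this when splitting the data with `train_test_split` (e.g. by stratifying on `salary`).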

## Save cleaned dataset

In [17]:
# Persist the cleaned frame; the row index is not written as a column.
df.to_csv(path_or_buf='census_cleaned.csv', index=False)