# Section 2. Data Preprocessing
In this section we will load the cleaned raw dataset cleaned_raw.csv to perform data cleaning, feature transformationscaling, and prepares datasets for the step 3 building classifier and step 5 building segmentation model


In [2]:
## load raw data .csv from section1
import pandas as pd
import numpy as np

# Load cleaned raw data (already replaced '?' with NaN)
df = pd.read_csv("../data/preprocessed/cleaned_raw.csv")

print("Shape of data:", df.shape)
df.head()


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Shape of data: (199523, 42)


Unnamed: 0,age,class of worker,detailed industry recode,detailed occupation recode,education,wage per hour,enroll in edu inst last wk,marital stat,major industry code,major occupation code,...,country of birth father,country of birth mother,country of birth self,citizenship,own business or self employed,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year,year,label
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.


In [3]:
# Convert the two recode columns to object dtype, they should be treated as class
df["detailed industry recode"] = df["detailed industry recode"].astype("object")
df["detailed occupation recode"] = df["detailed occupation recode"].astype("object")

# Confirm the dtype change
print(df[["detailed industry recode", "detailed occupation recode"]].dtypes)

## make sure the numeric cols is indeed numeric
numeric_cols = [
    'age', 
    'wage per hour', 'capital gains', 'capital losses',
    'dividends from stocks', 'weight', 'num persons worked for employer',
    'own business or self employed', 'veterans benefits',
    'weeks worked in year', 'year'
]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

## tell the categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

print("Number of categorical columns:", len(categorical_cols))
print(categorical_cols)

detailed industry recode      object
detailed occupation recode    object
dtype: object
Number of categorical columns: 31
['class of worker', 'detailed industry recode', 'detailed occupation recode', 'education', 'enroll in edu inst last wk', 'marital stat', 'major industry code', 'major occupation code', 'race', 'hispanic origin', 'sex', 'member of a labor union', 'reason for unemployment', 'full or part time employment stat', 'tax filer stat', 'region of previous residence', 'state of previous residence', 'detailed household and family stat', 'detailed household summary in household', 'migration code-change in msa', 'migration code-change in reg', 'migration code-move within reg', 'live in this house 1 year ago', 'migration prev res in sunbelt', 'family members under 18', 'country of birth father', 'country of birth mother', 'country of birth self', 'citizenship', "fill inc questionnaire for veteran's admin", 'label']


In [4]:
## handle missing values 
# Fill categorical missing value with "N/A"
df[categorical_cols] = df[categorical_cols].fillna("Unknown")

# Fill numeric missing with median
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

  df[categorical_cols] = df[categorical_cols].fillna("Unknown")


In [5]:
## transfer the label data to 0 and 1
df['label'] = df['label'].apply(lambda x: 1 if x.strip() == '50000+.' else 0)


In [6]:
df['label'].unique()


array([0, 1])

### In this part, we will do the ordinary encoding for the category columns.


In [7]:
from sklearn.preprocessing import OrdinalEncoder

# 找出所有 object 类型的列
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1
)

df[categorical_cols] = encoder.fit_transform(df[categorical_cols])




In [8]:
## save the preprossed data with ordinary encoded for the categories
df.to_csv("../data/preprocessed/cleaned_raw_encoded.csv", index=False)
