# Preprocessing

We will process our data set and prepare it for modelling based on our EDA findings.

## Load data 

In [0]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

def load_data_by_name(name: str) -> pd.DataFrame:
    """
    Return the contents of the Dataiku dataset called ``name``.

    A ``dataiku.Dataset`` handle is created for ``name`` and the result of
    its ``get_dataframe()`` call is handed back unchanged.
    """
    return dataiku.Dataset(name).get_dataframe()

# Read the learn and test datasets (in that order) into DataFrames.
train_df, test_df = (
    load_data_by_name(dataset_name)
    for dataset_name in ("census_income_learn", "census_income_test")
)

In [0]:
# Map the positional headers ("col_0" ... "col_41") to descriptive names.
# The list below is ordered by column position, so entry i names "col_i".
# UVDD = unique values checked with data dict.
col_mapping = {
    f"col_{position}": label
    for position, label in enumerate([
        "age",  # 0: matches type and range
        "class of worker",  # 1: UVDD
        "detailed industry recode",  # 2: UVDD
        "detailed occupation recode",  # 3: UVDD
        "education",  # 4: UVDD
        "wage per hour",  # 5: looks to be at right position, type checks, in cents?
        "enroll in edu inst last wk",  # 6: UVDD
        "marital stat",  # 7: UVDD
        "major industry code",  # 8: UVDD
        "major occupation code",  # 9: UVDD
        "race",  # 10: UVDD
        "hispanic origin",  # 11: UVDD - 10 unique in data dict? values match though
        "sex",  # 12: UVDD
        "member of a labor union",  # 13: UVDD
        "reason for unemployment",  # 14: UVDD
        "full or part time employment stat",  # 15: UVDD
        "capital gains",  # 16: data dict check, range ok, dollars?
        "capital losses",  # 17: data dict check, range ok, dollars?
        "dividends from stocks",  # 18: data dict check
        "tax filer stat",  # 19: UVDD
        "region of previous residence",  # 20: UVDD
        "state of previous residence",  # 21: UVDD
        "detailed household and family stat",  # 22: data dict check
        "detailed household summary in household",  # 23: data dict check
        "instance weight",  # 24: SPECIAL
        "migration code-change in msa",  # 25: UVDD
        "migration code-change in reg",  # 26: UVDD
        "migration code-move within reg",  # 27: UVDD
        "live in this house 1 year ago",  # 28: UVDD
        "migration prev res in sunbelt",  # 29: UVDD
        "num persons worked for employer",  # 30: value check
        "family members under 18",  # 31: UVDD
        "country of birth mother",  # 32: UVDD
        "country of birth self",  # 33: UVDD
        "country of birth father",  # 34: UVDD
        "citizenship",  # 35: UVDD
        "own business or self employed",  # 36: UVDD
        "fill inc questionnaire for veteran's admin",  # 37: UVDD
        "veterans benefits",  # 38: UVDD
        "weeks worked in year",  # 39: data dict order
        "year",  # 40: UVDD
        "income",  # 41
    ])
}

### Column mapping and dtypes 

In [0]:
# Apply col_mapping to the column labels of both frames.
train_df, test_df = (
    frame.rename(columns=col_mapping) for frame in (train_df, test_df)
)

In [0]:
# Columns that are cast to the "object" dtype in both frames.
num_to_object_cols = [
    "detailed industry recode",
    "detailed occupation recode",
    "own business or self employed",
    "veterans benefits",
    "year",
    "num persons worked for employer",
]

# One astype call per frame, using a column -> dtype mapping.
train_df = train_df.astype(dict.fromkeys(num_to_object_cols, "object"))
test_df = test_df.astype(dict.fromkeys(num_to_object_cols, "object"))

## Data Cleaning

We remove duplicate rows from both data sets. Missing values are not handled in this section yet (**TODO**).

In [0]:
# Remove fully duplicated rows, keeping the first occurrence of each.
train_df, test_df = (
    train_df.drop_duplicates(keep="first"),
    test_df.drop_duplicates(keep="first"),
)

## Extract numeric features 

In [0]:
# Keep only the columns with a numeric dtype.
train_num, test_num = (
    train_df.select_dtypes(include="number"),
    test_df.select_dtypes(include="number"),
)

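## Extract categorical features

**TODO:** categorical feature extraction is not implemented yet; only the numeric features are saved below.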

## Save Data 

In [0]:
# Write recipe outputs
# Each numeric-only frame is written to its own Dataiku dataset via
# write_with_schema.
# NOTE(review): only train_num / test_num (the select_dtypes('number') subsets)
# are written here; non-numeric columns are not part of these outputs —
# presumably that includes the `income` target, confirm its dtype.
processed_learn = dataiku.Dataset("processed_learn")
processed_learn.write_with_schema(train_num)

processed_test = dataiku.Dataset("processed_test")
processed_test.write_with_schema(test_num)