In [8]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from rpart.DecisionTreeClassifier import DecisionTreeClassifier

# Read the data: load the CSV at ../data/adult.csv into `raw`.
# `raw` is the untouched input; fix_categorical and encode_data below each
# work on a copy, so this frame can be reused when re-running later cells.
# NOTE(review): presumably the Adult census-income dataset - confirm provenance.
raw = pd.read_csv('../data/adult.csv')

In [3]:
def fix_categorical(data):
    """Collapse sparse categorical levels and derive a coarse region feature.

    The input frame is copied first, so the caller's frame is never modified.

    Transformations applied to the copy:
      * ``workclass``: 'Never-worked' and 'Without-pay' become 'Other'.
      * ``education``: fine-grained levels are merged into 'Elementary-school',
        'High-school-no-diploma', 'Associates' and 'Post-graduate'.
      * ``occupation``: the '?' placeholder becomes 'Unknown'.
      * ``native.country``: replaced by a new ``continent`` column holding one
        of 'North-America', 'Latin-America', 'Europe', 'Asia' or 'Unknown'
        (any value not listed below, including missing values).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns 'workclass', 'education',
        'occupation' and 'native.country'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the simplified columns, the added 'continent'
        column, and 'native.country' removed.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``data``.
    """
    df = data.copy()

    # Simplify workclass feature
    df['workclass'] = df['workclass'].replace(['Never-worked', 'Without-pay'], 'Other')

    # Simplify education feature: each bucket absorbs the listed raw levels.
    education_groups = {
        'Elementary-school': ['Preschool', '1st-4th', '5th-6th'],
        'High-school-no-diploma': ['7th-8th', '9th', '10th', '11th', '12th'],
        'Associates': ['Assoc-voc', 'Assoc-acdm'],
        'Post-graduate': ['Prof-school', 'Doctorate'],
    }
    for bucket, levels in education_groups.items():
        df['education'] = df['education'].replace(levels, bucket)

    # Simplify occupation feature
    df['occupation'] = df['occupation'].replace(['?'], 'Unknown')

    # Simplify native.country feature by grouping countries into regions.
    # Spellings ('Trinadad&Tobago', 'Columbia', 'Holand-Netherlands', 'Hong')
    # must match the raw data values exactly, so they are kept as-is.
    regions = {
        'North-America': ['United-States', 'Canada'],
        # NOTE(review): 'South' is ambiguous in this dataset (possibly South
        # Korea); it is left in its original group pending confirmation.
        'Latin-America': ['Mexico', 'South', 'Jamaica', 'El-Salvador', 'Cuba', 'Puerto-Rico', 'Dominican-Republic',
                          'Trinadad&Tobago', 'Ecuador', 'Honduras', 'Haiti', 'Columbia', 'Peru', 'Guatemala',
                          'Nicaragua'],
        'Europe': ['Germany', 'Greece', 'England', 'Ireland', 'Poland', 'Italy', 'Hungary', 'Holand-Netherlands',
                   'Yugoslavia', 'Scotland', 'France', 'Portugal'],
        # 'Laos' belongs here; it was previously misfiled under 'Europe'.
        'Asia': ['Philippines', 'China', 'Vietnam', 'India', 'Japan', 'Iran', 'Cambodia', 'Taiwan', 'Hong', 'Thailand',
                 'Laos'],
    }

    # Invert to country -> region once so each row is a single dict lookup
    # instead of a scan over every region list.
    country_to_region = {
        country: region
        for region, countries in regions.items()
        for country in countries
    }

    df['continent'] = df['native.country'].apply(
        lambda country: country_to_region.get(country, 'Unknown')
    )

    # Non-mutating drop: returns a new frame rather than editing in place.
    df = df.drop(columns='native.country')

    return df

def encode_data(data):
    """One-hot encode the categorical features of a cleaned frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns 'workclass', 'education',
        'marital.status', 'occupation', 'relationship', 'sex', 'race' and
        'continent'. Any other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each listed column is replaced by its indicator
        columns; ``data`` itself is not modified.
    """
    # Features to expand into indicator columns, in output order.
    nominal_features = [
        'workclass',
        'education',
        'marital.status',
        'occupation',
        'relationship',
        'sex',
        'race',
        'continent',
    ]

    # drop_first=True omits the first level of every feature, yielding k-1
    # indicator columns for a feature with k levels.
    return pd.get_dummies(data.copy(), columns=nominal_features, drop_first=True)


# Run the two-stage preparation on the raw frame: first collapse categorical
# levels and derive the region feature, then one-hot encode the result.
interim = fix_categorical(raw)
processed = encode_data(interim)
# Save processed data (index=False keeps the row index out of the CSV)
processed.to_csv('../data/adult30k_processed.csv', index=False)
