In [127]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Columns holding categorical (string-valued) attributes; these get one-hot encoded.
cat_feat_names = [
    "workclass",
    "education_level",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Columns holding numeric attributes; these get scaled.
num_feat_names = [
    "age",
    "education-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

In [120]:
class Data:
    """Load a CSV file and turn it into preprocessed features plus a target.

    Constructing an instance reads the file and immediately runs the whole
    preprocessing pipeline (log transform -> min-max scaling -> one-hot
    encoding). Call :meth:`split_train_test` afterwards to populate the
    ``X_train`` / ``X_test`` / ``y_train`` / ``y_test`` attributes.

    Parameters
    ----------
    csv_file : str or path-like
        Location of the CSV file; it must contain every column named in
        ``cat_feats`` and ``num_feats`` as well as an ``income`` column.
    cat_feats : sequence of str
        Names of the categorical columns (one-hot encoded).
    num_feats : sequence of str
        Names of the numeric columns (min-max scaled).
    """

    def __init__(self, csv_file, cat_feats, num_feats):
        # Copy into lists so tuples are accepted and the caller's sequences
        # are never aliased by this instance.
        self.cat_feats = list(cat_feats)
        self.num_feats = list(num_feats)
        self.features = None
        self.target = None
        self.parse_file(csv_file)
        # Filled in by split_train_test(); None until that method is called.
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.preprocess()

    def parse_file(self, csv_file):
        """Read ``csv_file`` and populate ``self.features`` and ``self.target``.

        ``self.features`` holds the categorical columns followed by the
        numeric ones; ``self.target`` is the ``income`` column as an
        ``(n_rows, 1)`` NumPy array.
        """
        df = pd.read_csv(csv_file)
        self.features = df[self.cat_feats + self.num_feats].copy()
        self.target = df["income"].to_numpy(copy=True).reshape(-1, 1)

    def log_transformation(self, feature):
        """Replace column ``feature`` with ``log(1 + x)``, in place.

        Each call transforms the current values again, so it should be
        applied once per column.
        """
        # Vectorised log1p instead of a per-element Python lambda.
        self.features[feature] = np.log1p(self.features[feature])

    def scale_data(self):
        """Min-max scale the numeric columns and move them to the front.

        NOTE: the scaler is fitted on every row of ``self.features``; no
        train/test split has happened at this point.
        """
        scaler = MinMaxScaler()
        scaled_values = scaler.fit_transform(self.features[self.num_feats])
        # Reuse the existing index so the column-wise concat below aligns row
        # for row even when the frame does not carry a default RangeIndex.
        scaled_df = pd.DataFrame(
            scaled_values, columns=self.num_feats, index=self.features.index
        )
        # Keep every non-numeric column (raw categoricals or their dummies).
        other_columns = self.features.drop(columns=self.num_feats)
        self.features = pd.concat([scaled_df, other_columns], axis=1)

    def encode_categorical_features(self):
        """One-hot encode the categorical columns that are still present.

        Safe to call more than once: if the categorical columns were already
        replaced by their dummy columns, the call leaves ``self.features``
        untouched instead of raising ``KeyError``.
        """
        pending = [col for col in self.cat_feats if col in self.features.columns]
        if not pending:
            return
        encoded_df = pd.get_dummies(self.features[pending])
        remaining = self.features.drop(columns=pending)
        self.features = pd.concat([remaining, encoded_df], axis=1)

    def preprocess(self):
        """Run the full pipeline: log transform, scale, then one-hot encode."""
        self.log_transformation("capital-loss")
        self.log_transformation("capital-gain")
        self.scale_data()
        self.encode_categorical_features()

    def split_train_test(self, test_size=0.2, random_state=42):
        """Split features/target and store the parts on the instance.

        Parameters
        ----------
        test_size : float, default 0.2
            Forwarded to ``train_test_split`` as ``test_size``.
        random_state : int or None, default 42
            Forwarded to ``train_test_split`` as ``random_state``; a fixed
            value makes the split reproducible across runs.
        """
        (
            self.X_train,
            self.X_test,
            self.y_train,
            self.y_test,
        ) = train_test_split(
            self.features,
            self.target,
            test_size=test_size,
            random_state=random_state,
        )
        
        

In [122]:
# Build the dataset object; the constructor reads the CSV and preprocesses it.
data_ = Data(
    csv_file="census.csv",
    cat_feats=cat_feat_names,
    num_feats=num_feat_names,
)

In [123]:
# Feature matrix dimensions as (rows, columns) right after construction.
# NOTE(review): the recorded output (45222, 13) equals the 13 raw columns, yet
# preprocess() as defined above already one-hot encodes; the output looks
# stale -- re-run on a fresh kernel to confirm.
data_.features.shape

(45222, 13)

In [124]:
# NOTE(review): Data.__init__ already calls preprocess(), which ends with
# encode_categorical_features(); confirm whether this manual call is still needed.
data_.encode_categorical_features()

In [126]:
# Feature matrix dimensions as (rows, columns) after the encoding call.
data_.features.shape

(45222, 103)