# Data Preprocessing Case Study: Food Consumption

In [1]:
import os
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
import warnings
from pickle import dump, load

In [2]:
# config

# Show every column when displaying wide frames. The option name is spelled out
# in full: pandas resolves abbreviated names by pattern matching, which can
# become ambiguous if a similarly named option is added in a later release.
pd.set_option('display.max_columns', None)

## Load Data

In [3]:
PATH = os.getcwd()
PROJECT_SUBDIR = 'drive/MyDrive/preprocessing'

# Re-running this cell in the same kernel must not try to descend into the
# project folder a second time, so only change directory when the current
# working directory is not already that folder.
if not PATH.replace(os.sep, '/').endswith('/' + PROJECT_SUBDIR):
  os.chdir(os.path.join(PATH, PROJECT_SUBDIR))

In [4]:
!pwd # check working directory

/content/drive/MyDrive/preprocessing


In [5]:
# Load the raw consumption dataset; the path is relative to the current working directory
df = pd.read_csv('data/data_consumption.csv')

In [6]:
# Preview the first rows to see the raw column names and value formats
df.head()

Unnamed: 0,(A1) CITY,(A2) PROVINCE,(B3) GENDER,(B4) AGE,(B5) RELIGION,(B6) ETHNIC,(B7) EDUCATION LEVEL,(B8) LIVED IN URBAN AREA (YEARS),(B9) OCCUPATION,(B10) SOCIAL CLAS,(B11) HOUSEHOLD MEMBER,(C12) HOUSEHOLD INCOME (MONTHLY IN MILLION IDR),(C13) TIME OF INCOME RECEIPT,(C14) HOUSEHOLD EXPENDITURE (MONTHLY IN MILLION IDR),(C15) % MONTHLY EXPENDITURE FOR FOOD,(D16) HEALTH FACTOR,(D17) PRICE FACTOR),(E18) RELIGIOUS VALUE IN TYPE OF FOOD,(E19) RELIGIOUS VALUE IN EATING PLACES,(E20) RELIGIOUS VALUE IN CONSUMPTION PATTERN,(E21) CUSTOM VALUE IN TYPE OF FOOD,(E22) CUSTOM VALUE IN EATING PLACES,(E23) CUSTOM VALUE IN CONSUMPTION PATTERN,(24) PRACTICE OF FOOD CONSUMPTION BASED ON RELIGIOUS RULES,(25) PRACTICE OF FOOD CONSUMPTION BASED ON CUSTOM RULES,(E26) BUYING FOOD PRODUCT IN THE RIGHT PLACE BASED ON RELIGIOUS RULES,(E27 TYPE OF DISHES VARIATION BASED ON RELIGIOUS RULES,(E28) PRACTICE OF CONSUMING TRADITIONAL FOOD
0,Jakarta,DKI Jakarta,Female,31,Islam,Java,Senior High School,31,Private Employees,Lower Class,5,3:08,Daily,3:02,57,Disagree,Agree,Strongly Agree,Neither agree nor disagree,Agree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree,Agree
1,Jakarta,DKI Jakarta,Female,36,Islam,Java,Bachelor degree,32,Private Employees,Lower Class,6,4:02,Weekly,4:01,43,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Agree,Strongly Agree,Agree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree
2,Jakarta,DKI Jakarta,Male,24,Islam,Java,Senior High School,24,Private Employees,Middle Class,5,5:08,Weekly,5:05,38,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree
3,Jakarta,DKI Jakarta,Female,63,Islam,Other,Senior High School,32,Private Employees,Lower Class,5,3:01,Daily,2:08,61,Neither agree nor disagree,Agree,Agree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Agree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree
4,Jakarta,DKI Jakarta,Male,43,Protestant,Batak,Primary School,43,Government Employees,Lower Class,6,3:04,Daily,3:01,58,Strongly Disagree,Strongly Agree,Neither agree nor disagree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Disagree,Agree,Agree


In [7]:
# Column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 28 columns):
 #   Column                                                                 Non-Null Count  Dtype 
---  ------                                                                 --------------  ----- 
 0   (A1) CITY                                                              710 non-null    object
 1   (A2) PROVINCE                                                          710 non-null    object
 2   (B3) GENDER                                                            710 non-null    object
 3   (B4) AGE                                                               710 non-null    int64 
 4   (B5) RELIGION                                                          710 non-null    object
 5   (B6) ETHNIC                                                            710 non-null    object
 6   (B7) EDUCATION LEVEL                                                   710 non-null    object
 7  

## Processing
The preprocessing consists of the following steps:
1. Column preprocessing: clean the column names.
2. Convert income and expenditure to float.
3. One-hot encode the categorical (nominal) columns.
4. Label encode the categorical (ordinal) columns.

## 1. Column Preprocessing

In [8]:
def col_preprocessing(x):
  """Turn a raw survey header into a lowercase snake_case column name.

  Args:
    x: Original column label, e.g. '(B8) LIVED IN URBAN AREA (YEARS)'.

  Returns:
    The cleaned label containing only the letters a-z and underscores,
    e.g. 'lived_in_urban_area'.
  """
  name = x.lower()
  # One header opens its question code with '(e27' but never closes it, so the
  # generic parenthesis removal below cannot catch it; drop it explicitly.
  name = name.replace('(e27', '')
  # Spell out the percent sign before anything non-alphabetic is discarded.
  name = name.replace('%', 'percent')
  # Remove every '(...)' group together with its content (question codes, units).
  name = re.sub(r'\(.*?\)', '', name)
  # Trim the edges, then join the remaining words with single underscores.
  name = name.strip()
  name = re.sub(r'\s+', '_', name)
  # Finally discard whatever is still not a lowercase letter or underscore.
  return re.sub(r'[^a-z_]', '', name)

In [9]:
# Clean every column label; `rename` accepts the cleaning function directly
df_test = df.rename(columns=col_preprocessing)
df_test

Unnamed: 0,city,province,gender,age,religion,ethnic,education_level,lived_in_urban_area,occupation,social_clas,household_member,household_income,time_of_income_receipt,household_expenditure,percent_monthly_expenditure_for_food,health_factor,price_factor,religious_value_in_type_of_food,religious_value_in_eating_places,religious_value_in_consumption_pattern,custom_value_in_type_of_food,custom_value_in_eating_places,custom_value_in_consumption_pattern,practice_of_food_consumption_based_on_religious_rules,practice_of_food_consumption_based_on_custom_rules,buying_food_product_in_the_right_place_based_on_religious_rules,type_of_dishes_variation_based_on_religious_rules,practice_of_consuming_traditional_food
0,Jakarta,DKI Jakarta,Female,31,Islam,Java,Senior High School,31,Private Employees,Lower Class,5,3:08,Daily,3:02,57,Disagree,Agree,Strongly Agree,Neither agree nor disagree,Agree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree,Agree
1,Jakarta,DKI Jakarta,Female,36,Islam,Java,Bachelor degree,32,Private Employees,Lower Class,6,4:02,Weekly,4:01,43,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Agree,Strongly Agree,Agree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree
2,Jakarta,DKI Jakarta,Male,24,Islam,Java,Senior High School,24,Private Employees,Middle Class,5,5:08,Weekly,5:05,38,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree
3,Jakarta,DKI Jakarta,Female,63,Islam,Other,Senior High School,32,Private Employees,Lower Class,5,3:01,Daily,2:08,61,Neither agree nor disagree,Agree,Agree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Agree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree
4,Jakarta,DKI Jakarta,Male,43,Protestant,Batak,Primary School,43,Government Employees,Lower Class,6,3:04,Daily,3:01,58,Strongly Disagree,Strongly Agree,Neither agree nor disagree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Disagree,Agree,Agree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,Denpasar,Bali,Female,64,Islam,Java,Junior High School,24,Government Employees,Middle Class,6,6:07,Weekly,6:07,34,Neither agree nor disagree,Disagree,Neither agree nor disagree,Strongly Agree,Neither agree nor disagree,Agree,Strongly Agree,Strongly Agree,Strongly Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Neither agree nor disagree
706,Denpasar,Bali,Male,51,Protestant,Java,Master and doctoral degree,24,Enterpreneur,Upper Class,6,10:09,Monthly,10:09,26,Neither agree nor disagree,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree
707,Denpasar,Bali,Male,26,Hindu,Batak,Senior High School,26,Enterpreneur,Lower Class,6,3:09,Daily,3:09,57,Strongly Agree,Neither agree nor disagree,Strongly Agree,Agree,Strongly Agree,Strongly Agree,Agree,Agree,Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree
708,Denpasar,Bali,Male,30,Hindu,Bali,Senior High School,30,Enterpreneur,Upper Class,6,9:08,Monthly,9:06,24,Strongly Agree,Neither agree nor disagree,Strongly Agree,Neither agree nor disagree,Strongly Agree,Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree,Agree,Strongly Agree,Strongly Agree


In [10]:
# List the cleaned column names
df_test.columns

Index(['city', 'province', 'gender', 'age', 'religion', 'ethnic',
       'education_level', 'lived_in_urban_area', 'occupation', 'social_clas',
       'household_member', 'household_income', 'time_of_income_receipt',
       'household_expenditure', 'percent_monthly_expenditure_for_food',
       'health_factor', 'price_factor', 'religious_value_in_type_of_food',
       'religious_value_in_eating_places',
       'religious_value_in_consumption_pattern',
       'custom_value_in_type_of_food', 'custom_value_in_eating_places',
       'custom_value_in_consumption_pattern',
       'practice_of_food_consumption_based_on_religious_rules',
       'practice_of_food_consumption_based_on_custom_rules',
       'buying_food_product_in_the_right_place_based_on_religious_rules',
       'type_of_dishes_variation_based_on_religious_rules',
       'practice_of_consuming_traditional_food'],
      dtype='object')

In [11]:
# Dtypes and non-null counts after renaming the columns
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 28 columns):
 #   Column                                                           Non-Null Count  Dtype 
---  ------                                                           --------------  ----- 
 0   city                                                             710 non-null    object
 1   province                                                         710 non-null    object
 2   gender                                                           710 non-null    object
 3   age                                                              710 non-null    int64 
 4   religion                                                         710 non-null    object
 5   ethnic                                                           710 non-null    object
 6   education_level                                                  710 non-null    object
 7   lived_in_urban_area                                  

## 2. Numerical Processing
Here we convert `household_expenditure` and `household_income` from strings such as `3:08` to floats such as `3.08` by replacing the colon with a decimal point.

In [12]:
def num_processing(df:pd.DataFrame, columns=None):
  """Convert colon-separated numeric strings (e.g. '3:08') to floats (3.08).

  Args:
    df: Frame that contains the columns to convert. It is copied, so the
      caller's frame is left untouched.
    columns: Names of the columns to convert. Defaults to
      ['household_expenditure', 'household_income'].

  Returns:
    A copy of `df` in which every requested column has a float dtype.

  Raises:
    KeyError: If a requested column is missing from `df`.
    ValueError: If a value is still not a valid number after the replacement.
  """
  if columns is None:
    columns = ['household_expenditure', 'household_income']

  df = df.copy()  # never mutate the caller's frame
  for col in columns:
    values = df[col]
    # Columns that are already numeric have no `.str` accessor; skipping the
    # replacement for them keeps the function safe to apply more than once.
    if not pd.api.types.is_numeric_dtype(values):
      values = values.str.replace(':', '.', regex=False)
    df[col] = values.astype(float)
  return df

In [13]:
# Convert the income and expenditure columns to floats and preview the result
df_test = num_processing(df_test)
df_test.head()

Unnamed: 0,city,province,gender,age,religion,ethnic,education_level,lived_in_urban_area,occupation,social_clas,household_member,household_income,time_of_income_receipt,household_expenditure,percent_monthly_expenditure_for_food,health_factor,price_factor,religious_value_in_type_of_food,religious_value_in_eating_places,religious_value_in_consumption_pattern,custom_value_in_type_of_food,custom_value_in_eating_places,custom_value_in_consumption_pattern,practice_of_food_consumption_based_on_religious_rules,practice_of_food_consumption_based_on_custom_rules,buying_food_product_in_the_right_place_based_on_religious_rules,type_of_dishes_variation_based_on_religious_rules,practice_of_consuming_traditional_food
0,Jakarta,DKI Jakarta,Female,31,Islam,Java,Senior High School,31,Private Employees,Lower Class,5,3.08,Daily,3.02,57,Disagree,Agree,Strongly Agree,Neither agree nor disagree,Agree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree,Agree
1,Jakarta,DKI Jakarta,Female,36,Islam,Java,Bachelor degree,32,Private Employees,Lower Class,6,4.02,Weekly,4.01,43,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Agree,Strongly Agree,Agree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree
2,Jakarta,DKI Jakarta,Male,24,Islam,Java,Senior High School,24,Private Employees,Middle Class,5,5.08,Weekly,5.05,38,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree
3,Jakarta,DKI Jakarta,Female,63,Islam,Other,Senior High School,32,Private Employees,Lower Class,5,3.01,Daily,2.08,61,Neither agree nor disagree,Agree,Agree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Agree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree
4,Jakarta,DKI Jakarta,Male,43,Protestant,Batak,Primary School,43,Government Employees,Lower Class,6,3.04,Daily,3.01,58,Strongly Disagree,Strongly Agree,Neither agree nor disagree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Disagree,Agree,Agree


In [14]:
# Check the resulting dtype of every column
df_test.dtypes

city                                                                object
province                                                            object
gender                                                              object
age                                                                  int64
religion                                                            object
ethnic                                                              object
education_level                                                     object
lived_in_urban_area                                                  int64
occupation                                                          object
social_clas                                                         object
household_member                                                     int64
household_income                                                   float64
time_of_income_receipt                                              object
household_expenditure    

## 3. One Hot Encoding

Before deciding whether to one-hot encode or label encode each column, it is better to check the unique values it contains. This also helps us decide which columns will be passed to the model.

In [15]:
# Print the frequency of every distinct value in each text/categorical column
categorical_frame = df_test.select_dtypes(include = ['object', 'category'])

for col in categorical_frame:
  print(f'\nColumn Name: {col}')
  counts = df_test[col].value_counts()
  print(counts.sort_values(ascending = False))


Column Name: city
Jakarta     174
Bandung     150
Denpasar    148
Makasar     120
Surabaya    118
Name: city, dtype: int64

Column Name: province
DKI Jakarta       174
West Java         150
Bali              148
South Sulawesi    120
East Java         118
Name: province, dtype: int64

Column Name: gender
Female    411
Male      299
Name: gender, dtype: int64

Column Name: religion
Islam         427
Protestant    112
Catholic      101
Hindu          66
Budha           3
Other           1
Name: religion, dtype: int64

Column Name: ethnic
Java      289
Sunda      97
Bali       73
Minang     61
Madura     58
Other      56
Batak      51
Bugis      25
Name: ethnic, dtype: int64

Column Name: education_level
Bachelor degree               212
Senior High School            181
Diploma                       156
Master and doctoral degree    100
Junior High School             37
Primary School                 24
Name: education_level, dtype: int64

Column Name: occupation
Private Employees      

In [16]:
# Nominal (unordered) categorical columns that will be one-hot encoded
nominal_col = [
    'city',
    'province',
    'gender',
    'ethnic',
    'religion',
    'occupation',
    'time_of_income_receipt',
]

In [17]:
# drop='if_binary' keeps a single indicator column for two-category features;
# handle_unknown='ignore' encodes categories unseen during fit as all zeros at
# transform time instead of raising (see the scikit-learn OneHotEncoder docs)
encoder = OneHotEncoder(drop= 'if_binary', handle_unknown = 'ignore')

In [None]:
# NOTE(review): this cell was never executed (`In [None]`) and repeats, in one
# place, the same statements as the step-by-step cells around it
# (nominal_col, encoder, fit_transform, concat) - consider keeping only one version.
nominal_col = ['city', 'province', 'gender', 'ethnic', 'religion',
               'occupation', 'time_of_income_receipt']
encoder = OneHotEncoder(drop= 'if_binary', handle_unknown = 'ignore')
encoder_array = encoder.fit_transform(df_test[nominal_col]).toarray()
encoder_col = encoder.get_feature_names_out(nominal_col)
df_test2 = pd.concat([df_test.drop(columns = nominal_col), pd.DataFrame(encoder_array, columns = encoder_col).astype(int)], axis = 1)


In [18]:
# Fit the encoder on the nominal columns and convert the result to a dense array
encoder_array = encoder.fit_transform(df_test[nominal_col]).toarray()
# Names of the generated indicator columns, e.g. `city_Bandung`
encoder_col = encoder.get_feature_names_out(nominal_col)

In [57]:
# Build the indicator frame on df_test's own index: `concat(axis=1)` aligns rows
# by index label, so a default RangeIndex would misalign the rows (and introduce
# NaN) if df_test were ever filtered or re-indexed before this step.
encoded_frame = pd.DataFrame(encoder_array, columns = encoder_col, index = df_test.index).astype(int)
df_test2 = pd.concat([df_test.drop(columns = nominal_col), encoded_frame], axis = 1)

In [58]:
# Preview the frame with the nominal columns replaced by indicator columns
df_test2.head()

Unnamed: 0,age,education_level,lived_in_urban_area,social_clas,household_member,household_income,household_expenditure,percent_monthly_expenditure_for_food,health_factor,price_factor,religious_value_in_type_of_food,religious_value_in_eating_places,religious_value_in_consumption_pattern,custom_value_in_type_of_food,custom_value_in_eating_places,custom_value_in_consumption_pattern,practice_of_food_consumption_based_on_religious_rules,practice_of_food_consumption_based_on_custom_rules,buying_food_product_in_the_right_place_based_on_religious_rules,type_of_dishes_variation_based_on_religious_rules,practice_of_consuming_traditional_food,city_Bandung,city_Denpasar,city_Jakarta,city_Makasar,city_Surabaya,province_Bali,province_DKI Jakarta,province_East Java,province_South Sulawesi,province_West Java,gender_Male,ethnic_Bali,ethnic_Batak,ethnic_Bugis,ethnic_Java,ethnic_Madura,ethnic_Minang,ethnic_Other,ethnic_Sunda,religion_Budha,religion_Catholic,religion_Hindu,religion_Islam,religion_Other,religion_Protestant,occupation_Enterpreneur,occupation_Government Employees,occupation_Independent Worker,occupation_Police Officer‎/Army,occupation_Private Employees,occupation_Teacher‎/Lecturer,time_of_income_receipt_Daily,time_of_income_receipt_Monthly,time_of_income_receipt_Weekly
0,31,Senior High School,31,Lower Class,5,3.08,3.02,57,Disagree,Agree,Strongly Agree,Neither agree nor disagree,Agree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree,Agree,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
1,36,Bachelor degree,32,Lower Class,6,4.02,4.01,43,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Agree,Strongly Agree,Agree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
2,24,Senior High School,24,Middle Class,5,5.08,5.05,38,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
3,63,Senior High School,32,Lower Class,5,3.01,2.08,61,Neither agree nor disagree,Agree,Agree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Agree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
4,43,Primary School,43,Lower Class,6,3.04,3.01,58,Strongly Disagree,Strongly Agree,Neither agree nor disagree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Disagree,Agree,Agree,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0


In [21]:
# List the columns after one-hot encoding
df_test2.columns

Index(['age', 'education_level', 'lived_in_urban_area', 'social_clas',
       'household_member', 'household_income', 'household_expenditure',
       'percent_monthly_expenditure_for_food', 'health_factor', 'price_factor',
       'religious_value_in_type_of_food', 'religious_value_in_eating_places',
       'religious_value_in_consumption_pattern',
       'custom_value_in_type_of_food', 'custom_value_in_eating_places',
       'custom_value_in_consumption_pattern',
       'practice_of_food_consumption_based_on_religious_rules',
       'practice_of_food_consumption_based_on_custom_rules',
       'buying_food_product_in_the_right_place_based_on_religious_rules',
       'type_of_dishes_variation_based_on_religious_rules',
       'practice_of_consuming_traditional_food', 'city_Bandung',
       'city_Denpasar', 'city_Jakarta', 'city_Makasar', 'city_Surabaya',
       'province_Bali', 'province_DKI Jakarta', 'province_East Java',
       'province_South Sulawesi', 'province_West Java', 'gender_M

In [22]:
# Save the fitted encoder as a pickle so it can be reused later

OUTPUT_DIR = 'output'
ohe = 'ohe_encoder.pkl'

# exist_ok=True makes the cell safe to re-run without a separate existence check
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The context manager closes the file handle (and flushes the pickle to disk)
# even if `dump` raises.
with open(os.path.join(OUTPUT_DIR, ohe), 'wb') as encoder_file:
  dump(encoder, encoder_file)

## 4. Label Encoding

In [59]:
# Ordinal (ordered) categorical columns that will be label encoded
ordinal_col = [
    'social_clas',
    'education_level',
    'health_factor',
    'price_factor',
    'religious_value_in_type_of_food',
    'religious_value_in_eating_places',
    'religious_value_in_consumption_pattern',
    'custom_value_in_type_of_food',
    'custom_value_in_eating_places',
    'custom_value_in_consumption_pattern',
    'practice_of_food_consumption_based_on_religious_rules',
    'practice_of_food_consumption_based_on_custom_rules',
    'buying_food_product_in_the_right_place_based_on_religious_rules',
    'type_of_dishes_variation_based_on_religious_rules',
    'practice_of_consuming_traditional_food',
]

In [60]:
# Column-specific category orders, listed from lowest to highest rank
order_dict = {
    'social_clas': [
        'Lower Class',
        'Middle Class',
        'Upper Class',
    ],
    'education_level': [
        'Primary School',
        'Junior High School',
        'Senior High School',
        'Diploma',
        'Bachelor degree',
        'Master and doctoral degree',
    ],
}

# Agreement scale used for every ordinal column without its own entry above
default_order = [
    'Strongly Disagree',
    'Disagree',
    'Neither agree nor disagree',
    'Agree',
    'Strongly Agree',
]

In [61]:
def encode_ordinal(df:pd.DataFrame, columns:list, order_dict:dict, default_order:list):
  """Replace ordered categorical labels with their integer rank.

  Args:
    df: Frame holding the columns to encode. It is copied, never modified.
    columns: Names of the ordinal columns. Any of them without an entry in
      `order_dict` is ranked with `default_order`.
    order_dict: Maps a column name to its categories listed from lowest to
      highest. Every column named here is encoded as well, so each key must be
      a column of `df`. The dict is only read, never modified.
    default_order: Category order used for columns missing from `order_dict`.

  Returns:
    A copy of `df` in which every encoded column holds the position of its
    label within the applicable order (0 for the first category). Labels that
    are not part of the order become NaN.

  Raises:
    KeyError: If a name in `columns` or a key of `order_dict` is not a column
      of `df`.
  """
  df = df.copy() # copy df so that the caller's frame is not affected by the change

  # Work on a private copy of the mapping: filling in the defaults directly on
  # `order_dict` would leak those entries into the caller's dictionary.
  orders = dict(order_dict)
  for col in columns:
    orders.setdefault(col, default_order)

  # map each label to its position in the column's order
  for col, order in orders.items():
    rank_of = {category: rank for rank, category in enumerate(order)}
    df[col] = df[col].map(rank_of)

  return df

In [68]:
# Encode the ordinal columns and keep the result in a new frame
df_test3 = encode_ordinal(df_test2, columns = ordinal_col,
                          order_dict=order_dict, default_order = default_order)

In [69]:
# Preview the fully encoded frame
df_test3.head()

Unnamed: 0,age,education_level,lived_in_urban_area,social_clas,household_member,household_income,household_expenditure,percent_monthly_expenditure_for_food,health_factor,price_factor,religious_value_in_type_of_food,religious_value_in_eating_places,religious_value_in_consumption_pattern,custom_value_in_type_of_food,custom_value_in_eating_places,custom_value_in_consumption_pattern,practice_of_food_consumption_based_on_religious_rules,practice_of_food_consumption_based_on_custom_rules,buying_food_product_in_the_right_place_based_on_religious_rules,type_of_dishes_variation_based_on_religious_rules,practice_of_consuming_traditional_food,city_Bandung,city_Denpasar,city_Jakarta,city_Makasar,city_Surabaya,province_Bali,province_DKI Jakarta,province_East Java,province_South Sulawesi,province_West Java,gender_Male,ethnic_Bali,ethnic_Batak,ethnic_Bugis,ethnic_Java,ethnic_Madura,ethnic_Minang,ethnic_Other,ethnic_Sunda,religion_Budha,religion_Catholic,religion_Hindu,religion_Islam,religion_Other,religion_Protestant,occupation_Enterpreneur,occupation_Government Employees,occupation_Independent Worker,occupation_Police Officer‎/Army,occupation_Private Employees,occupation_Teacher‎/Lecturer,time_of_income_receipt_Daily,time_of_income_receipt_Monthly,time_of_income_receipt_Weekly
0,31,2,31,0,5,3.08,3.02,57,1,3,4,2,3,4,2,2,2,1,2,3,3,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
1,36,4,32,0,6,4.02,4.01,43,2,4,4,3,3,4,3,3,1,2,2,2,4,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
2,24,2,24,1,5,5.08,5.05,38,2,3,2,3,2,2,1,3,1,2,2,2,2,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
3,63,2,32,0,5,3.01,2.08,61,2,3,3,3,2,3,2,3,3,2,1,2,3,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
4,43,0,43,0,6,3.04,3.01,58,0,4,2,4,2,2,2,4,4,3,1,3,3,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0


## Test Your Class

In [70]:
from src.preprocess import Processor

In [71]:
# Reload the raw CSV so the class is tested on unprocessed input
df = pd.read_csv("data/data_consumption.csv")
df.head()

Unnamed: 0,(A1) CITY,(A2) PROVINCE,(B3) GENDER,(B4) AGE,(B5) RELIGION,(B6) ETHNIC,(B7) EDUCATION LEVEL,(B8) LIVED IN URBAN AREA (YEARS),(B9) OCCUPATION,(B10) SOCIAL CLAS,(B11) HOUSEHOLD MEMBER,(C12) HOUSEHOLD INCOME (MONTHLY IN MILLION IDR),(C13) TIME OF INCOME RECEIPT,(C14) HOUSEHOLD EXPENDITURE (MONTHLY IN MILLION IDR),(C15) % MONTHLY EXPENDITURE FOR FOOD,(D16) HEALTH FACTOR,(D17) PRICE FACTOR),(E18) RELIGIOUS VALUE IN TYPE OF FOOD,(E19) RELIGIOUS VALUE IN EATING PLACES,(E20) RELIGIOUS VALUE IN CONSUMPTION PATTERN,(E21) CUSTOM VALUE IN TYPE OF FOOD,(E22) CUSTOM VALUE IN EATING PLACES,(E23) CUSTOM VALUE IN CONSUMPTION PATTERN,(24) PRACTICE OF FOOD CONSUMPTION BASED ON RELIGIOUS RULES,(25) PRACTICE OF FOOD CONSUMPTION BASED ON CUSTOM RULES,(E26) BUYING FOOD PRODUCT IN THE RIGHT PLACE BASED ON RELIGIOUS RULES,(E27 TYPE OF DISHES VARIATION BASED ON RELIGIOUS RULES,(E28) PRACTICE OF CONSUMING TRADITIONAL FOOD
0,Jakarta,DKI Jakarta,Female,31,Islam,Java,Senior High School,31,Private Employees,Lower Class,5,3:08,Daily,3:02,57,Disagree,Agree,Strongly Agree,Neither agree nor disagree,Agree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree,Agree
1,Jakarta,DKI Jakarta,Female,36,Islam,Java,Bachelor degree,32,Private Employees,Lower Class,6,4:02,Weekly,4:01,43,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Agree,Strongly Agree,Agree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree
2,Jakarta,DKI Jakarta,Male,24,Islam,Java,Senior High School,24,Private Employees,Middle Class,5,5:08,Weekly,5:05,38,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Neither agree nor disagree,Disagree,Agree,Disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree
3,Jakarta,DKI Jakarta,Female,63,Islam,Other,Senior High School,32,Private Employees,Lower Class,5,3:01,Daily,2:08,61,Neither agree nor disagree,Agree,Agree,Agree,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Agree,Neither agree nor disagree,Disagree,Neither agree nor disagree,Agree
4,Jakarta,DKI Jakarta,Male,43,Protestant,Batak,Primary School,43,Government Employees,Lower Class,6,3:04,Daily,3:01,58,Strongly Disagree,Strongly Agree,Neither agree nor disagree,Strongly Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Strongly Agree,Strongly Agree,Agree,Disagree,Agree,Agree


In [72]:
# Column groups handed to the Processor

# nominal columns
nominal_col = [
    'city',
    'province',
    'gender',
    'ethnic',
    'religion',
    'occupation',
    'time_of_income_receipt',
]

# ordinal columns
ordinal_col = [
    'social_clas',
    'education_level',
    'health_factor',
    'price_factor',
    'religious_value_in_type_of_food',
    'religious_value_in_eating_places',
    'religious_value_in_consumption_pattern',
    'custom_value_in_type_of_food',
    'custom_value_in_eating_places',
    'custom_value_in_consumption_pattern',
    'practice_of_food_consumption_based_on_religious_rules',
    'practice_of_food_consumption_based_on_custom_rules',
    'buying_food_product_in_the_right_place_based_on_religious_rules',
    'type_of_dishes_variation_based_on_religious_rules',
    'practice_of_consuming_traditional_food',
]

# numeric columns
num_col = [
    'household_expenditure',
    'household_income',
]

# Column-specific category orders, listed from lowest to highest rank
order_dict = {
    'social_clas': [
        'Lower Class',
        'Middle Class',
        'Upper Class',
    ],
    'education_level': [
        'Primary School',
        'Junior High School',
        'Senior High School',
        'Diploma',
        'Bachelor degree',
        'Master and doctoral degree',
    ],
}

# Agreement scale for the remaining ordinal columns
default_order = [
    'Strongly Disagree',
    'Disagree',
    'Neither agree nor disagree',
    'Agree',
    'Strongly Agree',
]

# Location of the pickled OneHotEncoder
encoder_path = 'output/ohe_encoder.pkl'

In [73]:
# Init
# Processor comes from src.preprocess; it is given the column groups, the
# category orders and the path of the pickled encoder defined in the previous cell
processor = Processor(nominal_col, ordinal_col, num_col, order_dict, default_order, encoder_path)

In [74]:
# Apply the Processor to the raw frame
df_preprocessed = processor.transform(df)

In [75]:
# The preview should match the manually built df_test3 shown earlier
df_preprocessed.head()

Unnamed: 0,age,education_level,lived_in_urban_area,social_clas,household_member,household_income,household_expenditure,percent_monthly_expenditure_for_food,health_factor,price_factor,religious_value_in_type_of_food,religious_value_in_eating_places,religious_value_in_consumption_pattern,custom_value_in_type_of_food,custom_value_in_eating_places,custom_value_in_consumption_pattern,practice_of_food_consumption_based_on_religious_rules,practice_of_food_consumption_based_on_custom_rules,buying_food_product_in_the_right_place_based_on_religious_rules,type_of_dishes_variation_based_on_religious_rules,practice_of_consuming_traditional_food,city_Bandung,city_Denpasar,city_Jakarta,city_Makasar,city_Surabaya,province_Bali,province_DKI Jakarta,province_East Java,province_South Sulawesi,province_West Java,gender_Male,ethnic_Bali,ethnic_Batak,ethnic_Bugis,ethnic_Java,ethnic_Madura,ethnic_Minang,ethnic_Other,ethnic_Sunda,religion_Budha,religion_Catholic,religion_Hindu,religion_Islam,religion_Other,religion_Protestant,occupation_Enterpreneur,occupation_Government Employees,occupation_Independent Worker,occupation_Police Officer‎/Army,occupation_Private Employees,occupation_Teacher‎/Lecturer,time_of_income_receipt_Daily,time_of_income_receipt_Monthly,time_of_income_receipt_Weekly
0,31,2,31,0,5,3.08,3.02,57,1,3,4,2,3,4,2,2,2,1,2,3,3,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
1,36,4,32,0,6,4.02,4.01,43,2,4,4,3,3,4,3,3,1,2,2,2,4,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
2,24,2,24,1,5,5.08,5.05,38,2,3,2,3,2,2,1,3,1,2,2,2,2,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
3,63,2,32,0,5,3.01,2.08,61,2,3,3,3,2,3,2,3,3,2,1,2,3,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
4,43,0,43,0,6,3.04,3.01,58,0,4,2,4,2,2,2,4,4,3,1,3,3,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0


In [76]:
# Write the preprocessed frame to the working directory, without the index column.
# NOTE(review): "kebotakan" (Indonesian for "baldness") looks carried over from
# another project - this frame is the food-consumption data; confirm the intended
# output file name.
df_preprocessed.to_csv("data_kebotakan_clean.csv", index = False)