In [1]:
import os
import pandas as pd
import json

In [2]:
base_dir = '../data/uci_adult/'
train_data = os.path.join(base_dir, 'adult.data')
test_data = os.path.join(base_dir, 'adult.test')

In [3]:
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
    "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income"
]

In [4]:
with open(train_data, "r") as train_data:
  train_df = pd.read_csv(train_data, sep=',', names=columns)

with open(test_data, "r") as test_data:
  test_df = pd.read_csv(test_data, sep=',', names=columns)

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# convert object columns to category columns
# https://stackoverflow.com/a/39906514/4082505

def convert_object_to_category(df):
    return pd.concat([
        df.select_dtypes([], ['object']),
        df.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex(df.columns, axis=1)

In [7]:
train_df = convert_object_to_category(train_df)
test_df = convert_object_to_category(test_df)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             32561 non-null  int64   
 1   workclass       32561 non-null  category
 2   fnlwgt          32561 non-null  int64   
 3   education       32561 non-null  category
 4   education-num   32561 non-null  int64   
 5   marital-status  32561 non-null  category
 6   occupation      32561 non-null  category
 7   relationship    32561 non-null  category
 8   race            32561 non-null  category
 9   sex             32561 non-null  category
 10  capital-gain    32561 non-null  int64   
 11  capital-loss    32561 non-null  int64   
 12  hours-per-week  32561 non-null  int64   
 13  native-country  32561 non-null  category
 14  income          32561 non-null  category
dtypes: category(9), int64(6)
memory usage: 1.8 MB


In [8]:
categorical_columns = train_df.select_dtypes(include='category').columns
vocabulary = {}

for column in categorical_columns:
  vocabulary[column] = list(set(train_df[column].cat.categories)-{"?"})
categorical_columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')

In [9]:
def write_to_output_file(filename, content):
    output_file_path = os.path.join(base_dir, filename)

    with open(output_file_path, mode="w") as output_file:
        output_file.write(json.dumps(content))
        output_file.close()

In [10]:
write_to_output_file('vocabulary.json', vocabulary)
print(vocabulary)

{'workclass': ['Federal-gov', 'Self-emp-not-inc', 'Self-emp-inc', 'Without-pay', 'State-gov', 'Never-worked', 'Private', 'Local-gov'], 'education': ['5th-6th', '12th', 'Assoc-voc', 'Preschool', '10th', '9th', 'Prof-school', '7th-8th', 'Doctorate', '1st-4th', '11th', 'Some-college', 'Bachelors', 'HS-grad', 'Masters', 'Assoc-acdm'], 'marital-status': ['Separated', 'Married-civ-spouse', 'Divorced', 'Widowed', 'Married-AF-spouse', 'Married-spouse-absent', 'Never-married'], 'occupation': ['Prof-specialty', 'Protective-serv', 'Tech-support', 'Transport-moving', 'Handlers-cleaners', 'Machine-op-inspct', 'Craft-repair', 'Priv-house-serv', 'Other-service', 'Exec-managerial', 'Armed-Forces', 'Sales', 'Farming-fishing', 'Adm-clerical'], 'relationship': ['Other-relative', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Husband'], 'race': ['Amer-Indian-Eskimo', 'Other', 'Black', 'White', 'Asian-Pac-Islander'], 'sex': ['Female', 'Male'], 'native-country': ['Japan', 'Greece', 'Poland', 'Peru', 'H

In [11]:
mean_and_std = {}

for key, value in train_df.describe().to_dict().items():
  mean_and_std[key] = [value['mean'], value['std']]

In [12]:
write_to_output_file('mean_and_std.json', mean_and_std)
print(mean_and_std)

{'age': [38.58164675532078, 13.640432553581341], 'fnlwgt': [189778.36651208502, 105549.97769702224], 'education-num': [10.0806793403151, 2.5727203320673877], 'capital-gain': [1077.6488437087312, 7385.292084840338], 'capital-loss': [87.303829734959, 402.9602186489998], 'hours-per-week': [40.437455852092995, 12.347428681731843]}
