In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import seaborn as sns


In [2]:
# Loading dataset
# Column names are supplied explicitly through `names=` when the file is read.
labels = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "group",
]

# NOTE(review): absolute, machine-specific location -- point this at your own
# copy of adult.data before running the notebook elsewhere.
DATA_PATH = 'G:\\semester 8\\Data Mining\\Tugas\\1\\adult.data'

# From dataset description, we know that missing values or unknown values are
# marked with "?" value, so they are parsed as NaN.
# The separator is a regular expression (comma followed by one whitespace
# character), which is why the python parsing engine is requested. It is
# written as a raw string so that "\s" is not an invalid string escape.
dataset = pd.read_csv(DATA_PATH, names=labels, sep=r',\s',
                      na_values=["?"], engine='python')

In [3]:
# Rows of the table are instances, columns are attributes.
n_instances, n_attributes = dataset.shape
print('Number of instances = %d' % n_instances)
print('Number of attributes = %d' % n_attributes)

Number of instances = 32561
Number of attributes = 15


In [4]:
# Count the NaN entries of every column in one pass, then list them
# one column per line in the original column order.
missing_per_column = dataset.isnull().sum()
print('Number of missing values:')
for column_name, n_missing in missing_per_column.items():
    print('\t%s: %d' % (column_name, n_missing))

Number of missing values:
	age: 0
	workclass: 1836
	fnlwgt: 0
	education: 0
	education-num: 0
	marital-status: 0
	occupation: 1843
	relationship: 0
	race: 0
	sex: 0
	capital-gain: 0
	capital-loss: 0
	hours-per-week: 0
	native-country: 583
	group: 0


In [5]:
# Method 1
# Dropping NaN values: every row containing at least one NaN is removed.
# The frame loaded earlier as `dataset` is reused instead of parsing the file
# again; dropna() returns a new frame, so `dataset` itself stays unchanged and
# this cell can be re-run safely.
print("Data shape before drop :", dataset.shape)
dataset_drop = dataset.dropna()
print("Data shape after drop :", dataset_drop.shape)

# Every per-column count printed below should now be 0.
for col in dataset_drop.columns:
    print('\t%s: %d' % (col, dataset_drop[col].isnull().sum()))

Data shape before drop : (32561, 15)
Data shape after drop : (30162, 15)
	age: 0
	workclass: 0
	fnlwgt: 0
	education: 0
	education-num: 0
	marital-status: 0
	occupation: 0
	relationship: 0
	race: 0
	sex: 0
	capital-gain: 0
	capital-loss: 0
	hours-per-week: 0
	native-country: 0
	group: 0


In [6]:
# Method 2
# Using imputer with most frequent categorical

from sklearn.base import TransformerMixin



class DataFrameImputer(TransformerMixin):
    """Impute missing values in a DataFrame, column by column.

    Columns of dtype object are imputed with the most frequent value
    in column.

    Columns of other types are imputed with mean of column.

    After ``fit`` the chosen replacement for each column is stored in
    ``self.fill``, a Series indexed by the column names of the fitted frame.
    """

    @staticmethod
    def _fill_value(column):
        """Return the value used to replace NaN entries of ``column``."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() skips NaN by default, so a column with no
            # non-missing entries produces an empty result. Fall back to NaN
            # (the same thing mean() gives for an all-NaN numeric column)
            # instead of raising IndexError on index[0].
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        ``y`` is accepted for API compatibility and ignored.
        Returns ``self`` so calls can be chained.
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return ``X`` with NaN entries replaced by the fitted fill values.

        ``X`` is not modified; ``fillna`` is called without ``inplace``.
        """
        return X.fillna(self.fill)



# Missing values per column before imputation. The frame loaded earlier as
# `dataset` is reused instead of parsing the file again.
for col in dataset.columns:
    print('\t%s: %d' % (col, dataset[col].isnull().sum()))

# fit_transform learns the per-column fill values from `dataset` and returns
# a filled frame; `dataset` itself is left unchanged because transform() uses
# a non-inplace fillna.
imputer = DataFrameImputer()
dataset_impute = imputer.fit_transform(dataset)
print('\n')

# Missing values per column after imputation.
for col in dataset_impute.columns:
    print('\t%s: %d' % (col, dataset_impute[col].isnull().sum()))

	age: 0
	workclass: 1836
	fnlwgt: 0
	education: 0
	education-num: 0
	marital-status: 0
	occupation: 1843
	relationship: 0
	race: 0
	sex: 0
	capital-gain: 0
	capital-loss: 0
	hours-per-week: 0
	native-country: 583
	group: 0


	age: 0
	workclass: 0
	fnlwgt: 0
	education: 0
	education-num: 0
	marital-status: 0
	occupation: 0
	relationship: 0
	race: 0
	sex: 0
	capital-gain: 0
	capital-loss: 0
	hours-per-week: 0
	native-country: 0
	group: 0
