# This case study consists of 1000 marks and the solution file is expected to be submitted in the upcoming session for evaluation.

## Import libraries

In [5]:
import pandas as pd
import numpy as np

## Data input

In [6]:
# Download
DATASET = (
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
)

In [7]:
# Load Training and Test Data Sets
headers = ['age', 'workclass', 'fnlwgt', 
           'education', 'education-num', 
           'marital-status', 'occupation', 
           'relationship', 'race', 'sex', 
           'capital-gain', 'capital-loss', 
           'hours-per-week', 'native-country', 
           'predclass']
training_raw = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', 
                       header=None, 
                       names=headers,na_values=["?"])
test_raw = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', 
                      header=None, 
                      names=headers,na_values=["?"])

In [8]:
# Join Datasets
dataset_raw = training_raw.append(test_raw)
dataset_raw.reset_index(inplace=True)
dataset_raw.drop('index',inplace=True,axis=1)

## Data intial stats checking

In [9]:
dataset_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48843 entries, 0 to 48842
Data columns (total 15 columns):
age               48843 non-null object
workclass         48842 non-null object
fnlwgt            48842 non-null float64
education         48842 non-null object
education-num     48842 non-null float64
marital-status    48842 non-null object
occupation        48842 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capital-gain      48842 non-null float64
capital-loss      48842 non-null float64
hours-per-week    48842 non-null float64
native-country    48842 non-null object
predclass         48842 non-null object
dtypes: float64(5), object(10)
memory usage: 5.6+ MB


In [10]:
# Describing all the Numerical Features
dataset_raw.describe()

Unnamed: 0,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,189664.1,10.078089,1079.067626,87.502314,40.422382
std,105604.0,2.570973,7452.019058,403.004552,12.391444
min,12285.0,1.0,0.0,0.0,1.0
25%,117550.5,9.0,0.0,0.0,40.0
50%,178144.5,10.0,0.0,0.0,40.0
75%,237642.0,12.0,0.0,0.0,45.0
max,1490400.0,16.0,99999.0,4356.0,99.0


In [11]:
# Describing all the Categorical Features
dataset_raw.describe(include=['O'])

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,native-country,predclass
count,48843,48842,48842,48842,48842,48842,48842,48842,48842,48842
unique,147,9,16,7,15,6,5,2,42,4
top,36,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,898,33906,15784,22379,6172,19716,41762,32650,43832,24720


## Q1) Check for missing values

In [12]:
dataset_raw[dataset_raw.isna().any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,predclass
32561,|1x3 Cross validator,,,,,,,,,,,,,,


## Q2) For columns with missing values, print the number of rows with missing values

In [13]:
a=dataset_raw.shape[0] - dataset_raw.dropna().shape[0]
a

1

## Q3) For columns with missing values, print the percentage of rows with missing values

In [14]:
print("missing rows %",(a/len(dataset_raw)*100))

missing rows % 0.0020473762872878407


In [15]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

## Q4) Use the DataFrameImputer class defined above to impute values to rows with  missing values.

In [27]:
dataset_raw1=DataFrameImputer().fit_transform(dataset_raw)
dataset_raw1[dataset_raw1.isna().any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,predclass


## Q5) Use appropriate formatting for all columns

In [17]:
dataset_raw['occupation'].unique()
dataset_raw['occupation'] = dataset_raw['occupation'].apply(lambda x : str(x).strip())

In [18]:
dataset_raw['age'].unique()

array([39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 30, 23, 32, 40, 34, 25, 43,
       54, 35, 59, 56, 19, 20, 45, 22, 48, 21, 24, 57, 44, 41, 29, 18, 47,
       46, 36, 79, 27, 67, 33, 76, 17, 55, 61, 70, 64, 71, 68, 66, 51, 58,
       26, 60, 90, 75, 65, 77, 62, 63, 80, 72, 74, 69, 73, 81, 78, 88, 82,
       83, 84, 85, 86, 87, '|1x3 Cross validator', '25', '38', '28', '44',
       '18', '34', '29', '63', '24', '55', '65', '36', '26', '58', '48',
       '43', '20', '37', '40', '72', '45', '22', '23', '54', '32', '46',
       '56', '17', '39', '52', '21', '42', '33', '30', '47', '41', '19',
       '69', '50', '31', '59', '49', '51', '27', '57', '61', '64', '79',
       '73', '53', '77', '80', '62', '35', '68', '66', '75', '60', '67',
       '71', '70', '90', '81', '74', '78', '82', '83', '85', '76', '84',
       '89', '88', '87'], dtype=object)

In [20]:
dataset_raw['education'].unique()
dataset_raw['education'] = dataset_raw['education'].apply(lambda x : str(x).strip())

In [22]:
dataset_raw['workclass'].unique()
dataset_raw['workclass'] = dataset_raw['workclass'].apply(lambda x : str(x).strip())

In [24]:
dataset_raw['marital-status'].unique()
dataset_raw['marital-status'] = dataset_raw['marital-status'].apply(lambda x : str(x).strip())