<img src='imputing_with_statistics.jpg' alt='Imputing with statistics' style='width:350px;height:340px'>

# Using Pandas

In [1]:
import pandas as pd
# Load the pickled frame. NOTE: unpickling can execute arbitrary code, so only
# read pickle files that come from a trusted source.
wdi = pd.read_pickle('wdi.pkl')

# Split the column labels by dtype: numeric columns vs. everything else.
num_cols = wdi.select_dtypes(include=['number']).columns
cat_cols = wdi.select_dtypes(exclude=['number']).columns

## Get Mean and Median from Numeric Columns

In [2]:
# Column-wise mean of every numeric column (NaN values are skipped by default).
wdi.loc[:, num_cols].mean()

access_to_electricity_pct         8.421552e+01
atms_per_100000                   4.812857e+01
compulsory_education_years        9.701456e+00
health_expenditure_pct_of_gdp     6.433350e+00
gdp_per_capita_usd                1.359989e+04
gdp_per_capita_ppp                1.992523e+04
life_expectancy_female            7.456947e+01
life_expectancy_male              6.986652e+01
life_expectancy                   7.217627e+01
population_density                2.099738e+02
population                        3.690135e+08
alcohol_consumption_per_capita    6.016516e+00
unemployment_rate_female          8.404762e+00
unemployment_rate_male            6.465617e+00
unemployment_rate                 7.047682e+00
year                              2.017500e+03
is_region                         2.119816e-01
dtype: float64

In [3]:
# Column-wise median of every numeric column (NaN values are skipped by default).
wdi.loc[:, num_cols].median()

access_to_electricity_pct         9.891294e+01
atms_per_100000                   3.941107e+01
compulsory_education_years        1.000000e+01
health_expenditure_pct_of_gdp     6.006967e+00
gdp_per_capita_usd                5.783711e+03
gdp_per_capita_ppp                1.310845e+04
life_expectancy_female            7.608850e+01
life_expectancy_male              7.081350e+01
life_expectancy                   7.358550e+01
population_density                6.961068e+01
population                        1.604842e+07
alcohol_consumption_per_capita    5.837431e+00
unemployment_rate_female          5.836087e+00
unemployment_rate_male            5.351708e+00
unemployment_rate                 5.606500e+00
year                              2.017500e+03
is_region                         0.000000e+00
dtype: float64

## Fill Missing Values in Numeric Columns with the Mean

In [4]:
# Replace each missing numeric value with the mean of its own column.
# To impute with the median instead, swap numeric.mean() for numeric.median().
wdi[num_cols] = wdi[num_cols].pipe(
    lambda numeric: numeric.fillna(numeric.mean())
)

In [5]:
# Frequency table for one imputed column; dropna=False would list NaN as its
# own row if any missing values were left.
wdi.loc[:, 'alcohol_consumption_per_capita'].value_counts(dropna=False)

6.016516     219
0.690000       3
9.230000       2
12.030000      2
0.682988       2
            ... 
1.110000       1
5.380000       1
6.890000       1
2.730000       1
4.670000       1
Name: alcohol_consumption_per_capita, Length: 205, dtype: int64

## Fill Missing Values in Categorical Columns with the Most Frequent Value

In [6]:
# Summary of the non-numeric columns: count, unique, top (most frequent), freq.
wdi.loc[:, cat_cols].describe()

Unnamed: 0,country_name,country_category
count,434,282
unique,217,2
top,Afghanistan,DEVELOPING
freq,2,224


In [7]:
# Keep only the 'top' row of the summary: the most frequent value per column.
(
    wdi.loc[:, cat_cols]
    .describe()
    .loc['top', :]
)

country_name        Afghanistan
country_category     DEVELOPING
Name: top, dtype: object

In [8]:
# Most frequent value of each categorical column, used as the fill value for
# missing entries. DataFrame.mode() returns the modes in sorted order, so the
# first row is a deterministic choice when several values tie; the 'top' row of
# describe() is documented as being picked arbitrarily among tied values, and
# describe() also computes count/unique/freq that are not needed here.
most_freq = wdi[cat_cols].mode().iloc[0]

In [9]:
# Fill missing categorical entries column by column: most_freq is matched to
# the frame's columns by label.
wdi[cat_cols] = wdi.loc[:, cat_cols].fillna(value=most_freq)

In [11]:
# Frequency table after the fill; dropna=False would show NaN as its own row
# if any missing values were left.
wdi.loc[:, 'country_category'].value_counts(dropna=False)

DEVELOPING    376
DEVELOPED      58
Name: country_category, dtype: int64

# Working with Scikit-learn

In [12]:
from sklearn.impute import SimpleImputer
# Re-read the pickle so this section starts from the data as stored on disk
# (unpickling can execute arbitrary code; only load trusted files).
wdi = pd.read_pickle('wdi.pkl')

# Recompute the numeric / non-numeric column label sets for the reloaded frame.
num_cols = wdi.select_dtypes(include=['number']).columns
cat_cols = wdi.select_dtypes(exclude=['number']).columns

## Fill Missing Values in Numeric Columns with the Mean

In [13]:
# Fit a mean-strategy imputer on the numeric columns, then transform them.
simple_imp_mean = SimpleImputer(strategy='mean')
simple_imp_mean.fit(wdi[num_cols])
imputed_num_mean = simple_imp_mean.transform(wdi[num_cols])

# Write the imputed values back over the original numeric columns.
wdi[num_cols] = imputed_num_mean

# Frequency table for one imputed column; dropna=False would list NaN as its
# own row if any missing values were left.
wdi.loc[:, 'alcohol_consumption_per_capita'].value_counts(dropna=False)

6.016516     219
0.690000       3
9.230000       2
12.030000      2
0.682988       2
            ... 
1.110000       1
5.380000       1
6.890000       1
2.730000       1
4.670000       1
Name: alcohol_consumption_per_capita, Length: 205, dtype: int64

## Fill Missing Values in Categorical Columns with the Most Frequent Value

In [14]:
# Fit a most-frequent-strategy imputer on the non-numeric columns, then
# transform them.
simple_imp_freq = SimpleImputer(strategy='most_frequent')
simple_imp_freq.fit(wdi[cat_cols])
imputed_cat_freq = simple_imp_freq.transform(wdi[cat_cols])

# Write the imputed values back over the original categorical columns.
wdi[cat_cols] = imputed_cat_freq

# Frequency table after the fill; dropna=False would show NaN as its own row
# if any missing values were left.
wdi.loc[:, 'country_category'].value_counts(dropna=False)

DEVELOPING    376
DEVELOPED      58
Name: country_category, dtype: int64

## Adding a Missing-Value Indicator with Scikit-learn

In [15]:
# Re-read the pickle so this section starts from the data as stored on disk.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
wdi = pd.read_pickle('wdi.pkl')

In [16]:
# Mean imputer that also emits a missing-value indicator column alongside the
# imputed values.
simple_imp_ind = SimpleImputer(
    add_indicator=True,
    strategy='mean',
)

In [17]:
# Select with a one-element list so the result stays a two-dimensional frame.
wdi.loc[:, ['compulsory_education_years']]

Unnamed: 0,compulsory_education_years
0,9.0
1,9.0
2,10.0
3,6.0
4,9.0
...,...
429,10.0
430,10.0
431,10.0
432,7.0


In [18]:
# Confirm that list-based column selection yields a DataFrame.
type(wdi.loc[:, ['compulsory_education_years']])

pandas.core.frame.DataFrame

In [19]:
# Fit the indicator-enabled imputer on the single column, then transform it.
simple_imp_ind.fit(wdi[['compulsory_education_years']])
imputed_ind = simple_imp_ind.transform(wdi[['compulsory_education_years']])

# Preview the first 20 rows of the returned array.
imputed_ind[0:20]

array([[ 9.        ,  0.        ],
       [ 9.        ,  0.        ],
       [10.        ,  0.        ],
       [ 6.        ,  0.        ],
       [ 9.        ,  0.        ],
       [14.        ,  0.        ],
       [12.        ,  0.        ],
       [10.        ,  0.        ],
       [13.        ,  0.        ],
       [10.        ,  0.        ],
       [12.        ,  0.        ],
       [ 9.        ,  0.        ],
       [ 5.        ,  0.        ],
       [11.        ,  0.        ],
       [ 9.        ,  0.        ],
       [12.        ,  0.        ],
       [ 8.        ,  0.        ],
       [ 6.        ,  0.        ],
       [ 9.70145631,  1.        ],
       [14.        ,  0.        ]])

In [21]:
# Store the imputer output back on the frame: the listed columns receive the
# array's columns in order.
wdi[
    [
        'compulsory_education_years',
        'compulsory_education_years_missing',
    ]
] = imputed_ind

In [23]:
# First 20 rows of the imputed column next to its missing-value flag column.
wdi.loc[
    :,
    ['compulsory_education_years', 'compulsory_education_years_missing'],
].head(n=20)

Unnamed: 0,compulsory_education_years,compulsory_education_years_missing
0,9.0,0.0
1,9.0,0.0
2,10.0,0.0
3,6.0,0.0
4,9.0,0.0
5,14.0,0.0
6,12.0,0.0
7,10.0,0.0
8,13.0,0.0
9,10.0,0.0
