# Data Cleaning in NumPy

In [1]:
# Import the Numpy library
import numpy as np

## Arrays with Outliers

In [2]:
# Sample measurements: mostly small integers plus one extreme value (99)
DataWithOutlier = np.asarray((2, 1, 1, 99, 1, 5, 3, 1, 4, 3))
# Use len to determine the number of elements

### Remove outliers from arrays

In [3]:
# Work on an independent copy so that changes to x cannot affect DataWithOutlier
x = np.copy(DataWithOutlier)

In [4]:
# Upper cutoff for acceptable values: two standard deviations above the mean
LimitHi = x.mean() + 2 * x.std()
# Display the upper cutoff
LimitHi

np.float64(70.06203578931762)

In [5]:
# Lower cutoff for acceptable values: two standard deviations below the mean
LimitLo = x.mean() - 2 * x.std()
# Display the lower cutoff
LimitLo

np.float64(-46.06203578931762)

In [6]:
# Boolean mask: True where a value lies between LimitLo and LimitHi (inclusive)
FlagGood = np.logical_and(x >= LimitLo, x <= LimitHi)
# What type of variable is FlagGood? Check the Variable explorer.

# Display the mask
FlagGood

array([ True,  True,  True, False,  True,  True,  True,  True,  True,
        True])

In [7]:
# Indexing with the boolean mask returns only the values within the limits.
# (A bare expression in the middle of a cell is evaluated but not displayed.)
x[FlagGood]

# Overwrite x with the selected values
x = x[FlagGood]

# Display the filtered data set (the last expression of a cell is shown)
x
# Use len to determine the number of elements in x

array([2, 1, 1, 1, 5, 3, 1, 4, 3])

### Replace outliers in numpy arrays (Imputation)

In [8]:
# Start over from a fresh copy of the original data
y = np.copy(DataWithOutlier)

In [9]:
# Boolean mask: True where a value falls outside the limits
FlagBad = np.logical_or(y < LimitLo, y > LimitHi)

# Display the mask
FlagBad

array([False, False, False,  True, False, False, False, False, False,
       False])

In [10]:
# Replace outliers with the mean of the whole array (the outlier included).
# y has an integer dtype, so the assigned mean is converted to an integer.
y[FlagBad] = y.mean()

# Display the values of y
y

array([ 2,  1,  1, 12,  1,  5,  3,  1,  4,  3])

In [11]:
# FlagGood is the complement of FlagBad
FlagGood = ~FlagBad

# y holds integers, so a fractional mean would be silently truncated on
# assignment (21/9 = 2.33... would be stored as 2). Convert to float first.
y = y.astype(float)

# Replace outliers with the mean of non-outliers
y[FlagBad] = np.mean(y[FlagGood])

# See the values of y
y

array([2.        , 1.        , 1.        , 2.33333333, 1.        ,
       5.        , 3.        , 1.        , 4.        , 3.        ])

In [12]:
# Get the sample data as floats: the median of an even-length array can be
# fractional (here 2.5) and would be truncated if stored in an integer array
z = DataWithOutlier.astype(float)

# Replace outliers with the median of the whole array
z[FlagBad] = np.median(z)

# See the values of z
z

array([2. , 1. , 1. , 2.5, 1. , 5. , 3. , 1. , 4. , 3. ])

## Arrays with Missing Values

In [13]:
# Sample data in which two entries are missing, marked by " " and "?".
# Mixing text with numbers makes NumPy store every element as a string.
a = np.asarray((2, 1, " ", 1, 99, 1, 5, 3, "?", 1, 4, 3))

### Remove Missing Values

In [14]:
# Attempt to tally values that are larger than 4.
# This fails: the elements of a are strings, and NumPy cannot compare a
# string array with an integer.
sum(a > 4)

UFuncTypeError: ufunc 'greater' did not contain a loop with signature matching types (<class 'numpy.dtypes.StrDType'>, <class 'numpy.dtypes._PyLongDType'>) -> None

In [15]:
# Find out the data type for a (the container itself):
print(type(a))

# Find out the data type for the elements in the array.
# A name such as "str672" denotes fixed-width text: 672 bits = 21 characters x 32 bits.
print(a.dtype.name)

<class 'numpy.ndarray'>
str672


In [18]:
# Flag an element as good unless it is one of the known placeholder texts
FlagGood = ~((a == "?") | (a == " "))
FlagGood

array([ True,  True, False,  True,  True,  True,  True,  True, False,
        True,  True,  True])

In [19]:
# Flag the elements whose text consists only of digits
# (str.isdigit is False for blanks, "?", and also for text like "-3" or "2.5")
FlagGood = list(map(str.isdigit, a))
FlagGood

[True, True, False, True, True, True, True, True, False, True, True, True]

In [20]:
# Keep only the entries flagged as numeric text
a = a[np.asarray(FlagGood, dtype=bool)]

a

array(['2', '1', '1', '99', '1', '5', '3', '1', '4', '3'], dtype='<U21')

In [21]:
# Attempt to tally values that are larger than 4.
# Still fails: the remaining elements look like numbers but are stored as strings.
sum(a > 4)

UFuncTypeError: ufunc 'greater' did not contain a loop with signature matching types (<class 'numpy.dtypes.StrDType'>, <class 'numpy.dtypes._PyLongDType'>) -> None

In [22]:
# The elements are digit-only strings; cast them from text (string) to real
# integer values so that numeric comparisons work
a = a.astype(int)

a

array([ 2,  1,  1, 99,  1,  5,  3,  1,  4,  3])

In [23]:
# Tally values that are larger than 4: the comparison yields a boolean array,
# and summing it counts the True entries
sum(a > 4)

np.int64(2)

### Replace Missing Values (Imputation)

In [24]:
# Recreate the sample data with its two missing entries (" " and "?");
# because text is mixed in, every element is stored as a string
a = np.asarray((2, 1, " ", 1, 99, 1, 5, 3, "?", 1, 4, 3))

In [None]:
# Add Code here