In [1]:
import pandas as pd
from io import StringIO
import numpy as np


In [2]:
# Small in-memory CSV with two missing cells: column C in the second data row
# and column D in the third. The rows are kept flush-left so no field carries
# leading whitespace into the parser.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

In [3]:
# Wrap the CSV text in a file-like buffer so read_csv can parse it like a file
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)

In [4]:
# Display the parsed frame; the empty C and D fields show up as missing values
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [5]:
# Count the missing values in each column. isnull and isna are called the same
# way and report identical counts, so both are printed for comparison.
for find_missing in (df.isnull, df.isna):
    print(find_missing().sum())

A    0
B    0
C    1
D    1
dtype: int64
A    0
B    0
C    1
D    1
dtype: int64


In [6]:
# Underlying NumPy array of the frame; the missing entries appear as nan
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

# Eliminating training examples or features with missing values

In [7]:
# Drop every ROW that contains at least one missing value.
# A new frame is returned; df itself is left unchanged.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [8]:
# Drop every COLUMN that contains at least one missing value
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [9]:
# Drop only the rows in which ALL columns are missing (none here, so every row is kept)
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [10]:
# thresh is the minimum number of non-missing values a ROW needs in order to be kept:
# with 4 only the complete row survives, with 3 every row survives.
for min_non_missing in (4, 3):
    print(df.dropna(thresh=min_non_missing))

     A    B    C    D
0  1.0  2.0  3.0  4.0
      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN


In [11]:
# Drop the rows that are missing a value in column C; missing values in other columns are ignored
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


# Mean imputation   

In [12]:
from sklearn.impute import SimpleImputer
import numpy as np

In [13]:
# Replace each missing entry with the mean of its column
imr = SimpleImputer(strategy='mean', missing_values=np.nan)

# Learn the per-column means, then fill the gaps using them
feature_matrix = df.values
imr.fit(feature_matrix)
imputed_data = imr.transform(feature_matrix)

imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [14]:
# pandas equivalent of the SimpleImputer cell: fill each gap with its column mean
column_means = df.mean()
df.fillna(column_means)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


# Categorical encoding with Pandas

In [15]:
# Toy data set with a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
df2 = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [16]:
# Ordinal encoding for the size feature: a larger size gets a larger integer
size_mapping = dict(XL=3, L=2, M=1)

In [17]:
# Replace the size strings with their ordinal codes.
# NOTE: this overwrites df2['size'], so the cell is meant to be run once on the string labels.
ordinal_sizes = df2['size'].map(size_mapping)
df2['size'] = ordinal_sizes

df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [18]:
# Swap keys and values of size_mapping so the integer codes can be turned back into labels
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))

# Preview the decoded sizes; df2 itself is not modified here
df2['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

# Encoding class labels

In [19]:
# Binary-encode the class label: 'class1' becomes 0, everything else becomes 1
is_class1 = df2['classlabel'].eq('class1')
df2['classlabel'] = np.where(is_class1, 0, 1)

df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [20]:
# Undo the binary encoding: 0 goes back to 'class1', anything else to 'class2'
encoded_as_zero = df2['classlabel'] == 0
df2['classlabel'] = np.where(encoded_as_zero, 'class1', 'class2')

In [21]:
# using sklearn built in labeling
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()

# Learn the distinct class labels and encode them as integers in one step
class_labels = df2['classlabel'].values
y = class_le.fit_transform(class_labels)
print(y)

# Map the integer codes back to the original string labels
class_le.inverse_transform(y)

[1 0 1]


array(['class2', 'class1', 'class2'], dtype=object)

# Performing one-hot encoding on nominal features

In [22]:
## creating a function to revert the size integers back to M, L, XL
def revert_func(dataframe, mapping=None):
    """Convert the integer codes in ``dataframe['size']`` back to size labels.

    The 'size' column of ``dataframe`` is overwritten in place and the same
    frame object is returned, so callers may either use the return value or
    keep working with the frame they passed in.

    Values that are not keys of ``mapping`` (for example labels that were
    already reverted) are left unchanged, which makes the function safe to
    call more than once on the same frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a 'size' column.
    mapping : dict, optional
        Code-to-label lookup. Defaults to ``{1: 'M', 2: 'L', 3: 'XL'}``.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in, with its 'size' column replaced.
    """
    if mapping is None:
        mapping = {1: 'M', 2: 'L', 3: 'XL'}

    # Fall back to the value itself for anything outside the mapping so the
    # result always has one entry per row.
    dataframe['size'] = dataframe['size'].map(lambda code: mapping.get(code, code))
    return dataframe
    

In [23]:
# Restore the string size labels on df2 (the function overwrites df2['size'] and returns df2)
revert_func(df2)

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [24]:
# Feature matrix as a NumPy array: column 0 is color, 1 is size, 2 is price
X = df2[['color','size','price']].values

color_le = LabelEncoder()

print(X[:,0])

# Replace the color strings in column 0 with integer codes.
# NOTE: integer codes suggest an ordering between colors that does not exist;
# the one-hot encoding in the following cells avoids that.
X[:,0] = color_le.fit_transform(X[:,0])

print(X)


['green' 'red' 'blue']
[[1 'M' 10.1]
 [2 'L' 13.5]
 [0 'XL' 15.3]]


In [25]:
# ColumnTransformer method in sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Fresh feature matrix: column 0 is color, 1 is size, 2 is price
X = df2[['color','size','price']].values

# One-hot encode only column 0 (color); columns 1 and 2 are passed through untouched
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)

c_transf.fit_transform(X)



array([[0.0, 1.0, 0.0, 'M', 10.1],
       [0.0, 0.0, 1.0, 'L', 13.5],
       [1.0, 0.0, 0.0, 'XL', 15.3]], dtype=object)

In [26]:
# pandas' get_dummies is the most convenient way to one-hot encode string/class features:
# the string columns are expanded into indicator columns while price is kept as is.
columns_to_encode = ['price', 'color', 'size']
df_one_hot = pd.get_dummies(df2[columns_to_encode], dtype=int)

df_one_hot

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,0,1,0,0,1,0
1,13.5,0,0,1,1,0,0
2,15.3,1,0,0,0,0,1


In [27]:
# drop_first=True drops the first indicator column of every encoded feature
# (here color_blue and size_L), since it is implied by the remaining ones.
pd.get_dummies(df2[['price','color','size']], drop_first=True, dtype=int)

Unnamed: 0,price,color_green,color_red,size_M,size_XL
0,10.1,1,0,1,0
1,13.5,0,1,0,0
2,15.3,0,0,0,1


In [28]:
# dtype=bool yields True/False indicator columns instead of 0/1
pd.get_dummies(df2[['price','color','size']], dtype=bool, drop_first=False)

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,False,True,False,False,True,False
1,13.5,False,False,True,True,False,False
2,15.3,True,False,False,False,False,True


In [29]:
# OPTIONAL ENCODING METHODS -- encode the ordinal size feature with threshold indicators.
# Two 0/1 columns are created: 'x > M' (size is L or XL) and 'x > L' (size is XL).

# isin performs the membership test on the whole column at once
df2['x > M'] = df2['size'].isin(['L', 'XL']).astype('int64')

df2['x > L'] = df2['size'].isin(['XL']).astype('int64')

df2

Unnamed: 0,color,size,price,classlabel,x > M,x > L
0,green,M,10.1,class2,0,0
1,red,L,13.5,class1,1,0
2,blue,XL,15.3,class2,1,1
