In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
import numpy as np

In [None]:
# Small in-memory CSV used as a fixture for the missing-value examples below.
# The gaps are deliberate: the row `4, 2,, 0` leaves C empty and the row
# `9, 10, 6` has only three fields, so D is missing there.
# NOTE: the blanks after the commas are part of the data. With default
# read_csv settings the header names come out as ' B', ' C', ' D'
# (leading space included).
csv_data = '''
A, B, C, D
1,2, 3, 4
4, 2,, 0
9, 10, 6
12,12, 34, 9

'''


In [None]:
# skipinitialspace=True discards the blank that follows each comma in the CSV
# text, so the header is parsed as 'A', 'B', 'C', 'D' instead of
# 'A', ' B', ' C', ' D' (names with a hidden leading space break df['B']).
df = pd.read_csv(StringIO(csv_data), skipinitialspace=True)
df.head()

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,4,2,,0.0
2,9,10,6.0,
3,12,12,34.0,9.0


In [None]:
# Number of missing entries per column (isna is the canonical name of isnull).
df.isna().sum()

A     0
 B    0
 C    1
 D    1
dtype: int64

## Dropping NaN values

Drop the rows that contain NaN values.

In [None]:
# axis='index' is the named form of axis 0: every row holding a NaN is removed.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
3,12,12,34.0,9.0


Drop the columns that contain NaN values.

In [None]:
# axis='columns' is the named form of axis 1: every column holding a NaN is removed.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1,2
1,4,2
2,9,10
3,12,12


In [None]:
# how='all' removes a row only when every value in it is NaN;
# no row of df qualifies, so the frame comes back unchanged.
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,4,2,,0.0
2,9,10,6.0,
3,12,12,34.0,9.0


In [None]:
df.dropna(thresh=4) # keep only the rows that have at least 4 non-NaN values (rows with fewer are dropped)

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
3,12,12,34.0,9.0


In [None]:
# Only column 'A' is inspected when deciding which rows to drop.
df.dropna(axis='index', subset=['A'])

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,4,2,,0.0
2,9,10,6.0,
3,12,12,34.0,9.0


## Imputing missing values

In [None]:
from sklearn.impute import SimpleImputer

# Replace each NaN with the mean of its column.
# Despite its name, imputed_df is a plain NumPy array, not a DataFrame.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
imputed_df = imr.fit_transform(df.to_numpy())
imputed_df


array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 4.        ,  2.        , 14.33333333,  0.        ],
       [ 9.        , 10.        ,  6.        ,  4.33333333],
       [12.        , 12.        , 34.        ,  9.        ]])

In [None]:
# pandas-only alternative: fill each NaN with the mean of its own column
df.fillna(value=df.mean(axis='index'))

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,4,2,14.333333,0.0
2,9,10,6.0,4.333333
3,12,12,34.0,9.0


In [None]:
# Same approach, using each column's median as the fill value.
df.fillna(value=df.median(axis='index'))

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,4,2,6.0,0.0
2,9,10,6.0,4.0
3,12,12,34.0,9.0


# Categorical Data encoding with pandas

In [None]:
# Toy frame for the encoding examples: a nominal feature (color), an ordinal
# feature (size), a numeric feature (price) and the class label.
df_cat = pd.DataFrame(
    data=[
        ('green', 'M', 10.1, 'class2'),
        ('red', 'L', 13.5, 'class1'),
        ('blue', 'XL', 15.3, 'class2'),
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df_cat.head()

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


## Mapping ordinal features

In [None]:
# Ordinal encoding of the size labels: larger size -> larger integer.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
# Encode only while the column still holds the string labels. Mapping the
# already-encoded integers through size_mapping again would turn every value
# into NaN, so re-running this cell must leave the column untouched.
if not pd.api.types.is_numeric_dtype(df_cat['size']):
    df_cat['size'] = df_cat['size'].map(size_mapping)
df_cat.head()

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [None]:
# Integer codes for the class labels.
class_map = {'class1': 0, 'class2': 1}
# Encode only while the column still holds the string labels. Mapping the
# already-encoded integers through class_map again would turn every value
# into NaN, so re-running this cell must leave the column untouched.
if not pd.api.types.is_numeric_dtype(df_cat['classlabel']):
    df_cat['classlabel'] = df_cat['classlabel'].map(class_map)
df_cat.head()


Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


TypeError: ignored

## Performing the one-hot encoding on the nominal features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the class-label column as integer codes with scikit-learn.
class_le = LabelEncoder()
y = class_le.fit_transform(df_cat['classlabel'].to_numpy())
y

array([1, 0, 1])

In [None]:
# Feature matrix as an object array (mixed string / numeric columns).
x = df_cat.loc[:, ['color', 'size', 'price']].to_numpy()
# Overwrite the first column (color) with its integer label codes.
color_le = LabelEncoder()
x[:, 0] = color_le.fit_transform(x[:, 0])
x

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)