In [1]:
import pandas as pd
from io import StringIO
# In-memory CSV used as the missing-data example. Missing values come from
# empty fields (e.g. ",," in a row) and from rows that stop before column D.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0',
    '32.0,,22.0',
    '43.0,,2.0',
])

In [2]:
# StringIO exposes the CSV text as a file-like object, so read_csv can parse
# it directly from memory instead of from a file on disk.
df = pd.read_csv(StringIO(csv_data))


In [3]:
# Display the parsed frame; empty fields and short rows show up as NaN.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,32.0,,22.0,
4,43.0,,2.0,


In [4]:
# Count the missing values in each column (isna flags NaN cells, sum adds
# the resulting booleans column by column).
df.isna().sum()

A    0
B    2
C    1
D    3
dtype: int64

In [5]:
# Underlying NumPy array of the frame, which is what scikit-learn estimators
# operate on.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan],
       [32., nan, 22., nan],
       [43., nan,  2., nan]])

In [6]:
# Drop every row that contains at least one NaN. This returns a new frame
# and leaves df itself unchanged.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [7]:
# Drop every column that contains at least one NaN. This returns a new frame
# and leaves df itself unchanged.
df.dropna(axis='columns')

Unnamed: 0,A
0,1.0
1,5.0
2,10.0
3,32.0
4,43.0


In [8]:
# Show df again: the dropna calls returned new frames, so the original still
# contains all of its rows, columns and NaN entries.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,32.0,,22.0,
4,43.0,,2.0,


In [9]:
import numpy as np
from sklearn.impute import SimpleImputer

# Mode imputation: each NaN is replaced by the most frequent value of its
# column. When several values are equally frequent the smallest one is used,
# which is why column C (all values distinct) is filled with 2.0.
imr = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# fit_transform learns the per-column statistic and applies it in one step;
# imr remains the fitted imputer afterwards.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6.,  2.,  8.],
       [10., 11., 12.,  4.],
       [32.,  2., 22.,  4.],
       [43.,  2.,  2.,  4.]])

In [10]:
import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputation: each NaN is replaced by the mean of the observed values
# in its column.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
# fit_transform learns the per-column means and applies them in one step;
# imr remains the fitted imputer afterwards.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  9.75      ,  8.        ],
       [10.        , 11.        , 12.        ,  6.        ],
       [32.        ,  6.33333333, 22.        ,  6.        ],
       [43.        ,  6.33333333,  2.        ,  6.        ]])

In [11]:
import numpy as np
from sklearn.impute import SimpleImputer

# Median imputation: each NaN is replaced by the median of the observed
# values in its column.
imr = SimpleImputer(strategy='median', missing_values=np.nan)
# fit_transform learns the per-column medians and applies them in one step;
# imr remains the fitted imputer afterwards.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ],
       [32. ,  6. , 22. ,  6. ],
       [43. ,  6. ,  2. ,  6. ]])

In [12]:
# pandas-only mean imputation: df.mean() yields one mean per column and
# fillna matches them to the columns by label. Returns a new frame; df is
# not modified.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,9.75,8.0
2,10.0,11.0,12.0,6.0
3,32.0,6.333333,22.0,6.0
4,43.0,6.333333,2.0,6.0


In [13]:
import pandas as pd

# Small example frame with string-valued columns (color, size, Label) and a
# numeric price column. Note that this rebinds df, replacing the
# missing-value frame used so far.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'Label'],
)
df

Unnamed: 0,color,size,price,Label
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [14]:
# Encode the size labels as integers that preserve their order (M < L < XL).
size_mapping = {
    'XL': 2,
    'L': 1,
    'M': 0,
}
# NOTE: the size column is overwritten in place, so running this cell a
# second time would map the integer codes (which are not keys of
# size_mapping) to NaN.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,Label
0,green,0,10.1,class2
1,red,1,13.5,class1
2,blue,2,15.3,class2


In [15]:
# Reverse lookup (integer code -> size label). Mapping the encoded column
# through it recovers the original labels without modifying df.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [16]:
import numpy as np

# Give each distinct class label an integer code, numbered in the sorted
# order in which np.unique returns the labels.
class_mapping = {
    label: code
    for code, label in enumerate(np.unique(df['Label']))
}
class_mapping

{'class1': 0, 'class2': 1}

In [17]:
# Replace the class labels with their integer codes from class_mapping.
# NOTE: the Label column is overwritten in place, so running this cell a
# second time would map the integer codes (which are not keys of
# class_mapping) to NaN.
df['Label'] = df['Label'].map(class_mapping)
df

Unnamed: 0,color,size,price,Label
0,green,0,10.1,1
1,red,1,13.5,0
2,blue,2,15.3,1


In [18]:
# Swap keys and values to get integer code -> original label, then use that
# lookup to restore the Label column to its string form.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df['Label'] = df['Label'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,Label
0,green,0,10.1,class2
1,red,1,13.5,class1
2,blue,2,15.3,class2
