In [16]:
import pandas as pd
from io import StringIO

# Small hand-written CSV with deliberately missing cells (empty fields parse as NaN).
# Column I is empty in every row; column G has a value only in the first row.
csv_data = (
    "A,B,C,D,E,F,G,H,I,J\n"
    "1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,,10.0\n"
    "5.0,6.0,,7.0,,,,,,\n"
    "10.0,11.0,12.0,,2.0,3.0,,2.0,,2.0"
)

# StringIO exposes the text as a file-like object so read_csv can parse it.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,,10.0
1,5.0,6.0,,7.0,,,,,,
2,10.0,11.0,12.0,,2.0,3.0,,2.0,,2.0


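`StringIO` wraps the CSV text in an in-memory, file-like object, so `pd.read_csv` can parse it as if it were a file. This lets us create a small dataset by hand.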

In [39]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# NaNs with no later valid value (e.g. every cell of column I) stay NaN.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling;
# it returns a new frame and leaves df untouched.
df.bfill()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,,10.0
1,5.0,6.0,12.0,7.0,2.0,3.0,,2.0,,2.0
2,10.0,11.0,12.0,,2.0,3.0,,2.0,,2.0


In [41]:
# The fill calls in this notebook are not assigned back, so df still holds
# the original values, NaNs included.
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,,10.0
1,5.0,6.0,,7.0,,,,,,
2,10.0,11.0,12.0,,2.0,3.0,,2.0,,2.0


In [40]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# A column with no valid value at all (column I) stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling;
# it returns a new frame and leaves df untouched.
df.ffill()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,,10.0
1,5.0,6.0,3.0,7.0,5.0,6.0,7.0,8.0,,10.0
2,10.0,11.0,12.0,7.0,2.0,3.0,7.0,2.0,,2.0


In [17]:
# Summarise the boolean missing-value mask.
# Note: the mask itself never contains NaN, so info() reports every column as
# "3 non-null bool"; df.info() would show the real per-column non-null counts.
df.isna().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      bool 
 1   B       3 non-null      bool 
 2   C       3 non-null      bool 
 3   D       3 non-null      bool 
 4   E       3 non-null      bool 
 5   F       3 non-null      bool 
 6   G       3 non-null      bool 
 7   H       3 non-null      bool 
 8   I       3 non-null      bool 
 9   J       3 non-null      bool 
dtypes: bool(10)
memory usage: 158.0 bytes


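# In dropna
`axis=1` drops every column that contains at least one missing value.


`axis=0` (the default) drops every row that contains at least one missing value.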

In [19]:
# Drop every column that has at least one NaN; only the fully populated
# columns (A and B here) remain. Returns a new frame.
df.dropna(axis="columns")

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [21]:
# Count the missing values in each column (True counts as 1 when summed).
df.isna().sum()

A    0
B    0
C    1
D    1
E    1
F    1
G    2
H    1
I    3
J    1
dtype: int64

In [28]:
# View the frame's contents as a 2-D NumPy array (rows x columns); NaNs are kept.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8., nan, 10.],
       [ 5.,  6., nan,  7., nan, nan, nan, nan, nan, nan],
       [10., 11., 12., nan,  2.,  3., nan,  2., nan,  2.]])

In [29]:
df.columns  # Index holding the column labels of the frame

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype='object')

In [30]:
# Show the full frame again; it still contains the original NaNs.
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,,10.0
1,5.0,6.0,,7.0,,,,,,
2,10.0,11.0,12.0,,2.0,3.0,,2.0,,2.0


In [31]:
from sklearn.impute import SimpleImputer
import numpy as np

In [32]:
# Median imputation: every NaN is replaced by the median of the observed
# values in its column (strategy='mean' works the same way).
# Column I has no observed values at all, so the result below has 9 columns
# instead of 10 and no longer lines up one-to-one with df.columns.
imr = SimpleImputer(strategy="median", missing_values=np.nan)

# Learn the per-column medians and apply them in one step.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ,  5. ,  6. ,  7. ,  8. , 10. ],
       [ 5. ,  6. ,  7.5,  7. ,  3.5,  4.5,  7. ,  5. ,  6. ],
       [10. , 11. , 12. ,  5.5,  2. ,  3. ,  7. ,  2. ,  2. ]])

In [33]:
# Mean imputation: every NaN is replaced by the mean of the observed values
# in its column. With at most two observed values per incomplete column the
# means coincide with the medians, so the output matches the median run.
# The all-NaN column I is absent from the 9-column result.
imr = SimpleImputer(strategy="mean", missing_values=np.nan)

# Learn the per-column means and apply them in one step.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ,  5. ,  6. ,  7. ,  8. , 10. ],
       [ 5. ,  6. ,  7.5,  7. ,  3.5,  4.5,  7. ,  5. ,  6. ],
       [10. , 11. , 12. ,  5.5,  2. ,  3. ,  7. ,  2. ,  2. ]])

In [34]:
# Mode imputation: every NaN is replaced by the most frequent observed value
# in its column. Where all observed values are equally frequent, the output
# below shows the smallest one being used (e.g. 3.0 for column C).
# The all-NaN column I is absent from the 9-column result.
imr = SimpleImputer(strategy="most_frequent", missing_values=np.nan)

# Learn the per-column modes and apply them in one step.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8., 10.],
       [ 5.,  6.,  3.,  7.,  2.,  3.,  7.,  2.,  2.],
       [10., 11., 12.,  4.,  2.,  3.,  7.,  2.,  2.]])

In [36]:
# Per-column means, skipping NaNs. Column I has no values, so its mean is NaN.
result_of_mean = df.mean(axis=0)
result_of_mean

A    5.333333
B    6.333333
C    7.500000
D    5.500000
E    3.500000
F    4.500000
G    7.000000
H    5.000000
I         NaN
J    6.000000
dtype: float64

In [37]:
# Fill each column's NaNs with that column's mean (matched by column label).
# Column I stays NaN because its mean is itself NaN. Returns a new frame.
df.fillna(value=result_of_mean)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,,10.0
1,5.0,6.0,7.5,7.0,3.5,4.5,7.0,5.0,,6.0
2,10.0,11.0,12.0,5.5,2.0,3.0,7.0,2.0,,2.0


In [38]:
df.dtypes   # dtype of each column; every column in this frame is float64

A    float64
B    float64
C    float64
D    float64
E    float64
F    float64
G    float64
H    float64
I    float64
J    float64
dtype: object

In [43]:
# Small categorical example: one row per item with a colour, a price and a class label.
rows = [
    ["Green", 1, "class1"],
    ["Red", 3, "Class2"],
    ["yelloe", 4, "classs4"],
]

# Name the columns at construction time instead of assigning them afterwards.
df2 = pd.DataFrame(rows, columns=["colour", "price", "label"])
df2

Unnamed: 0,colour,price,label
0,Green,1,class1
1,Red,3,Class2
2,yelloe,4,classs4
