In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Handling missing data with numpy

In [3]:
# A string placeholder such as '---' makes NumPy store every element as text:
# the dtype printed below is a unicode string type, not a numeric one.
x = np.array([1,2,3,'---',5])
print(x.dtype)
# Summing the string-typed array raises TypeError (see the recorded output).
x.sum()

<U21


TypeError: cannot perform reduce with flexible type

One option is to replace the missing value with `None`, but the array then gets `object` dtype and the sum still fails:

In [4]:
# With None as the placeholder the array falls back to dtype=object
# (printed below).
x = np.array([1,2,3,None,5])
print(x.dtype)
# The reduction ends up evaluating int + None, which raises TypeError
# (see the recorded output).
x.sum()

object


TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [5]:
# np.nan is a float, so the array keeps a native float64 dtype; the sum no
# longer raises, but NaN propagates through it and the result is nan.
x = np.array([1.0, 2.0, 3.0, np.nan, 5.0])
print(x.dtype)
np.sum(x)

float64


nan

In [6]:
# Boolean mask of the valid (non-NaN) entries, derived from the data itself so
# it cannot drift out of sync with `x` if the array is edited later.
# For the current `x` this is [True, True, True, False, True].
x_b = ~np.isnan(x)

In [7]:
# Sum only the entries selected by the boolean mask.
x[x_b].sum()

11.0

In [8]:
# Mean of the entries selected by the boolean mask.
np.mean(x[x_b])  # 11 / 4

2.75

In [9]:
# Build the masked array by masking every invalid (NaN/inf) entry of `x`
# instead of spelling the mask out by hand; for the current `x` the resulting
# mask is [False, False, False, True, False], exactly as before.
masked_x = np.ma.masked_invalid(x)

In [10]:
# Masked entries are excluded from the mean.
masked_x.mean()

2.75

#### Handling missing data in dataframes

In [11]:
# Read the rooms table from the working directory with pandas' default
# missing-value handling, then preview the first five rows.
df = pd.read_csv('rooms.csv')
df.head(n=5)

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,,Empty,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,,Chemical,N


In [12]:
# Column dtypes after the default read: Num_Students comes back as object
# rather than a numeric dtype (see the output below).
df.dtypes

Room_Number     float64
Num_Students     object
Department       object
Occupied         object
dtype: object

Storing numeric data with `object` dtype slows processing down significantly, as the two timings below show:

In [13]:
%%timeit
# Baseline: sum 100,000 integers stored with a native integer dtype.
np.arange(100000,dtype = "int").sum()

364 µs ± 44 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
%%timeit 
# Same reduction on an object-dtype array, to compare against the native
# integer timing.
np.arange(100000,dtype = "object").sum()

7.87 ms ± 414 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
# Number of missing entries in the Room_Number column.
df['Room_Number'].isna().sum()

1

In [16]:
# Missing-value count for every column (the sum runs down the rows by default).
df.isna().sum()

Room_Number     1
Num_Students    3
Department      0
Occupied        1
dtype: int64

In [17]:
# Tokens that should be parsed as missing values when the CSV is read.
missing_values = [
    "NA",
    "n/a",
    "na",
    "Empty",
    "--",
]

In [18]:
# Re-read the file, this time treating every token in `missing_values` as NaN.
df = pd.read_csv(
    "rooms.csv",
    na_values=missing_values,
)

In [19]:
# Boolean frame marking which cells are missing after the re-read.
df.isna()

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,False,False,False,False
1,False,True,True,False
2,False,False,False,False
3,False,False,False,False
4,False,True,False,False
5,True,False,False,False
6,False,False,False,True
7,False,True,False,False
8,False,True,False,True
9,False,False,False,False


In [20]:
# Display the frame as re-read with the custom na_values list.
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,,,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,,Chemical,N
5,,1.0,Electrical,Y
6,107.0,3.0,Civil,
7,108.0,,CS,Y
8,109.0,,Mechanical,
9,110.0,2.0,CS,N


In [21]:
# Distinct Department values, in order of first appearance.
pd.unique(df['Department'])

array(['Mechanical', nan, 'Electrical', 'Chemical', 'Civil', 'CS'],
      dtype=object)

In [22]:
# Column dtypes after re-reading with na_values: Num_Students is now float64
# (see the output below).
df.dtypes

Room_Number     float64
Num_Students    float64
Department       object
Occupied         object
dtype: object

In [23]:
# Treat a missing Occupied flag as "N".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['Occupied'] selection: an inplace method on a
# selected column is chained assignment, which warns in recent pandas and does
# not update `df` at all once Copy-on-Write is enabled.
df['Occupied'] = df['Occupied'].fillna("N")

In [24]:
# Display the frame after filling the Occupied column.
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,,,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,,Chemical,N
5,,1.0,Electrical,Y
6,107.0,3.0,Civil,N
7,108.0,,CS,Y
8,109.0,,Mechanical,N
9,110.0,2.0,CS,N


In [25]:
# Forward-fill Department: each missing entry takes the last valid value
# above it. Series.ffill() replaces fillna(method="ffill") (a.k.a. "pad"),
# whose `method` argument is deprecated. The result is assigned back because
# an inplace call on the df['Department'] selection is chained assignment and
# does not reliably update `df`.
df['Department'] = df['Department'].ffill()

In [26]:
# Display the frame after forward-filling the Department column.
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,,Mechanical,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,,Chemical,N
5,,1.0,Electrical,Y
6,107.0,3.0,Civil,N
7,108.0,,CS,Y
8,109.0,,Mechanical,N
9,110.0,2.0,CS,N


In [27]:
# Replace missing student counts with the column median (median() skips NaN).
# The same pattern works for the mean. The result is assigned back rather than
# using inplace=True on the df['Num_Students'] selection, which is chained
# assignment and does not reliably update `df`.
num_students_median = df['Num_Students'].median()
df['Num_Students'] = df['Num_Students'].fillna(num_students_median)

In [28]:
# Display the frame after filling Num_Students with the median.
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,2.0,Mechanical,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,2.0,Chemical,N
5,,1.0,Electrical,Y
6,107.0,3.0,Civil,N
7,108.0,2.0,CS,Y
8,109.0,2.0,Mechanical,N
9,110.0,2.0,CS,N


In [29]:
# Fill the missing room number by interpolating between its neighbours
# (linear by default). For curved patterns pass e.g. method="polynomial" or
# method="spline" together with order=k.
# The result is assigned back rather than using inplace=True on the
# df['Room_Number'] selection, which is chained assignment and does not
# reliably update `df`.
df['Room_Number'] = df['Room_Number'].interpolate()