## Creating array with null value

we will create an array and keep one value nan. As you can see the element at (2,0) is converted to null value. Now we will implement different techniques to fill the missing values

In [4]:
import numpy as np

X = np.arange(50).reshape(10, 5) * 1.0
X[2, 0] = np.nan

print(X)

[[ 0.  1.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]
 [nan 11. 12. 13. 14.]
 [15. 16. 17. 18. 19.]
 [20. 21. 22. 23. 24.]
 [25. 26. 27. 28. 29.]
 [30. 31. 32. 33. 34.]
 [35. 36. 37. 38. 39.]
 [40. 41. 42. 43. 44.]
 [45. 46. 47. 48. 49.]]


## Soft Impute

In [5]:
from soft_impute import SoftImpute 

clf = SoftImpute(J=1, lambda_=0.0)
clf.fit(X)
imputed = clf.predict(X)
print(imputed)

[[ 1.92521382  1.99828233  2.06086028  2.12343824  2.18601619]
 [ 6.5916521   6.84182805  7.05608586  7.27034367  7.48460148]
 [11.30429127 11.73332814 12.10076755 12.46820695 12.83564636]
 [15.92452865 16.52891948 17.04653701 17.56415454 18.08177207]
 [20.59096693 21.37246519 22.04176258 22.71105997 23.38035736]
 [25.2574052  26.21601091 27.03698815 27.8579654  28.67894265]
 [29.92384348 31.05955662 32.03221373 33.00487084 33.97752795]
 [34.59028176 35.90310234 37.0274393  38.15177627 39.27611324]
 [39.25672003 40.74664805 42.02266488 43.2986817  44.57469853]
 [43.92315831 45.59019377 47.01789045 48.44558714 49.87328382]]


In [6]:
print(imputed[2, 0])

11.304291270922157


## Moving Average

In [79]:
import pandas as pd
data = pd.DataFrame(X)

In [80]:
data

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,24.0
5,25.0,26.0,27.0,28.0,29.0
6,30.0,31.0,32.0,33.0,34.0
7,35.0,36.0,37.0,38.0,39.0
8,40.0,41.0,42.0,43.0,44.0
9,45.0,46.0,47.0,48.0,49.0


In [81]:
data['rollmean12'] = data[0].rolling(5,center=True,min_periods=1).mean()

In [82]:
data

Unnamed: 0,0,1,2,3,4,rollmean12
0,0.0,1.0,2.0,3.0,4.0,2.5
1,5.0,6.0,7.0,8.0,9.0,6.666667
2,,11.0,12.0,13.0,14.0,10.0
3,15.0,16.0,17.0,18.0,19.0,16.25
4,20.0,21.0,22.0,23.0,24.0,22.5
5,25.0,26.0,27.0,28.0,29.0,25.0
6,30.0,31.0,32.0,33.0,34.0,30.0
7,35.0,36.0,37.0,38.0,39.0,35.0
8,40.0,41.0,42.0,43.0,44.0,37.5
9,45.0,46.0,47.0,48.0,49.0,40.0


In [83]:
data[0] = data['rollmean12']

In [84]:
data

Unnamed: 0,0,1,2,3,4,rollmean12
0,2.5,1.0,2.0,3.0,4.0,2.5
1,6.666667,6.0,7.0,8.0,9.0,6.666667
2,10.0,11.0,12.0,13.0,14.0,10.0
3,16.25,16.0,17.0,18.0,19.0,16.25
4,22.5,21.0,22.0,23.0,24.0,22.5
5,25.0,26.0,27.0,28.0,29.0,25.0
6,30.0,31.0,32.0,33.0,34.0,30.0
7,35.0,36.0,37.0,38.0,39.0,35.0
8,37.5,41.0,42.0,43.0,44.0,37.5
9,40.0,46.0,47.0,48.0,49.0,40.0


## Linear Interpolation

In [50]:
import pandas as pd
data = pd.DataFrame(X)

In [51]:
data

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,24.0
5,25.0,26.0,27.0,28.0,29.0
6,30.0,31.0,32.0,33.0,34.0
7,35.0,36.0,37.0,38.0,39.0
8,40.0,41.0,42.0,43.0,44.0
9,45.0,46.0,47.0,48.0,49.0


In [52]:
data[0]=data[0].interpolate()

In [53]:
data

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,24.0
5,25.0,26.0,27.0,28.0,29.0
6,30.0,31.0,32.0,33.0,34.0
7,35.0,36.0,37.0,38.0,39.0
8,40.0,41.0,42.0,43.0,44.0
9,45.0,46.0,47.0,48.0,49.0


## Polynomial Interpolation

In [58]:
import pandas as pd
data = pd.DataFrame(X)

In [59]:
data

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,24.0
5,25.0,26.0,27.0,28.0,29.0
6,30.0,31.0,32.0,33.0,34.0
7,35.0,36.0,37.0,38.0,39.0
8,40.0,41.0,42.0,43.0,44.0
9,45.0,46.0,47.0,48.0,49.0


In [60]:
data[0] = data[0].interpolate(method="polynomial", order=2)

In [61]:
data

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,24.0
5,25.0,26.0,27.0,28.0,29.0
6,30.0,31.0,32.0,33.0,34.0
7,35.0,36.0,37.0,38.0,39.0
8,40.0,41.0,42.0,43.0,44.0
9,45.0,46.0,47.0,48.0,49.0


## LOCF - last observation carried forward

In [69]:
import pandas as pd
data = pd.DataFrame(X)

In [70]:
data

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,24.0
5,25.0,26.0,27.0,28.0,29.0
6,30.0,31.0,32.0,33.0,34.0
7,35.0,36.0,37.0,38.0,39.0
8,40.0,41.0,42.0,43.0,44.0
9,45.0,46.0,47.0,48.0,49.0


In [71]:
data[0].fillna(method="ffill", inplace=True)

In [72]:
data

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,5.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,24.0
5,25.0,26.0,27.0,28.0,29.0
6,30.0,31.0,32.0,33.0,34.0
7,35.0,36.0,37.0,38.0,39.0
8,40.0,41.0,42.0,43.0,44.0
9,45.0,46.0,47.0,48.0,49.0
