In [1]:
# Standard library
import datetime as date  # NOTE: `date` names the whole datetime module, not datetime.date
import pathlib
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd

# Home directory of the user running the notebook.
home_dir_path = Path.home()

# Timestamp captured at the moment this cell is executed.
now = date.datetime.now()

# Фильтрация отсутствующих данных

## Простое избавление от NaN

### Series

In [2]:
# Sample Series with missing values (NaN) at positions 1 and 3.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

In [3]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [4]:
# dropna() returns a new Series without the NaN entries; the surviving
# index labels are kept as-is (0, 2, 4), not renumbered.
# NOTE(review): rebinding `data` discards the raw Series, so re-running this
# cell alone is a no-op and the NaN version must be re-created later.
data = data.dropna()

In [5]:
data

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
# Cast to integer: the fractional part is truncated (3.5 -> 3), not rounded.
# This works only because the NaNs were dropped beforehand; astype('int')
# raises on a float Series that still contains NaN.
data.astype('int')

0    1
2    3
4    7
dtype: int64

In [7]:
# Re-create the Series with NaNs, because `data` was overwritten by dropna() earlier.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

In [8]:
# Boolean-mask alternative to dropna(): keep only the entries where
# notnull() is True. `data` itself is left unchanged.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

### DataFrame

In [9]:
# 4x4 demo frame: row 0 has one NaN, row 1 has two, row 2 is complete,
# and row 3 consists of NaN only.
data = pd.DataFrame(
    [
        [1, np.nan, 6.4, 8.8],
        [np.nan, 9.4, 8, np.nan],
        [8.5, 5, 6, 4.5],
        [np.nan] * 4,
    ]
)

In [10]:
data

Unnamed: 0,0,1,2,3
0,1.0,,6.4,8.8
1,,9.4,8.0,
2,8.5,5.0,6.0,4.5
3,,,,


In [11]:
# dropna() with the default how='any' removes every row that contains at
# least one NaN, so only the fully populated row (index 2) survives.
# NOTE(review): `clened_all` looks like a typo for `cleaned_all`; it is not
# renamed here because the following cell refers to it by this name.
clened_all = data.dropna()

In [12]:
clened_all

Unnamed: 0,0,1,2,3
2,8.5,5.0,6.0,4.5


In [13]:
# Drop only the rows in which *every* value is NaN (here: row 3);
# rows that are merely partially missing are kept.
data.dropna(how='all')

Unnamed: 0,0,1,2,3
0,1.0,,6.4,8.8
1,,9.4,8.0,
2,8.5,5.0,6.0,4.5


In [14]:
# Add a new column labelled 4 filled entirely with NaN.
# This mutates `data` in place, so earlier displays of `data` are now stale.
data[4] = np.nan

In [15]:
data

Unnamed: 0,0,1,2,3,4
0,1.0,,6.4,8.8,
1,,9.4,8.0,,
2,8.5,5.0,6.0,4.5,
3,,,,,


In [16]:
# axis=1 applies the check to columns: drop the columns in which every value
# is NaN (column 4). The all-NaN row 3 stays, because rows are not examined.
data.dropna(how='all', axis=1)

Unnamed: 0,0,1,2,3
0,1.0,,6.4,8.8
1,,9.4,8.0,
2,8.5,5.0,6.0,4.5
3,,,,


In [20]:
# Use a seeded generator so the random frame, and every result derived from
# it, is identical on each Restart & Run All.
df = pd.DataFrame(np.random.default_rng(42).standard_normal((7, 3)))

# Blank out values so the rows end up with 1, 2 or 3 valid observations.
df.iloc[:4, 1] = np.nan  # column 1: first four rows missing
df.iloc[:2, 2] = np.nan  # column 2: first two rows missing

In [21]:
df

Unnamed: 0,0,1,2
0,0.607052,,
1,0.535861,,
2,0.800903,,0.291863
3,-1.687046,,-0.753315
4,0.121429,1.113011,-0.655722
5,-0.386101,2.377525,0.115051
6,0.801422,-0.313875,1.007987


In [37]:
# thresh=N keeps only the rows that have at least N non-NaN values.
# With three columns and thresh=3, only fully populated rows remain.
df.dropna(thresh=3)

Unnamed: 0,0,1,2
4,0.121429,1.113011,-0.655722
5,-0.386101,2.377525,0.115051
6,0.801422,-0.313875,1.007987


# Замена отсутствующих данных

In [43]:
# Replace every NaN with the constant 0. fillna() returns a new frame;
# `df` itself is not modified.
df.fillna(0)

Unnamed: 0,0,1,2
0,0.607052,0.0,0.0
1,0.535861,0.0,0.0
2,0.800903,0.0,0.291863
3,-1.687046,0.0,-0.753315
4,0.121429,1.113011,-0.655722
5,-0.386101,2.377525,0.115051
6,0.801422,-0.313875,1.007987


In [46]:
# Re-bind `data` to a fresh Series with two NaN entries for the fill examples.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [45]:
# Fill the gaps with the mean of the existing values. mean() skips NaN,
# so the fill value is (1 + 3.5 + 7) / 3.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [47]:
df

Unnamed: 0,0,1,2
0,0.607052,,
1,0.535861,,
2,0.800903,,0.291863
3,-1.687046,,-0.753315
4,0.121429,1.113011,-0.655722
5,-0.386101,2.377525,0.115051
6,0.801422,-0.313875,1.007987


In [55]:
# df.mean() yields one mean per column (NaN skipped); passing that Series to
# fillna() fills each column's gaps with that column's own mean.
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.607052,1.058887,0.001173
1,0.535861,1.058887,0.001173
2,0.800903,1.058887,0.291863
3,-1.687046,1.058887,-0.753315
4,0.121429,1.113011,-0.655722
5,-0.386101,2.377525,0.115051
6,0.801422,-0.313875,1.007987


# Корреляция и ковариация