In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small frame with missing values in both columns, used to demonstrate isnull().
# `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
df = pd.DataFrame({
    "feature_1": [0.1, np.nan, np.nan, 0.4],
    "feature_2": [1.1, 2.2, np.nan, np.nan],
})
df

Unnamed: 0,feature_1,feature_2
0,0.1,1.1
1,,2.2
2,,
3,0.4,


In [3]:
# Element-wise missing-value check: True wherever the cell holds NaN.
df.isna()

Unnamed: 0,feature_1,feature_2
0,False,False
1,True,False
2,True,True
3,False,True


In [5]:
# Boolean frame used to demonstrate any() along each axis.
df_booleans = pd.DataFrame(
    dict(
        col_1=[True, True, False],
        col_2=[True, False, False],
    )
)
df_booleans

Unnamed: 0,col_1,col_2
0,True,True
1,True,False
2,False,False


In [6]:
# With no arguments, any() yields one result per column:
# True if that column contains at least one True.
df_booleans.any()

col_1    True
col_2    True
dtype: bool

In [7]:
# axis="index" is the named form of axis=0: collapse the rows, one result per column.
df_booleans.any(axis="index")

col_1    True
col_2    True
dtype: bool

In [8]:
# axis="columns" is the named form of axis=1: collapse the columns, one result per row.
df_booleans.any(axis="columns")

0     True
1     True
2    False
dtype: bool

In [9]:
# Three-element boolean Series used to show how booleans add up.
series_booleans = pd.Series(data=[True, True, False])
series_booleans

0     True
1     True
2    False
dtype: bool

In [10]:
# True counts as 1, so summing a boolean Series counts its True entries.
# Use the vectorized Series.sum() instead of the builtin sum(), which iterates
# element by element in Python; int() keeps the displayed result a plain integer.
int(series_booleans.sum())

2

## Decision Trees

In [11]:
import pandas as pd

In [12]:
# Toy training data: a single numeric feature and one binary label per row.
X = pd.DataFrame(data={"feature_1": [0, 1, 2, 3]})
y = pd.Series(data=[0, 0, 1, 1])

In [13]:
# Display the feature matrix (one column, four rows).
X

Unnamed: 0,feature_1
0,0
1,1
2,2
3,3


In [14]:
# Display the target labels, aligned by index with the rows of X.
y

0    0
1    0
2    1
3    1
dtype: int64

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
# Instantiate a classifier with every hyperparameter left at its default;
# the repr shown as output lists those defaults.
dt = DecisionTreeClassifier()
dt

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [17]:
# Train the tree on the toy feature matrix and its labels.
dt.fit(X=X, y=y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [18]:
# Same estimator class, overriding three hyperparameters at construction time.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=2)
dt

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [20]:
# Keep the hyperparameters in a dict and unpack them into keyword arguments;
# this builds the same estimator as passing them explicitly.
tree_parameters = dict(criterion='entropy', max_depth=10, min_samples_split=2)
dt = DecisionTreeClassifier(**tree_parameters)
dt

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [21]:
# Five-row frame used to demonstrate boolean masking.
df = pd.DataFrame(data={'feature_1': [0, 1, 2, 3, 4]})
df

Unnamed: 0,feature_1
0,0
1,1
2,2
3,3
4,4


In [22]:
# Boolean Series: True for the rows where feature_1 is at least 3.
mask = df["feature_1"].ge(3)
mask

0    False
1    False
2    False
3     True
4     True
Name: feature_1, dtype: bool

In [23]:
# Keep only the rows whose mask entry is True.
df.loc[mask]

Unnamed: 0,feature_1
3,3
4,4


In [24]:
# Combine two element-wise comparisons with `&`: a row is True only when
# both conditions hold for that row (here, 2 <= feature_1 <= 3).
lower_ok = df["feature_1"] >= 2
upper_ok = df["feature_1"] <= 3
lower_ok & upper_ok

0    False
1    False
2     True
3     True
4    False
Name: feature_1, dtype: bool

## Imputation

In [25]:
# Eleven-row frame where feature_2 is 10 * feature_1, with two values missing
# (rows 1 and 9) so the imputers have something to fill in.
# `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
df = pd.DataFrame({
    "feature_1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "feature_2": [0, np.nan, 20, 30, 40, 50, 60, 70, 80, np.nan, 100],
})
df

Unnamed: 0,feature_1,feature_2
0,0,0.0
1,1,
2,2,20.0
3,3,30.0
4,4,40.0
5,5,50.0
6,6,60.0
7,7,70.0
8,8,80.0
9,9,
10,10,100.0


In [26]:
from sklearn.impute import SimpleImputer

In [28]:
# Imputer that replaces each NaN using the 'mean' strategy.
# `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mean_imputer

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [30]:
# Fit the imputer on df so it can fill the missing entries afterwards.
mean_imputer.fit(X=df)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [31]:
# transform() returns a NumPy array in which the missing entries have been filled.
nparray_imputed_mean = mean_imputer.transform(X=df)
nparray_imputed_mean

array([[  0.,   0.],
       [  1.,  50.],
       [  2.,  20.],
       [  3.,  30.],
       [  4.,  40.],
       [  5.,  50.],
       [  6.,  60.],
       [  7.,  70.],
       [  8.,  80.],
       [  9.,  50.],
       [ 10., 100.]])

## Regression Imputation

In [33]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [34]:
# Default-configured IterativeImputer; the repr shown as output lists the
# defaults in effect (e.g. estimator=None, initial_strategy='mean', max_iter=10).
reg_imputer = IterativeImputer()
reg_imputer

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, tol=0.001, verbose=0)

In [35]:
# Fit the iterative imputer on the frame that contains the missing values.
reg_imputer.fit(X=df)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, tol=0.001, verbose=0)

In [37]:
# Fill the missing feature_2 entries using the fitted iterative imputer;
# the result comes back as a NumPy array.
nparray_imputed_reg = reg_imputer.transform(X=df)
nparray_imputed_reg

array([[  0.,   0.],
       [  1.,  10.],
       [  2.,  20.],
       [  3.,  30.],
       [  4.,  40.],
       [  5.,  50.],
       [  6.,  60.],
       [  7.,  70.],
       [  8.,  80.],
       [  9.,  90.],
       [ 10., 100.]])