In [1]:
import pandas as pd

### Date time

In [2]:
s = pd.date_range('2020-01-06', '2020-01-10', freq='10H').to_series()
s

2020-01-06 00:00:00   2020-01-06 00:00:00
2020-01-06 10:00:00   2020-01-06 10:00:00
2020-01-06 20:00:00   2020-01-06 20:00:00
2020-01-07 06:00:00   2020-01-07 06:00:00
2020-01-07 16:00:00   2020-01-07 16:00:00
2020-01-08 02:00:00   2020-01-08 02:00:00
2020-01-08 12:00:00   2020-01-08 12:00:00
2020-01-08 22:00:00   2020-01-08 22:00:00
2020-01-09 08:00:00   2020-01-09 08:00:00
2020-01-09 18:00:00   2020-01-09 18:00:00
Freq: 10H, dtype: datetime64[ns]

In [7]:
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    "is_leap_year": s.dt.is_leap_year.values,
    "quarter": s.dt.quarter.values,
    "weekofyear": s.dt.isocalendar().week.values
}
features

{'dayofweek': array([0, 0, 0, 1, 1, 2, 2, 2, 3, 3], dtype=int64),
 'dayofyear': array([6, 6, 6, 7, 7, 8, 8, 8, 9, 9], dtype=int64),
 'hour': array([ 0, 10, 20,  6, 16,  2, 12, 22,  8, 18], dtype=int64),
 'is_leap_year': array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]),
 'quarter': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 'weekofyear': <IntegerArray>
 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
 Length: 10, dtype: UInt32}

### Polynomial features

In [14]:
import numpy as np

In [15]:
df = pd.DataFrame(np.random.rand(100, 2), columns = [f"f_{i}" for i in range(1,3)])
df.head()

Unnamed: 0,f_1,f_2
0,0.701058,0.623867
1,0.821532,0.410557
2,0.696663,0.646644
3,0.074810,0.618307
4,0.661139,0.574790
...,...,...
95,0.538292,0.485830
96,0.531478,0.053523
97,0.433968,0.223512
98,0.726438,0.870565


In [16]:
from sklearn import preprocessing

In [18]:
# Initialize polynomial features: [f1,f2] => [f1, f2, f1^2, f2^2, f1*f2]
pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
pf.fit(df)
poly_feats = pf.transform(df)
poly_feats.shape

(100, 5)

In [19]:
# Create a dataframe
num_feats = poly_feats.shape[1]
df_transformed =pd.DataFrame(poly_feats, columns=[f"f_{i}" for i in range(1, num_feats + 1)])
df_transformed.shape

(100, 5)

In [20]:
df_transformed.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.701058,0.623867,0.491483,0.437367,0.38921
1,0.821532,0.410557,0.674915,0.337285,0.168557
2,0.696663,0.646644,0.485339,0.450492,0.418148
3,0.07481,0.618307,0.005597,0.046256,0.382304
4,0.661139,0.57479,0.437104,0.380016,0.330383


### Binning

In [21]:
# Create bins of the numberical columns
df["f_bin_10"] = pd.cut(df["f_1"], bins=10, labels=False)
df["f_bin_100"] = pd.cut(df["f_1"], bins=100, labels=False)
df.head()

Unnamed: 0,f_1,f_2,f_bin_10,f_bin_100
0,0.701058,0.623867,7,70
1,0.821532,0.410557,8,82
2,0.696663,0.646644,6,69
3,0.07481,0.618307,0,7
4,0.661139,0.57479,6,66


In [22]:
df.f_bin_10.value_counts()

9    18
7    14
5    14
0    10
6     9
4     9
1     8
8     7
3     7
2     4
Name: f_bin_10, dtype: int64

### Imputation with KNN

In [64]:
import numpy as np
from sklearn import impute

In [65]:
# Create a random np array with 10 samples
X = np.random.randint(1, 15, (10,6))
X = X.astype(float)
X.shape

(10, 6)

In [66]:
X

array([[ 7.,  1., 14., 14.,  3.,  8.],
       [ 8.,  3.,  9.,  9.,  2., 12.],
       [ 6., 13.,  4.,  8., 12.,  9.],
       [11.,  4.,  5.,  3.,  7., 14.],
       [11., 10., 13.,  5.,  4.,  2.],
       [ 8., 11., 12.,  5.,  2., 14.],
       [ 3.,  1.,  7.,  9.,  9.,  7.],
       [ 1.,  9., 14., 14.,  3.,  2.],
       [ 9.,  9.,  5.,  7.,  2.,  1.],
       [ 1., 14.,  4.,  6.,  9.,  2.]])

In [67]:
# Randomly assign 10 elements to NaN
e_rand = np.random.choice(X.size, 10, replace=False)
X.ravel()[e_rand]

array([ 6., 11.,  6.,  8.,  7.,  3.,  2.,  9., 11.,  9.])

In [68]:
X.ravel()[e_rand] = np.nan
X

array([[nan,  1., 14., 14.,  3.,  8.],
       [nan,  3.,  9., nan, nan, 12.],
       [nan, 13.,  4.,  8., 12.,  9.],
       [nan,  4.,  5., nan,  7., 14.],
       [nan, 10., 13.,  5.,  4.,  2.],
       [ 8., 11., 12.,  5.,  2., 14.],
       [ 3.,  1.,  7.,  9.,  9.,  7.],
       [ 1.,  9., 14., 14.,  3.,  2.],
       [ 9., nan,  5.,  7.,  2.,  1.],
       [ 1., 14.,  4., nan,  9.,  2.]])

In [69]:
knn_imputer = impute.KNNImputer(n_neighbors=2)
Y = knn_imputer.fit_transform(X)

In [70]:
Y.ravel()[e_rand]

array([ 2. ,  5.5,  7.5,  5.5,  2. ,  7. ,  8. ,  5.5,  5. , 11.5])