In [173]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, impute


In [202]:
# Randomly generated categorical and numerical features with time data.

date = pd.Series(pd.date_range("2020-01-01","2020-05-20", freq="4H"), name="date")
customer_id = pd.Series(data=np.random.randint(low=100, high=900, size=(len(date))), name="customer_id")
cat_1 = pd.Series(data=np.random.randint(5, size=(len(date),)), name="cat_1")
cat_2 = pd.Series(data=np.random.randint(10, size=(len(date),)), name="cat_2")
cat_3 = pd.Series(data=np.random.randint(20, size=(len(date),)), name="cat_3")
num_1 = pd.Series(np.random.randn(len(date)), name="num_1")

df = pd.concat([date, customer_id, cat_1, cat_2, cat_3, num_1], axis=1)
df

Unnamed: 0,date,customer_id,cat_1,cat_2,cat_3,num_1
0,2020-01-01 00:00:00,807,2,3,2,0.514247
1,2020-01-01 04:00:00,193,0,4,2,-0.227681
2,2020-01-01 08:00:00,840,0,2,10,0.360493
3,2020-01-01 12:00:00,468,2,0,9,0.605255
4,2020-01-01 16:00:00,758,0,7,4,0.099341
...,...,...,...,...,...,...
836,2020-05-19 08:00:00,785,4,6,2,-2.075977
837,2020-05-19 12:00:00,706,1,3,5,0.098413
838,2020-05-19 16:00:00,495,3,8,9,1.101501
839,2020-05-19 20:00:00,415,3,7,8,1.143291


In [203]:
df["customer_id"].value_counts()

637    6
548    6
446    5
582    5
495    5
      ..
567    1
566    1
282    1
564    1
101    1
Name: customer_id, Length: 519, dtype: int64

In [211]:
# Generate aggregate features
def generate_features(df):
    df.loc[:, "year"] = df["date"].dt.year
    df.loc[:, "weekofyear"] = df["date"].dt.isocalendar().week
    df.loc[:, "month"] = df["date"].dt.month
    df.loc[:, "dayofweek"] = df["date"].dt.dayofweek
    df.loc[:, "weekend"] = (df["date"].dt.weekday >= 5).astype(int)

    aggs = {}
    aggs["month"] = ["nunique", "mean"]
    aggs["weekofyear"] = ["nunique", "mean"]
    aggs["num_1"] = ["sum", "max", "min", "mean"]
    aggs["customer_id"] = ["size"]
    aggs["customer_id"] = ["nunique"]    

    agg_df = df.groupby("customer_id").agg(aggs)
    agg_df = agg_df.reset_index()
    
    return agg_df, df 

cust_id = 822

agg_df, converted_df = generate_features(df)
agg_df.loc[agg_df["customer_id"][""] == cust_id, :]

Unnamed: 0_level_0,customer_id,month,month,weekofyear,weekofyear,num_1,num_1,num_1,num_1,customer_id
Unnamed: 0_level_1,Unnamed: 1_level_1,nunique,mean,nunique,mean,sum,max,min,mean,nunique
470,822,1,1.0,1,4.0,-0.559704,-0.268056,-0.291648,-0.279852,1


In [212]:
converted_df.loc[converted_df["customer_id"] == cust_id, :]

Unnamed: 0,date,customer_id,cat_1,cat_2,cat_3,num_1,year,weekofyear,month,dayofweek,weekend
114,2020-01-20,822,0,6,0,-0.291648,2020,4,1,0,0
132,2020-01-23,822,0,6,5,-0.268056,2020,4,1,3,0


In [213]:
converted_df.groupby("customer_id").agg({"num_1":["sum", "mean", "min", "max"]})

Unnamed: 0_level_0,num_1,num_1,num_1,num_1
Unnamed: 0_level_1,sum,mean,min,max
customer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
101,1.592413,1.592413,1.592413,1.592413
102,-0.350886,-0.350886,-0.350886,-0.350886
103,-0.756897,-0.756897,-0.756897,-0.756897
104,1.778869,1.778869,1.778869,1.778869
105,0.495664,0.495664,0.495664,0.495664
...,...,...,...,...
893,-0.603889,-0.603889,-0.603889,-0.603889
894,-1.103556,-0.551778,-1.290459,0.186902
895,-0.540506,-0.270253,-0.973724,0.433218
896,0.853911,0.853911,0.853911,0.853911


In [214]:
# Generate polynomial features from numerical data
def generate_polynomial_features(df):
    pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
    pf.fit(df)
    poly_feats = pf.transform(df)
    
    num_feats = poly_feats.shape[1]
    df_transformed = pd.DataFrame(poly_feats, columns=[f"f_{i}" for i in range(1, num_feats+1)])
    
    return df_transformed

df_numerical = pd.DataFrame(np.random.rand(100,2), columns=[f"f_{i}" for i in range(1,3)])
generate_polynomial_features(df_numerical)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.661120,0.058390,0.437080,0.038603,0.003409
1,0.944648,0.792440,0.892361,0.748577,0.627961
2,0.924931,0.074864,0.855497,0.069244,0.005605
3,0.776733,0.913842,0.603314,0.709811,0.835107
4,0.986185,0.621469,0.972561,0.612884,0.386224
...,...,...,...,...,...
95,0.002006,0.717858,0.000004,0.001440,0.515321
96,0.722735,0.400806,0.522345,0.289677,0.160646
97,0.727569,0.366190,0.529356,0.266428,0.134095
98,0.632994,0.308903,0.400682,0.195534,0.095421


In [215]:
# Binning the data
df_numerical["f_1_bin_10"] = pd.cut(df_numerical["f_1"], bins=10, labels=False)
df_numerical["f_1_bin_100"] = pd.cut(df_numerical["f_1"], bins=100, labels=False)
df_numerical

Unnamed: 0,f_1,f_2,f_1_bin_10,f_1_bin_100
0,0.661120,0.058390,6,66
1,0.944648,0.792440,9,95
2,0.924931,0.074864,9,93
3,0.776733,0.913842,7,78
4,0.986185,0.621469,9,99
...,...,...,...,...
95,0.002006,0.717858,0,0
96,0.722735,0.400806,7,73
97,0.727569,0.366190,7,73
98,0.632994,0.308903,6,64


In [216]:
# Log transformation
df_numerical["f_high_variance"] = np.random.randint(0,10000, size=(100,))
df_numerical["f_high_variance_log"] = df_numerical["f_high_variance"].apply(lambda x: np.log(1+x))
df_numerical


Unnamed: 0,f_1,f_2,f_1_bin_10,f_1_bin_100,f_high_variance,f_high_variance_log
0,0.661120,0.058390,6,66,7891,8.973605
1,0.944648,0.792440,9,95,2105,7.652546
2,0.924931,0.074864,9,93,679,6.522093
3,0.776733,0.913842,7,78,9202,9.127285
4,0.986185,0.621469,9,99,3986,8.290794
...,...,...,...,...,...,...
95,0.002006,0.717858,0,0,1387,7.235619
96,0.722735,0.400806,7,73,433,6.073045
97,0.727569,0.366190,7,73,1881,7.540090
98,0.632994,0.308903,6,64,4751,8.466321


In [217]:
# Missing Values
from sklearn import impute

x = np.random.randint(low=1,high=15,size=(10,16)).astype(float)
x.ravel()[np.random.choice(x.size, 10, replace=False)] = np.nan

knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(x)

array([[ 2. , 14. ,  6. , 13. ,  6. ,  7. ,  6. ,  8. ,  8. ,  8. , 12. ,
         3. ,  6. ,  3. , 11. ,  2. ],
       [ 3. ,  9. , 12. , 10. ,  9. , 10. ,  6. , 10. , 13. ,  4. ,  9. ,
         5. ,  5.5,  3. , 10. , 14. ],
       [ 1. , 11. ,  9. , 14. , 14. ,  4. , 14. ,  6. , 14. ,  9. , 10. ,
         3. ,  6. ,  2. ,  5. ,  4. ],
       [ 1. , 14. , 10. ,  8. , 10. ,  6. ,  5. , 14. , 12. , 14. ,  5. ,
         7. , 14. , 10. ,  6. ,  1. ],
       [ 7. ,  8. ,  4. , 13. ,  9. ,  1. ,  5. ,  1. ,  4. ,  1. ,  7. ,
         1. , 10. , 11. , 11. ,  9. ],
       [ 2. , 13. , 12. , 10. ,  4. ,  4.5, 12. ,  1. ,  5. ,  8. , 13. ,
         6. , 10. ,  9. ,  3. ,  1. ],
       [ 2. , 10. , 10. ,  5. , 11. ,  8. , 14. , 12. ,  5. , 14. , 11.5,
         4.5,  3. ,  6. ,  1. ,  4. ],
       [ 9. ,  4. ,  5. , 10. , 11. ,  7. ,  5. ,  7. , 14. ,  1. , 12. ,
        14. ,  5. ,  1. ,  1. , 11. ],
       [ 9. ,  7. ,  6. ,  8. , 12. ,  2. , 10. ,  2. ,  6. ,  8. ,  8. ,
         8. , 14. ,  6