In [1]:
# Time series Data

import pandas as pd
# create a series of datetimes spaced 10 hours apart; the step is given as a
# Timedelta because the 'H' frequency alias is deprecated in newer pandas
s = pd.date_range('2020-01-06', '2020-01-10', freq=pd.Timedelta(hours=10)).to_series()
# create some features based on datetime
# note: Series.dt.weekofyear no longer exists in pandas 2.x, so the ISO week
# comes from isocalendar(); it is cast to int64 so that .values is a plain
# numpy array like every other entry of the dictionary
features = {"dayofweek": s.dt.dayofweek.values,
            "dayofyear": s.dt.dayofyear.values,
            "hour": s.dt.hour.values,
            "is_leap_year": s.dt.is_leap_year.values,
            "quarter": s.dt.quarter.values,
            "weekofyear": s.dt.isocalendar().week.astype("int64").values}


In [3]:
features

{'dayofweek': array([0, 0, 0, 1, 1, 2, 2, 2, 3, 3], dtype=int64),
 'dayofyear': array([6, 6, 6, 7, 7, 8, 8, 8, 9, 9], dtype=int64),
 'hour': array([ 0, 10, 20,  6, 16,  2, 12, 22,  8, 18], dtype=int64),
 'is_leap_year': array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]),
 'quarter': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 'weekofyear': array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)}

In [None]:
## Customer level Features

def generate_features(df):
    """Build per-customer aggregate features from a row-level frame.

    Calendar features derived from ``df['date']`` are first written into
    ``df`` itself (the caller's frame gains the columns ``year``,
    ``weekofyear``, ``month``, ``dayofweek`` and ``weekend``), then the rows
    are grouped by ``customer_id`` and aggregated.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``customer_id``, a ``date`` column usable through the
        ``.dt`` accessor, and a ``num1`` column that supports
        sum/max/min/mean.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id`` with two-level columns of the form
        ``(source column, aggregation)``; the grouping key is turned back
        into a regular column by ``reset_index``.
    """
    # create a bunch of features using the date column
    df.loc[:, 'year'] = df['date'].dt.year
    # Series.dt.weekofyear no longer exists in pandas 2.x; isocalendar()
    # returns the same ISO week number as a nullable integer. Convert it to a
    # plain numpy dtype: int64 normally, float64 (NaN) when dates are missing.
    iso_week = df['date'].dt.isocalendar().week
    week_dtype = 'int64' if iso_week.notna().all() else 'float64'
    df.loc[:, 'weekofyear'] = iso_week.astype(week_dtype)
    df.loc[:, 'month'] = df['date'].dt.month
    df.loc[:, 'dayofweek'] = df['date'].dt.dayofweek
    # weekday 5 and 6 are Saturday and Sunday
    df.loc[:, 'weekend'] = (df['date'].dt.weekday >= 5).astype(int)
    # create an aggregate dictionary: column name -> list of aggregations
    aggs = {}
    # for month and week of year, we calculate the number of unique
    # values and also the mean
    aggs['month'] = ['nunique', 'mean']
    aggs['weekofyear'] = ['nunique', 'mean']
    # for num1, we calculate sum, max, min and mean values of this column
    aggs['num1'] = ['sum', 'max', 'min', 'mean']
    # for customer_id, we calculate the total count and the number of unique
    # values; both go into ONE list, because assigning the key twice keeps
    # only the last assignment and would silently drop the count
    aggs['customer_id'] = ['size', 'nunique']
    # we group by customer_id and calculate the aggregates
    agg_df = df.groupby('customer_id').agg(aggs)
    agg_df = agg_df.reset_index()
    return agg_df

In [None]:
## Statistical Features

import numpy as np

# Example series to summarise. The cell used to read a variable `x` that no
# earlier cell defines, so it failed with NameError on a fresh kernel.
# Replace this seeded sample with the real list/array of values.
rng = np.random.default_rng(42)
x = rng.random(100)

feature_dict = {
    # central tendency and extremes
    'mean': np.mean(x),
    'max': np.max(x),
    'min': np.min(x),
    # spread: standard deviation, variance and peak-to-peak (max - min)
    'std': np.std(x),
    'var': np.var(x),
    'ptp': np.ptp(x),
    # percentile features (argument on a 0-100 scale)
    'percentile_10': np.percentile(x, 10),
    'percentile_60': np.percentile(x, 60),
    'percentile_90': np.percentile(x, 90),
    # quantile features (argument on a 0-1 scale)
    'quantile_5': np.quantile(x, 0.05),
    'quantile_95': np.quantile(x, 0.95),
    'quantile_99': np.quantile(x, 0.99),
}

from tsfresh.feature_extraction import feature_calculators as fc
# tsfresh based features: every key stored in feature_dict is also the name
# of the feature_calculators function that produces it, applied to x
for calculator_name in (
    'abs_energy',
    'count_above_mean',
    'count_below_mean',
    'mean_abs_change',
    'mean_change',
):
    feature_dict[calculator_name] = getattr(fc, calculator_name)(x)

In [6]:
# Polynomial Features
import numpy as np
# random demo frame: 100 rows of uniform [0, 1) values
# in 2 columns named f_1 and f_2
random_values = np.random.rand(100, 2)
column_names = [f"f_{i}" for i in range(1, 3)]
df = pd.DataFrame(random_values, columns=column_names)


from sklearn import preprocessing
# degree-2 polynomial expansion: interaction_only=False keeps the squared
# terms, include_bias=False leaves out the constant column
pf = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
)
# fit on the features, then expand the same frame
poly_feats = pf.fit(df).transform(df)
# wrap the expanded array in a dataframe, numbering the columns from f_1
num_feats = poly_feats.shape[1]
poly_columns = [f"f_{i}" for i in range(1, num_feats + 1)]
df_transformed = pd.DataFrame(poly_feats, columns=poly_columns)
df_transformed

In [8]:
# Binning
# bucket the numerical column f_1 into equal-width bins, once with 10 bins
# and once with 100; labels=False stores the integer bin index instead of
# an interval label
for n_bins in (10, 100):
    df[f"f_bin_{n_bins}"] = pd.cut(df["f_1"], bins=n_bins, labels=False)
df

In [12]:
# Log Transformation (To reduce variance due to outliers)
# compare the variance of the 100-bin column before and after log(1 + value)
binned = df["f_bin_100"]
log_binned = binned.apply(lambda value: np.log(1 + value))
binned.var(), log_binned.var()

(824.3364646464647, 0.8609664687403571)

In [13]:
## Handling Missing Values
## KNN Imputer

import numpy as np
from sklearn import impute
# random matrix of 10 samples x 6 features holding integers from 1 to 14
# (the upper bound passed to randint is exclusive), stored as floats so
# that entries can be set to NaN
X = np.random.randint(1, 15, (10, 6)).astype(float)
# pick 10 distinct flat positions and mark them as missing (NaN)
missing_positions = np.random.choice(X.size, 10, replace=False)
X.flat[missing_positions] = np.nan
# fill each NaN in X using its 2 nearest neighbours
knn_imputer = impute.KNNImputer(n_neighbors=2)
X_imputed = knn_imputer.fit_transform(X)
X_imputed



array([[ 6. ,  4. ,  9. ,  3. ,  5. ,  5. ],
       [11. , 12.5, 13. ,  5. , 13. ,  2.5],
       [ 5. ,  9. ,  1. ,  8. ,  9. ,  8. ],
       [10. ,  8.5, 10. ,  8. , 10. ,  4. ],
       [ 2. ,  6. ,  5. ,  8. ,  2. ,  6. ],
       [12. ,  5. , 10. ,  9. , 10. ,  4.5],
       [ 5. , 12. ,  2. ,  9. ,  2. ,  5. ],
       [11. , 13. , 10. ,  3. ,  9. ,  1. ],
       [12. , 12. , 10. ,  6. ,  9.5,  2.5],
       [ 4. ,  5. ,  5. , 10. ,  3. ,  7. ]])