In [1]:
%load_ext autoreload

In [2]:
%run ./common_init.ipynb

Setup logging to file: out.log
Figure output directory saved in figure_output at /home/datarian/OneDrive/unine/Master_Thesis/figures


In [3]:
%autoreload 2
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import HashingEncoder, OneHotEncoder, OrdinalEncoder

# Load custom code
import kdd98.data_loader as dl
import kdd98.utils_transformer as ut
from kdd98.transformers import *
from kdd98.config import App

In [4]:
# Figures of this notebook go into the 'preprocessing' sub-folder of the
# figure output directory (figure_output is provided by common_init)
IMAGES_PATH = pathlib.Path(figure_output / 'preprocessing')

# Create the folder together with missing parents; an existing folder is fine
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure below ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name of the figure without extension.
    tight_layout : bool, default True
        If true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        File extension; also handed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Handed to ``plt.savefig`` as ``dpi``.
    """
    # Assemble the file name first and join afterwards: in
    # ``IMAGES_PATH/fig_id + "." + fig_extension`` the "/" is evaluated before
    # the "+", which ends in ``Path + str`` and raises a TypeError.
    file_name = fig_id + "." + fig_extension
    path = pathlib.Path(IMAGES_PATH) / file_name
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## Dates

There are several date features. ODATEDW is the date the record was added, DOB the birth date. ADATE_* and RDATE_* are from the promotion history. ADATE_* is the date of a mailing, RDATE_* the date the donation for the corresponding mailing was received. While these dates are not of particular interest (very low variance), the time it took to respond might be.
Furthermore, there are the features MINRDATE, MAXRDATE, MAXADATE, FISTDATE, NEXTDATE and LASTDATE coming from the giving history file.

Three different transformations are applied:

1. ODATEDW, DOB: Years before 1997 -> membership duration, age
2. Giving history features: Relative time in months to 1997/06/01
3. For the promotion history, as specified above, the time for response in months

There are redundant features which can be safely removed, as is shown below:

1. FISTDATE and NEXTDATE are contained in TIMELAG, the number of months between first and second donation
2. DOB, the date of birth, is contained in the feature AGE

In [None]:
# Show the date features defined in the data loader; the same list is used
# further down to drop the raw date columns from `learning`.
print(dl.date_features)

Now, we transform the dates from the giving history. First, we create two dataframes with the sending dates of the mailings and the dates when the gift (donation) for these was received.

In [None]:
# Run MonthsToDonation over the columns listed in dl.promo_history_dates
# and dl.giving_history_dates (both lists concatenated).
don_hist_transformer = ColumnTransformer(
    transformers=[
        (
            "months_to_donation",
            MonthsToDonation(),
            dl.promo_history_dates + dl.giving_history_dates,
        ),
    ]
)

In [None]:
# Fit the transformer on `learning` and transform it in one step; the raw
# result is wrapped in a DataFrame in a later cell.
donation_responses = don_hist_transformer.fit_transform(learning)

In [None]:
# Keep only the part of each feature name after the first '__' separator.
# split('__', 1)[-1] leaves a name without separator untouched, whereas
# n[n.find('__')+2:] would silently cut off its first character because
# find() returns -1 in that case.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() -- confirm the pinned version before upgrading.
don_hist_feature_names = [n.split('__', 1)[-1]
                 for n in don_hist_transformer.get_feature_names()]

In [None]:
# Wrap the transformer output in a DataFrame that shares the index of
# `learning` and carries the cleaned feature names as column labels
donation_responses = pd.DataFrame(
    data=donation_responses,
    index=learning.index,
    columns=don_hist_feature_names,
)

In [None]:
# Attach the new features to `learning`; the join key is the name of the
# index of `learning` (presumably a named index shared by both frames --
# verify, since an unnamed index would pass on=None).
# NOTE(review): the cell rebinds `learning`, so running it a second time
# merges the same columns again -- confirm it is run once per session.
learning = learning.merge(donation_responses, on=learning.index.name)

Time delta computation of the remaining features with either a specific reference or the date of the most recent mailing as a reference:

* Time since the last donation, the minimum and maximum donations, and the most recent promotion received
* Delta between first and next donation
* Age, years of membership

In [None]:
# Three DeltaTime steps, one per group of date columns
timedelta_transformer = ColumnTransformer(
    transformers=[
        # Deltas in months for four giving-history date columns
        (
            "time_last_donation",
            DeltaTime(unit='months'),
            ['LASTDATE', 'MINRDATE', 'MAXRDATE', 'MAXADATE'],
        ),
        # FISTDATE with the NEXTDATE column passed as reference_date
        (
            "delta_first_next",
            DeltaTime(reference_date=learning.NEXTDATE),
            ['FISTDATE'],
        ),
        # Deltas in years for the record creation date and the date of birth
        (
            "membership_years",
            DeltaTime(unit='years'),
            ['ODATEDW', 'DOB'],
        ),
    ]
)

In [None]:
# Fit the time-delta transformer on `learning` and transform it in one step;
# the raw result is wrapped in a DataFrame in a later cell.
timedeltas = timedelta_transformer.fit_transform(learning)

In [None]:
# Keep only the part of each feature name after the first '__' separator.
# split('__', 1)[-1] leaves a name without separator untouched, whereas
# n[n.find('__')+2:] would silently cut off its first character because
# find() returns -1 in that case.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() -- confirm the pinned version before upgrading.
timedelta_feature_names = [n.split('__', 1)[-1]
                 for n in timedelta_transformer.get_feature_names()]

In [None]:
# Wrap the transformer output in a DataFrame that shares the index of
# `learning` and carries the cleaned feature names as column labels
timedeltas = pd.DataFrame(
    data=timedeltas,
    index=learning.index,
    columns=timedelta_feature_names,
)

In [None]:
# Display the column labels of the time-delta frame; later cells access
# DOB_DELTA_YEARS and FISTDATE_NEXTDATE_DELTA_MONTHS by name.
timedeltas.columns

In [None]:
# Attach the time-delta features (joined on the name of the index of
# `learning`), then discard the raw date columns listed in dl.date_features
learning = (
    learning
    .merge(timedeltas, on=learning.index.name)
    .drop(columns=dl.date_features)
)

Checking the redundancy of DOB <-> AGE and \[FISTDATE, NEXTDATE\] <-> TIMELAG:

In [None]:
# Put AGE and the DOB-derived delta in years side by side: the list of two
# Series yields one row per Series, the transpose turns them into columns.
ages = pd.DataFrame([learning.AGE, timedeltas.DOB_DELTA_YEARS]).T

In [None]:
# Records whose AGE differs from the DOB-derived value. A missing value never
# compares equal, so such rows pass the filter and are removed by dropna().
ages[ages.AGE.ne(ages.DOB_DELTA_YEARS)].dropna()

In [None]:
# Put TIMELAG and the computed FISTDATE/NEXTDATE delta side by side: the list
# of two Series yields one row per Series, the transpose turns them into columns.
lags = pd.DataFrame([learning.TIMELAG, timedeltas.FISTDATE_NEXTDATE_DELTA_MONTHS]).T

In [None]:
# Records whose TIMELAG differs from the computed delta. A missing value never
# compares equal, so such rows pass the filter and are removed by dropna().
lags[lags.TIMELAG.ne(lags.FISTDATE_NEXTDATE_DELTA_MONTHS)].dropna()

The information in the transformed feature DOB is already contained in the feature AGE, so we can drop DOB_DELTA_YEARS. Likewise, TIMELAG already holds the difference in months between FISTDATE and NEXTDATE, so this delta can also be safely removed, together with the original features.

In [None]:
# Remove the two computed deltas that duplicate AGE and TIMELAG
learning = learning.drop(
    columns=['DOB_DELTA_YEARS', 'FISTDATE_NEXTDATE_DELTA_MONTHS'],
)

## Ordinal Features