In [1]:
%load_ext autoreload

In [2]:
%run ./common_init.ipynb

Setup logging to file: out.log
Figure output directory saved in figure_output at /home/datarian/OneDrive/unine/Master_Thesis/ma-thesis-report/figures


In [3]:
%autoreload 2
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import HashingEncoder, OneHotEncoder, OrdinalEncoder

# Load custom code
import kdd98.data_handler as dh
import kdd98.utils_transformer as ut
from kdd98.transformers import *
from kdd98.config import Config

Using TensorFlow backend.


In [4]:
# Figures produced by this notebook are stored below <figure_output>/preprocessing.
IMAGES_PATH = pathlib.Path(figure_output/'preprocessing')

# Create the directory (including missing parents); an existing directory is fine.
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure to a file below IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        Used as the file suffix and as the ``format`` given to ``plt.savefig``.
    resolution : int, default 300
        Forwarded to ``plt.savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = pathlib.Path(IMAGES_PATH) / file_name
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [5]:
# Data provider from the project's data handler module, created for the file "cup98LRN.txt".
data_provider = dh.KDD98DataProvider("cup98LRN.txt")

In [6]:
# Working frame for this notebook: the provider's `preprocessed_data`.
learning_preprocessed = data_provider.preprocessed_data

In [7]:
from kdd98.transformers import ZipToCoords
from category_encoders import BinaryEncoder, OneHotEncoder

## Feature engineering

### Encode zip codes as coordinates
Instead of one-hot encoding the zip codes, which would lead to a significant increase in dimensionality (there are 16488 zip codes), they are transformed to their centroid coordinates. This gives an intuitive measure of geographical relation between examples.

The coordinates are first looked up in a database from the 2018 US census; if they are not found there, the HERE geolocator web service is queried.

For military zip codes, there are no coordinates available. These are set to lat=0, lon=0.

In [8]:
# Number of distinct values in the ZIP column.
len(learning_preprocessed["ZIP"].unique())

16488

In [9]:
# Run the custom ZipToCoords transformer on the ZIP and STATE columns.
zip_to_coords = ColumnTransformer([("zip_to_coords", ZipToCoords(),
                                    ["ZIP", "STATE"])])
coords = zip_to_coords.fit_transform(learning_preprocessed)
# ColumnTransformer prefixes every feature name with "<transformer name>__".
# Strip that prefix so the column names follow the same convention as the
# other engineered features in this notebook.
coords_names = [n.split('__', 1)[-1]
                for n in zip_to_coords.get_feature_names()]
coords = pd.DataFrame(data=coords, index=learning_preprocessed.index, columns=coords_names)

In [10]:
# Attach the coordinate columns, matching rows on the name of the shared index.
# NOTE: re-running this cell merges `coords` a second time.
learning_preprocessed = pd.merge(
    learning_preprocessed,
    coords,
    on=learning_preprocessed.index.name,
)

In [11]:
# ZIP is now represented by its coordinates; rebind instead of mutating in place.
learning_preprocessed = learning_preprocessed.drop(columns="ZIP")

### Converting dates

There are several date features. ODATEDW is the date the record was added, DOB the birth date. ADATE_* and RDATE_* are from the promotion history. ADATE_* is the date of a mailing, RDATE_* the date the donation for the corresponding mailing was received. While these dates are not of particular interest (very low variance), the time it took to respond might be.
Furthermore, there are the features MINRDATE, MAXRDATE, MAXADATE, FISTDATE, NEXTDATE and LASTDATE coming from the giving history file.

In [12]:
# List the feature names the data handler declares as date features.
print(dh.DATE_FEATURES)

['ODATEDW', 'DOB', 'ADATE_2', 'ADATE_3', 'ADATE_4', 'ADATE_5', 'ADATE_6', 'ADATE_7', 'ADATE_8', 'ADATE_9', 'ADATE_10', 'ADATE_11', 'ADATE_12', 'ADATE_13', 'ADATE_14', 'ADATE_15', 'ADATE_16', 'ADATE_17', 'ADATE_18', 'ADATE_19', 'ADATE_20', 'ADATE_21', 'ADATE_22', 'ADATE_23', 'ADATE_24', 'RDATE_3', 'RDATE_4', 'RDATE_5', 'RDATE_6', 'RDATE_7', 'RDATE_8', 'RDATE_9', 'RDATE_10', 'RDATE_11', 'RDATE_12', 'RDATE_13', 'RDATE_14', 'RDATE_15', 'RDATE_16', 'RDATE_17', 'RDATE_18', 'RDATE_19', 'RDATE_20', 'RDATE_21', 'RDATE_22', 'RDATE_23', 'RDATE_24', 'LASTDATE', 'MINRDATE', 'MAXRDATE', 'FISTDATE', 'NEXTDATE', 'MAXADATE']


The following helper function filters lists of feature names, keeping only the features that are still present in the data (some were removed during preprocessing).

In [13]:
# Snapshot of the column names at this point; it is not refreshed when columns
# are added or dropped further down.
ALL_FEATURES = learning_preprocessed.columns.tolist()
def filter_features(features, available=None):
    """Return the entries of ``features`` that are known column names.

    The order (and any duplicates) of ``features`` is preserved.

    Parameters
    ----------
    features : iterable of str
        Candidate feature names.
    available : iterable of str, optional
        Names to filter against. Defaults to the module-level ``ALL_FEATURES``
        snapshot, which does not reflect columns added or dropped after it was
        taken; pass e.g. ``learning_preprocessed.columns`` to filter against
        the current state of the frame.

    Returns
    -------
    list of str
        The names from ``features`` contained in ``available``.
    """
    if available is None:
        available = ALL_FEATURES
    # Set lookup avoids rescanning the full column list for every candidate.
    known = set(available)
    return [f for f in features if f in known]

In [14]:
# Display the date features that filter_features reports as present.
learning_preprocessed[filter_features(dh.DATE_FEATURES)]

Unnamed: 0_level_0,ODATEDW,ADATE_5,ADATE_7,ADATE_8,ADATE_9,ADATE_10,ADATE_11,ADATE_12,ADATE_13,ADATE_14,...,RDATE_16,RDATE_17,RDATE_18,RDATE_19,RDATE_21,RDATE_22,RDATE_24,LASTDATE,MINRDATE,MAXRDATE
CONTROLN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
95515,8901,9604,9602,9601,9511,9510,9510,9508,9507,9506,...,9505,9503,,,,,9406,9512,9208,9402
148535,9401,9604,9602,9601,9511,9510,9510,9509,,,...,9504,,,,,,,9512,9310,9512
15078,9001,9604,9602,9601,9511,,9510,9508,9507,9506,...,9504,,9501,,,9409,9406,9512,9111,9207
172556,8701,9604,9602,9601,9511,,9510,9508,9507,9506,...,9505,9503,,,9411,,,9512,8711,9411
7112,8601,9604,9512,9601,9511,9510,9509,9508,9502,9506,...,,,,,,,,9601,9310,9601
47784,9401,9604,9602,9601,9511,9510,9510,9509,9507,9506,...,,,9506,,,,9407,9506,9407,9412
62117,8701,,9602,9601,9511,9510,9510,9508,9507,9506,...,9504,,,,,9410,,9504,8705,9410
109359,9401,9604,9602,9601,9511,9510,9510,9509,9507,9506,...,9504,,,,,,9407,9508,9507,9508
75768,8801,9604,9602,9601,9511,9510,9509,9508,9507,9506,...,,,,,9411,,,9507,8809,9312
49909,9401,,9602,9601,9511,9511,9511,9509,9507,,...,9504,,,,,,,9504,9309,9504


#### Donation history
From ADATE_*, the date a letter was sent, and RDATE_*, the date a donation was received, we can calculate the time in months it took to respond with a donation.

In [15]:
# Apply the custom MonthsToDonation transformer to the promotion- and
# giving-history date columns that are present.
# NOTE(review): the reference date here is 1998-06-01, whereas the time-delta
# cell further down uses 1997-06-01 -- confirm which one is intended.
don_history = ColumnTransformer(
    [("months_to_donation",
      MonthsToDonation(reference_date=pd.datetime(1998, 6, 1)),
      filter_features(dh.PROMO_HISTORY_DATES + dh.GIVING_HISTORY_DATES))])
donation_history = don_history.fit_transform(learning_preprocessed)
# Strip the "<transformer name>__" prefix added by ColumnTransformer. split()
# leaves a name untouched when it has no prefix, whereas slicing at find()+2
# would silently cut off its first character.
donation_history_names = [n.split('__', 1)[-1]
                          for n in don_history.get_feature_names()]
donation_history = pd.DataFrame(data=donation_history, index=learning_preprocessed.index, columns=donation_history_names)

In [16]:
# Attach the donation-history columns, matching rows on the name of the shared index.
# NOTE: re-running this cell merges `donation_history` a second time.
learning_preprocessed = pd.merge(
    learning_preprocessed,
    donation_history,
    on=learning_preprocessed.index.name,
)

In [17]:
# The raw promotion/giving date columns are no longer needed once the derived
# features exist; rebind instead of mutating in place.
learning_preprocessed = learning_preprocessed.drop(
    columns=filter_features(dh.PROMO_HISTORY_DATES + dh.GIVING_HISTORY_DATES))

#### Time since donations, membership years
The time deltas for LASTDATE (last time a donation received), MINRDATE (when the smallest donation was received), MAXRDATE (when the largest donation was received) and MAXADATE (when the most recent promotion was sent) are expressed in months before the reference date (which is the sending date of the last promotion).

Membership years are also computed against the reference date of the last promotion sent out.

In [18]:
# Two applications of the custom DeltaTime transformer against 1997-06-01:
# one in months for the donation/promotion dates, one in years for ODATEDW/DOB.
# filter_features skips any of the listed columns it does not report as present.
t_deltas = ColumnTransformer(
    [("time_last_donation",
      DeltaTime(reference_date=pd.datetime(1997, 6, 1), unit="months"),
      filter_features(["LASTDATE", "MINRDATE", "MAXRDATE", "MAXADATE"])),
     ("membership_years",
      DeltaTime(reference_date=pd.datetime(1997, 6, 1), unit="years"),
      filter_features(["ODATEDW", "DOB"]))])
timedeltas = t_deltas.fit_transform(learning_preprocessed)
# Strip the "<transformer name>__" prefix added by ColumnTransformer. split()
# leaves a name untouched when it has no prefix, whereas slicing at find()+2
# would silently cut off its first character.
timedeltas_names = [n.split('__', 1)[-1]
                    for n in t_deltas.get_feature_names()]
timedeltas = pd.DataFrame(data=timedeltas, index=learning_preprocessed.index, columns=timedeltas_names)

In [19]:
# Inspect the derived time-delta columns.
timedeltas

Unnamed: 0_level_0,LASTDATE_DELTA_MONTHS,MINRDATE_DELTA_MONTHS,MAXRDATE_DELTA_MONTHS,ODATEDW_DELTA_YEARS
CONTROLN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
95515,18,58,40,9
148535,18,44,18,4
15078,18,67,59,8
172556,18,115,31,11
7112,17,44,17,12
47784,24,35,30,4
62117,26,121,32,11
109359,22,23,22,4
75768,23,105,42,10
49909,26,45,26,4


In [20]:
# Attach the time-delta columns, matching rows on the name of the shared index.
# NOTE: re-running this cell merges `timedeltas` a second time.
learning_preprocessed = pd.merge(
    learning_preprocessed,
    timedeltas,
    on=learning_preprocessed.index.name,
)

In [21]:
# Remove the date columns that were just turned into deltas; rebind instead of
# mutating in place.
learning_preprocessed = learning_preprocessed.drop(
    columns=filter_features(["LASTDATE", "MINRDATE", "MAXRDATE", "MAXADATE", "ODATEDW"]))

There are redundant features which can be safely removed, as is shown below:

1. FISTDATE and NEXTDATE are contained in TIMELAG, the number of months between first and second donation
2. DOB, the date of birth, is contained in the feature AGE

### Categoricals

In [22]:
# All columns stored with the pandas "category" dtype.
CATEGORICAL_FEATURES = (
    learning_preprocessed
    .select_dtypes(include="category")
    .columns
    .values
    .tolist()
)
# Features that are binary-encoded further down.
BE_CATEGORICALS = ['OSOURCE', 'TCODE', 'STATE', 'CLUSTER']
# Every remaining categorical is one-hot encoded.
OHE_CATEGORICALS = [name for name in CATEGORICAL_FEATURES
                    if name not in BE_CATEGORICALS]
OHE_CATEGORICALS

['GENDER',
 'DATASRCE',
 'GEOCODE',
 'LIFESRC',
 'GEOCODE2',
 'RFA_3R',
 'RFA_4R',
 'RFA_5R',
 'RFA_6R',
 'RFA_7R',
 'RFA_8R',
 'RFA_9R',
 'RFA_10R',
 'RFA_11R',
 'RFA_12R',
 'RFA_13R',
 'RFA_14R',
 'RFA_15R',
 'RFA_16R',
 'RFA_17R',
 'RFA_18R',
 'RFA_19R',
 'RFA_20R',
 'RFA_21R',
 'RFA_22R',
 'RFA_23R',
 'RFA_24R',
 'DOMAINUrbanicity']

In [23]:
# Display the categorical columns selected for one-hot encoding.
learning_preprocessed[OHE_CATEGORICALS]

Unnamed: 0_level_0,GENDER,DATASRCE,GEOCODE,LIFESRC,GEOCODE2,RFA_3R,RFA_4R,RFA_5R,RFA_6R,RFA_7R,...,RFA_16R,RFA_17R,RFA_18R,RFA_19R,RFA_20R,RFA_21R,RFA_22R,RFA_23R,RFA_24R,DOMAINUrbanicity
CONTROLN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
95515,F,,,,C,S,S,S,S,S,...,S,S,S,S,S,S,S,S,S,T
148535,M,3,02,,A,A,A,A,A,A,...,L,,,N,N,N,N,,F,S
15078,M,3,,,C,S,S,S,S,S,...,S,,S,S,,,S,S,S,R
172556,F,3,,,C,S,S,S,S,S,...,S,S,S,A,A,A,A,,,R
7112,F,3,,3,A,A,A,A,A,I,...,L,A,A,A,A,A,I,A,A,S
47784,,,,,C,A,A,A,A,A,...,N,N,N,N,N,N,N,,F,T
62117,F,3,,3,D,A,A,,A,S,...,S,S,S,A,,A,A,A,A,T
109359,F,1,,,C,A,A,A,A,A,...,N,N,N,N,N,N,N,,F,T
75768,M,3,,,D,A,A,A,A,S,...,A,A,A,A,A,A,A,A,A,R
49909,M,,03,,B,A,A,,A,A,...,A,A,A,N,N,N,N,,,T


In [25]:
# Distinct values (including missing) observed in DOMAINUrbanicity.
learning_preprocessed.DOMAINUrbanicity.unique()

[T, S, R, U, C, NaN]
Categories (5, object): [T, S, R, U, C]

In [30]:
# Show the DOMAINUrbanicity column itself.
learning_preprocessed.DOMAINUrbanicity

CONTROLN
95515     T
148535    S
15078     R
172556    R
7112      S
47784     T
62117     T
109359    T
75768     R
49909     T
106016    R
60127     S
85548     R
12890     T
134891    T
143689    U
64667     S
98090     R
35557     S
42556     C
82943     S
72675     R
190166    T
92152     C
82229     T
160963    T
89160     S
102610    T
122772    T
97870     T
         ..
56972     S
22658     T
126131    U
93718     U
157506    S
31573     C
46748     S
139193    T
98104     T
23868     T
132458    R
17039     C
35112     R
104515    R
12322     S
131980    T
78831     S
29549     T
38061     S
109741    C
47945     R
84678     R
58178     R
156106    C
35088     S
184568    C
122706    C
189641    C
4693      C
185114    C
Name: DOMAINUrbanicity, Length: 95412, dtype: category
Categories (5, object): [C, R, S, T, U]

In [32]:
# Trial run: one-hot encode DOMAINUrbanicity alone through a ColumnTransformer
# and display the resulting array.
ohe_dom = ColumnTransformer([
    (
        "oh",
        OneHotEncoder(handle_missing="return_nan", use_cat_names=True),
        ["DOMAINUrbanicity"],
    ),
])
ohe_dom.fit_transform(learning_preprocessed)

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]])

In [33]:
# Names of the columns produced by the trial encoder.
ohe_dom.get_feature_names()

['oh__DOMAINUrbanicity_C',
 'oh__DOMAINUrbanicity_R',
 'oh__DOMAINUrbanicity_S',
 'oh__DOMAINUrbanicity_T',
 'oh__DOMAINUrbanicity_U',
 'oh__DOMAINUrbanicity_nan']

In [39]:
# Same encoder without the ColumnTransformer wrapper, fitted directly on the
# Series; count the NaN entries in each output column.
ohe_bare = OneHotEncoder(
    handle_unknown="return_nan",
    handle_missing="return_nan",
    use_cat_names=True,
)
(
    ohe_bare
    .fit_transform(learning_preprocessed.DOMAINUrbanicity)
    .isna()
    .sum()
)

DOMAINUrbanicity_T      2316
DOMAINUrbanicity_S      2316
DOMAINUrbanicity_R      2316
DOMAINUrbanicity_U      2316
DOMAINUrbanicity_C      2316
DOMAINUrbanicity_nan    2316
dtype: int64

#### Binary Encoding

Now, the nominals (categorical features with string levels) are worked on. Those categoricals with high cardinality (many levels) are binary-encoded so as to not increase dimensionality too much.

The remaining features are one-hot encoded.
https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159

In [28]:
# One BinaryEncoder per feature, each configured with handle_missing="return_nan".
# The transformers are named "be_<feature in lower case>" and keep the order
# OSOURCE, STATE, CLUSTER, TCODE.
binary_encode = ColumnTransformer([
    ("be_" + feature.lower(),
     BinaryEncoder(handle_missing="return_nan"),
     filter_features([feature]))
    for feature in ['OSOURCE', 'STATE', 'CLUSTER', 'TCODE']
])
binary_encoded_categories = binary_encode.fit_transform(learning_preprocessed)
# Strip the "<transformer name>__" prefix added by ColumnTransformer. split()
# leaves a name untouched when it has no prefix, whereas slicing at find()+2
# would silently cut off its first character.
binary_encode_names = [n.split('__', 1)[-1]
                       for n in binary_encode.get_feature_names()]
binary_encoded_categories = pd.DataFrame(data=binary_encoded_categories, index=learning_preprocessed.index, columns=binary_encode_names)


In [29]:
# Attach the binary-encoded columns, matching rows on the name of the shared index.
# NOTE: re-running this cell merges `binary_encoded_categories` a second time.
learning_preprocessed = pd.merge(
    learning_preprocessed,
    binary_encoded_categories,
    on=learning_preprocessed.index.name,
)

In [30]:
# The original columns are replaced by their binary encoding; rebind instead
# of mutating in place.
learning_preprocessed = learning_preprocessed.drop(
    columns=filter_features(BE_CATEGORICALS))

#### One-Hot Encoding

In [31]:
# One-hot encode all features in OHE_CATEGORICALS with a single encoder that
# names the output columns after the category values (use_cat_names=True).
one_hot_encoding = ColumnTransformer([("oh",
                                       OneHotEncoder(
                                           use_cat_names=True,
                                           handle_missing="return_nan"),
                                       OHE_CATEGORICALS)])
oh_encoded_categories = one_hot_encoding.fit_transform(learning_preprocessed)
# Strip the "<transformer name>__" prefix added by ColumnTransformer. split()
# leaves a name untouched when it has no prefix, whereas slicing at find()+2
# would silently cut off its first character.
oh_encoded_categories_names = [n.split('__', 1)[-1]
                               for n in one_hot_encoding.get_feature_names()]
oh_encoded_categories = pd.DataFrame(data=oh_encoded_categories, index=learning_preprocessed.index, columns=oh_encoded_categories_names)

In [32]:
# Attach the one-hot encoded columns, matching rows on the name of the shared index.
# NOTE: re-running this cell merges `oh_encoded_categories` a second time.
learning_preprocessed = pd.merge(
    learning_preprocessed,
    oh_encoded_categories,
    on=learning_preprocessed.index.name,
)

In [33]:
# The original columns are replaced by their one-hot encoding; rebind instead
# of mutating in place.
learning_preprocessed = learning_preprocessed.drop(columns=OHE_CATEGORICALS)

### Feature engineering combined

All the above steps are implemented in package kdd98. The data after feature engineering is readily available:

In [34]:
# Fetch the provider's `numeric_data` (per the text above: the data after feature engineering).
learning_numeric = data_provider.numeric_data

In [35]:
# Summary of index, column count, dtypes and memory footprint.
learning_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95412 entries, 95515 to 185114
Columns: 662 entries, RECINHSE to DOMAINUrbanicity_nan
dtypes: Int64(331), float64(38), int64(293)
memory usage: 512.7 MB
