# Exploratory Data Analysis
This notebook contains all code for the prelimiatory analysis of the KDD Cup 98 datasets

In [1]:
%load_ext autoreload

In [10]:
%autoreload 2
import os
import numpy as np
import pandas as pd
from scipy import stats

if not os.getcwd()[-4:] == 'code':
    os.chdir("../code")
import kdd98.data_loader as dl
import kdd98.utils_transformer as ut
from kdd98.transformers import *
from kdd98.config import App

In [4]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# seaborn config
import seaborn as sns
sns.set(color_codes=True)
sns.set_style('ticks')
sns.axes_style({'spines.right': False,
                'axes.spines.top': False})
sns.set_palette(App.config("color_palette"))
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (12, 8)

# figures:
# Where to save the figures
PROJECT_ROOT_DIR = "../../"
CHAPTER_ID = "eda"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "figures", CHAPTER_ID)

if not os.path.exists(IMAGES_PATH):
    os.makedirs(IMAGES_PATH)


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## Loading the learning dataset


Set working directory to main code folder

In [5]:
data_loader = dl.KDD98DataLoader("cup98LRN.txt")
learning = data_loader.get_dataset()

## Overview

A first, general look at the data structure:

In [5]:
learning.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95412 entries, 95515 to 185114
Columns: 478 entries, ODATEDW to GEOCODE2
dtypes: category(24), datetime64[ns](53), float64(50), int64(297), object(54)
memory usage: 333.4+ MB


* There are 478 features
* A total of 95412 examples
* 24 categorical features, 53 datetime features, 50 continuous features, 297 discrete features and 54 string features

In [6]:
learning.head()

Unnamed: 0_level_0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,RECINHSE,...,TARGET_D,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,CLUSTER2,GEOCODE2
CONTROLN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
95515,1989-01-01,GRI,0,IL,61081.0,,,1937-12-01,0,,...,0,0,L,4,E,X,X,X,39.0,C
148535,1994-01-01,BOA,1,CA,91326.0,,,1952-02-01,0,,...,0,0,L,2,G,X,X,X,1.0,A
15078,1990-01-01,AMH,1,NC,27017.0,,,NaT,0,,...,0,1,L,4,E,X,X,X,60.0,C
172556,1987-01-01,BRY,0,CA,95953.0,,,1928-01-01,0,,...,0,1,L,4,E,X,X,X,41.0,C
7112,1986-01-01,,0,FL,33176.0,,,1920-01-01,0,X,...,0,1,L,2,F,X,X,X,26.0,A


### Numerical Features

In [7]:
numerical = learning.select_dtypes(include=np.number).columns
print(numerical)

Index(['ZIP', 'AGE', 'NUMCHLD', 'INCOME', 'HIT', 'MBCRAFT', 'MBGARDEN',
       'MBBOOKS', 'MBCOLECT', 'MAGFAML',
       ...
       'RAMNT_24', 'RAMNTALL', 'NGIFTALL', 'CARDGIFT', 'MINRAMNT', 'MAXRAMNT',
       'LASTGIFT', 'TIMELAG', 'AVGGIFT', 'CLUSTER2'],
      dtype='object', length=347)


### Categorical Features

Categories were defined on import of the csv data. The categories were identified in the dataset dictionary.

In [8]:
categories = learning.select_dtypes(include='category').columns
print(categories)

Index(['STATE', 'PVASTATE', 'DOMAIN', 'CLUSTER', 'CHILD03', 'CHILD07',
       'CHILD12', 'CHILD18', 'GENDER', 'WEALTH1', 'DATASRCE', 'SOLP3', 'SOLIH',
       'WEALTH2', 'GEOCODE', 'LIFESRC', 'TARGET_D', 'RFA_2R', 'RFA_2F',
       'RFA_2A', 'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2'],
      dtype='object')


In [9]:
learning.loc[:, categories].describe()

Unnamed: 0,STATE,PVASTATE,DOMAIN,CLUSTER,CHILD03,CHILD07,CHILD12,CHILD18,GENDER,WEALTH1,...,GEOCODE,LIFESRC,TARGET_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2
count,95412,1458,93096,93096,1146,1566,1811,2847,92455,50680,...,15244,41380,95412,95412,95412,95412,95412,95412,95412,95093
unique,57,2,16,53,3,3,3,3,6,10,...,7,3,71,1,4,4,5,4,5,4
top,CA,P,R2,40,M,M,M,M,F,9,...,12,2,0,L,1,F,X,X,X,A
freq,17343,1453,13623,3979,869,1061,1149,1442,51277,7585,...,3914,20027,90569,95412,47675,46964,95118,95118,95118,34484


### Object Features

These features have mixed datatypes and are encoded as strings. This hints at noisy data and features that will have to be transformed before becoming usable.

In [10]:
objects = learning.select_dtypes(include='object').columns
print(objects)

Index(['OSOURCE', 'TCODE', 'MAILCODE', 'NOEXCH', 'RECINHSE', 'RECP3',
       'RECPGVG', 'RECSWEEP', 'AGEFLAG', 'HOMEOWNR', 'MAJOR', 'COLLECT1',
       'VETERANS', 'BIBLE', 'CATLG', 'HOMEE', 'PETS', 'CDPLAY', 'STEREO',
       'PCOWNERS', 'PHOTO', 'CRAFTS', 'FISHER', 'GARDENIN', 'BOATS', 'WALKER',
       'KIDSTUFF', 'CARDS', 'PLATES', 'PEPSTRFL', 'RFA_3', 'RFA_4', 'RFA_5',
       'RFA_6', 'RFA_7', 'RFA_8', 'RFA_9', 'RFA_10', 'RFA_11', 'RFA_12',
       'RFA_13', 'RFA_14', 'RFA_15', 'RFA_16', 'RFA_17', 'RFA_18', 'RFA_19',
       'RFA_20', 'RFA_21', 'RFA_22', 'RFA_23', 'RFA_24', 'TARGET_B',
       'HPHONE_D'],
      dtype='object')


In [11]:
learning.loc[:, objects].describe()

Unnamed: 0,OSOURCE,TCODE,MAILCODE,NOEXCH,RECINHSE,RECP3,RECPGVG,RECSWEEP,AGEFLAG,HOMEOWNR,...,RFA_17,RFA_18,RFA_19,RFA_20,RFA_21,RFA_22,RFA_23,RFA_24,TARGET_B,HPHONE_D
count,94484,95412,1399,95405,6703,2017,114,1617,65864,73184,...,67762,74149,70920,45212,60200,69764,39138,58439,95412,95412
unique,895,55,1,2,1,1,1,1,2,2,...,117,121,107,79,101,116,86,96,2,2
top,MBC,0,B,0,X,X,X,X,E,H,...,A1E,A1E,A1E,A1E,A1E,A1E,A1F,A1E,0,1
freq,4539,40917,1399,95085,6703,2017,114,1617,57344,52354,...,6773,7186,7248,6408,6729,7233,4607,7227,90569,47765


### Date features
These are imported as strings and will have to be transformed later on to become useful.

In [12]:
dates = learning.loc[:, dl.date_features]
dates.describe()

Unnamed: 0,ODATEDW,DOB,ADATE_2,ADATE_3,ADATE_4,ADATE_5,ADATE_6,ADATE_7,ADATE_8,ADATE_9,...,RDATE_21,RDATE_22,RDATE_23,RDATE_24,LASTDATE,MINRDATE,MAXRDATE,FISTDATE,NEXTDATE,MAXADATE
count,95412,71692,95412,93462,93221,61822,91855,86538,91901,84167,...,9513,20873,7859,17738,95412,95412,95412,95410,85439,95412
unique,54,935,2,2,8,1,2,3,5,3,...,12,13,17,14,24,146,150,176,188,5
top,1995-01-01 00:00:00,1948-01-01 00:00:00,1997-06-01 00:00:00,1996-06-01 00:00:00,1996-04-01 00:00:00,1996-04-01 00:00:00,1996-03-01 00:00:00,1996-02-01 00:00:00,1996-01-01 00:00:00,1995-11-01 00:00:00,...,1994-11-01 00:00:00,1994-09-01 00:00:00,1994-08-01 00:00:00,1994-07-01 00:00:00,1995-12-01 00:00:00,1996-02-01 00:00:00,1995-12-01 00:00:00,1995-01-01 00:00:00,1995-04-01 00:00:00,1997-02-01 00:00:00
freq,15358,1479,95399,93444,92405,61822,91804,81512,85468,80718,...,5006,11195,4522,7861,19896,3041,10563,2957,2253,95014
first,1983-06-01 00:00:00,1901-01-01 00:00:00,1997-04-01 00:00:00,1996-04-01 00:00:00,1995-11-01 00:00:00,1996-04-01 00:00:00,1996-01-01 00:00:00,1995-12-01 00:00:00,1995-11-01 00:00:00,1995-09-01 00:00:00,...,1994-09-01 00:00:00,1994-09-01 00:00:00,1993-09-01 00:00:00,1993-09-01 00:00:00,1995-03-01 00:00:00,1975-06-01 00:00:00,1975-10-01 00:00:00,1949-12-01 00:00:00,1972-11-01 00:00:00,1996-08-01 00:00:00
last,1997-01-01 00:00:00,1997-10-01 00:00:00,1997-06-01 00:00:00,1996-06-01 00:00:00,1996-09-01 00:00:00,1996-04-01 00:00:00,1996-03-01 00:00:00,1996-02-01 00:00:00,1996-05-01 00:00:00,1995-11-01 00:00:00,...,1995-08-01 00:00:00,1995-10-01 00:00:00,1995-07-01 00:00:00,1995-04-01 00:00:00,1997-02-01 00:00:00,1997-02-01 00:00:00,1997-02-01 00:00:00,1996-03-01 00:00:00,1997-02-01 00:00:00,1997-02-01 00:00:00


## Cleaning

We will leverage scikit-learn's transformer classes, and add our own custom transformers. This might on first glance look as a tedious way to clean data. However, it will be very powerful later on. The transformer's parameters are actually hyperparameters in model selection. This means that a grid-search can be employed to evaluate several different strategies for i.e. imputation of missing values, cutoff thresholds for sparse features and so on and find the best preprocessing steps.

sklearn doc:

* http://scikit-learn.org/dev/modules/generated/sklearn.compose.ColumnTransformer.html
* http://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
* http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [79]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
#from category_encoders.hashing import HashingEncoder  # Use custom edits in local file instead

### Binary features

For these, we will convert the values specified as True and False as per the dataset dictionary into 1.0 and 0.0 respectively. Furthermore, input errors are also being treated. In the end, these features will be of dtype float64, having {1.0, 0.0 and NaN} as values.

For features that either have a value representing True or are empty (as specified in the dataset dictionary), all empty cells will be considered False. For features specifically denoting True and False values, these will be coded appropriately and empty cells set to NaN.

In [80]:
print(dl.binary_features)

['MAILCODE', 'NOEXCH', 'RECSWEEP', 'RECINHSE', 'RECP3', 'RECPGVG', 'AGEFLAG', 'HOMEOWNR', 'MAJOR', 'COLLECT1', 'BIBLE', 'CATLG', 'HOMEE', 'PETS', 'CDPLAY', 'STEREO', 'PCOWNERS', 'PHOTO', 'CRAFTS', 'FISHER', 'GARDENIN', 'BOATS', 'WALKER', 'KIDSTUFF', 'CARDS', 'PLATES', 'PEPSTRFL', 'TARGET_B', 'HPHONE_D', 'VETERANS']


In [81]:
%autoreload 2
binary_transformers = ColumnTransformer([
    ("binary_x_bl",
     BinaryFeatureRecode(value_map={'true': 'X', 'false': ' '}, correct_noisy=False),
     ['PEPSTRFL', 'NOEXCH', 'MAJOR', 'RECINHSE', 'RECP3', 'RECPGVG', 'RECSWEEP']
     ),
    ("binary_y_n",
     BinaryFeatureRecode(value_map={'true': 'Y', 'false': 'N'}, correct_noisy=False),
     ['COLLECT1', 'VETERANS', 'BIBLE', 'CATLG', 'HOMEE', 'PETS', 'CDPLAY', 'STEREO',
      'PCOWNERS', 'PHOTO', 'CRAFTS', 'FISHER', 'GARDENIN',  'BOATS', 'WALKER', 'KIDSTUFF',
      'CARDS', 'PLATES']
     ),
    ("binary_e_i",
     BinaryFeatureRecode(value_map={'true': "E", 'false': 'I'}, correct_noisy=False),
     ['AGEFLAG']
     ),
    ("binary_h_u",
     BinaryFeatureRecode(value_map={'true': "H", 'false': 'U'}, correct_noisy=False),
     ['HOMEOWNR']),
    ("binary_b_bl",
     BinaryFeatureRecode(value_map={'true': 'B', 'false': ' '}, correct_noisy=False),
     ['MAILCODE']
     ),
    ("binary_1_0",
     BinaryFeatureRecode(value_map={'true': '1', 'false': '0'}, correct_noisy=False),
     ['HPHONE_D', 'TARGET_B']
     )
])

In [82]:
binarys = binary_transformers.fit_transform(learning)

In [83]:
binary_feature_names = [n[n.find('__')+2:]
                 for n in binary_transformers.get_feature_names()]

In [84]:
binarys = pd.DataFrame(data=binarys, columns=binary_feature_names,
                     index=learning.index)

In [85]:
binarys.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95412 entries, 95515 to 185114
Data columns (total 30 columns):
PEPSTRFL    45269 non-null float64
NOEXCH      95405 non-null float64
MAJOR       294 non-null float64
RECINHSE    6703 non-null float64
RECP3       2017 non-null float64
RECPGVG     114 non-null float64
RECSWEEP    1617 non-null float64
COLLECT1    5202 non-null float64
VETERANS    10426 non-null float64
BIBLE       8871 non-null float64
CATLG       7865 non-null float64
HOMEE       887 non-null float64
PETS        14326 non-null float64
CDPLAY      12254 non-null float64
STEREO      12794 non-null float64
PCOWNERS    10481 non-null float64
PHOTO       4786 non-null float64
CRAFTS      8176 non-null float64
FISHER      7130 non-null float64
GARDENIN    13402 non-null float64
BOATS       2028 non-null float64
WALKER      10501 non-null float64
KIDSTUFF    1536 non-null float64
CARDS       1041 non-null float64
PLATES      560 non-null float64
AGEFLAG     65864 non-null floa

Several features contain only very few actual feature values. These might get dropped by the sparsity transformer later on.

In [86]:
learning[binary_feature_names] = binarys

### Dates

There are several date features. ODATEDW is the date the record was added, DOB the birth date. ADATE_* and RDATE_* are from the promotion history. ADATE_* is the date of a mailing, RDATE_* the date the donation for the corresponding mailing was received. While these dates are not of particular interest (very low variance), the time it took to respond might be.
Furthermore, there are the features MINRDATE, MAXRDATE, MAXADATE, FISTDATE, NEXTDATE and LASTDATE coming from the giving history file.

Three different transformations are applied:

1. ODATEDW, DOB: Years before 1997 -> membership duration, age
2. Giving history features: Relative time in months to 1997/06/01
3. For the promotion history, as specified above, the time for response in months

There are redundant features which can be safely removed, as is shown below:

1. FISTDATE and NEXTDATE are contained in TIMELAG, the number of months between first and second donation
2. DOB, the date of birth, is contained in the feature AGE

In [87]:
print(dl.date_features)

['ODATEDW', 'DOB', 'ADATE_2', 'ADATE_3', 'ADATE_4', 'ADATE_5', 'ADATE_6', 'ADATE_7', 'ADATE_8', 'ADATE_9', 'ADATE_10', 'ADATE_11', 'ADATE_12', 'ADATE_13', 'ADATE_14', 'ADATE_15', 'ADATE_16', 'ADATE_17', 'ADATE_18', 'ADATE_19', 'ADATE_20', 'ADATE_21', 'ADATE_22', 'ADATE_23', 'ADATE_24', 'RDATE_3', 'RDATE_4', 'RDATE_5', 'RDATE_6', 'RDATE_7', 'RDATE_8', 'RDATE_9', 'RDATE_10', 'RDATE_11', 'RDATE_12', 'RDATE_13', 'RDATE_14', 'RDATE_15', 'RDATE_16', 'RDATE_17', 'RDATE_18', 'RDATE_19', 'RDATE_20', 'RDATE_21', 'RDATE_22', 'RDATE_23', 'RDATE_24', 'LASTDATE', 'MINRDATE', 'MAXRDATE', 'FISTDATE', 'NEXTDATE', 'MAXADATE']


Now, we transform the dates from the giving history. First, we create two dataframes with the sending dates of the mailings and the dates when the gift (donation) for these was received.

In [88]:
don_hist_transformer = ColumnTransformer([
    ("months_to_donation",
     MonthsToDonation(),
     dl.promo_history_dates+dl.giving_history_dates
     )
])

In [89]:
donation_responses = don_hist_transformer.fit_transform(learning)

In [90]:
don_hist_feature_names = [n[n.find('__')+2:]
                 for n in don_hist_transformer.get_feature_names()]

In [91]:
donation_responses = pd.DataFrame(
    donation_responses, index=learning.index, columns=don_hist_feature_names)

In [92]:
learning = learning.merge(donation_responses, on=learning.index.name)

Time delta computation of the remaining features with either a specific reference or the date of the most recent mailing as a reference:

* Time since last donation, minimum- and maximum donation and receiving most recent promotion
* Delta between first and next donation
* Age, years of membership

In [93]:
timedelta_transformer = ColumnTransformer([
    ("time_last_donation", DeltaTime(unit='months'), ['LASTDATE','MINRDATE','MAXRDATE','MAXADATE']),
    ("delta_first_next", DeltaTime(reference_date=learning.NEXTDATE), ['FISTDATE']),
    ("membership_years", DeltaTime(unit='years'),['ODATEDW', 'DOB'])
])

In [94]:
timedeltas = timedelta_transformer.fit_transform(learning)

In [95]:
timedelta_feature_names = [n[n.find('__')+2:]
                 for n in timedelta_transformer.get_feature_names()]

In [96]:
timedeltas = pd.DataFrame(timedeltas, index=learning.index,columns=timedelta_feature_names)

In [97]:
timedeltas.columns

Index(['LASTDATE_DELTA_MONTHS', 'MINRDATE_DELTA_MONTHS',
       'MAXRDATE_DELTA_MONTHS', 'MAXADATE_DELTA_MONTHS',
       'FISTDATE_NEXTDATE_DELTA_MONTHS', 'ODATEDW_DELTA_YEARS',
       'DOB_DELTA_YEARS'],
      dtype='object')

In [98]:
learning = learning.merge(timedeltas, on=learning.index.name)
learning.drop(dl.date_features, axis=1,inplace=True)

Studying redundance of DOB <-> AGE and \[FISTDATE, NEXTDATE\] <-> TIMELAG

In [99]:
ages = pd.DataFrame([learning.AGE, timedeltas.DOB_DELTA_YEARS]).T

In [100]:
ages.loc[ages.AGE != ages.DOB_DELTA_YEARS,:].dropna()

Unnamed: 0_level_0,AGE,DOB_DELTA_YEARS
CONTROLN,Unnamed: 1_level_1,Unnamed: 2_level_1


In [101]:
lags = pd.DataFrame([learning.TIMELAG, timedeltas.FISTDATE_NEXTDATE_DELTA_MONTHS]).T

In [102]:
lags.loc[lags.TIMELAG != lags.FISTDATE_NEXTDATE_DELTA_MONTHS,:].dropna()

Unnamed: 0_level_0,TIMELAG,FISTDATE_NEXTDATE_DELTA_MONTHS
CONTROLN,Unnamed: 1_level_1,Unnamed: 2_level_1


The transformed feature DOB is represented in the feature AGE already. So we can drop DOB_DELTA_YEARS. TIMELAG already holds the difference in months between FISTDATE and NEXTDATE, so this delta can also be safely removed together with the original features

In [103]:
learning.drop(['DOB_DELTA_YEARS', 'FISTDATE_NEXTDATE_DELTA_MONTHS'], axis=1,inplace=True)

### Categories


Some categories are already created on import of the data. Additionally, we will have to treat some special cases:

* Multibyte features. These are features that group together several related nominal features. These are mainly the promotion history codes. Recency, Frequency and Amount as of a particular mailing are glued together in one feature. For RFA_2 and additionally MDMAUD, the major donor matrix, the features were already spread out by the supplier of the data. These two were dropped on import of the CSV file and their spread out features kept.

* OSOURCE: It identifies the origin of the data for a particular record. However, it has so many levels that the feature space would get inflated heavily by one-hot encoding. For this feature, hasing is employed.

* TCODE: Special treatment will also be necessary for the TCODE feature. It describes the title code (Ms., Hon., and so on) in an unfortunate integer coding ranging from 1e0 to 1e4. We will also use the hasing encoder for these features

After having the categorical features ready, missing values are assigned their own category, 'missing'. Then, all non-hashed categorical features are one-hot encoded.

In [104]:
learning.select_dtypes(include="category").columns

Index(['STATE', 'PVASTATE', 'DOMAIN', 'CLUSTER', 'CHILD03', 'CHILD07',
       'CHILD12', 'CHILD18', 'GENDER', 'WEALTH1', 'DATASRCE', 'SOLP3', 'SOLIH',
       'WEALTH2', 'GEOCODE', 'LIFESRC', 'TARGET_D', 'RFA_2R', 'RFA_2F',
       'RFA_2A', 'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2'],
      dtype='object')

#### Treating multibyte features, OSOURCE and TCODE:

In [105]:
print(dl.nominal_features)

['OSOURCE', 'TCODE', 'RFA_3', 'RFA_4', 'RFA_5', 'RFA_6', 'RFA_7', 'RFA_8', 'RFA_9', 'RFA_10', 'RFA_11', 'RFA_12', 'RFA_13', 'RFA_14', 'RFA_15', 'RFA_16', 'RFA_17', 'RFA_18', 'RFA_19', 'RFA_20', 'RFA_21', 'RFA_22', 'RFA_23', 'RFA_24']


https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087

https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159

The hashing transformer hashes the nominal feature values into an 8 bit representation. If more than one feature is passed in, they all get encoded into the same 8 bits, therefore in effect reducing the dimensionality of the data.

In [108]:
hash_transformer = ColumnTransformer([
    ("hash_osource", HashingEncoder(), ['OSOURCE'])
])

multibyte_transformer = ColumnTransformer([
    ("promotion_history_spreader",
     MultiByteExtract(["R", "F", "A"]),
     dl.nominal_features[2:]
    )
])

Apply the transormations to all RFA_* features and the OSOURCE feature and extract the new feature names to build a pandas dataframe:

In [109]:
hashes = hash_transformer.fit_transform(learning)
feature_names_h = [n[n.find('__')+2:]
                 for n in hash_transformer.get_feature_names()]

In [110]:
multibytes = multibyte_transformer.fit_transform(learning)
feature_names_m = [n[n.find('__')+2:]
                 for n in multibyte_transformer.get_feature_names()]

Merge learning and the new nominal features, then drop the originals

In [111]:
multibytes = pd.DataFrame(data=multibytes, columns=feature_names_m,
                   index=learning.index).astype("category")
learning = learning.merge(multibytes, on=learning.index.name)

In [112]:
hashes = pd.DataFrame(data=hashes, columns=feature_names_h,
                   index=learning.index)

In [113]:
learning = learning.merge(hashes, on=learning.index.name)
learning = learning.drop(dl.nominal_features, axis=1)

In [116]:
for cat in learning.select_dtypes(include="category").columns:
    learning[cat] = learning[cat].cat.remove_unused_categories()
    print("Feature: {}\n{}".format(cat, learning[cat].cat.categories))

Feature: STATE
Index(['AA', 'AE', 'AK', 'AL', 'AP', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC',
       'DE', 'FL', 'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA',
       'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH',
       'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN',
       'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY'],
      dtype='object')
Feature: PVASTATE
Index(['E', 'P'], dtype='object')
Feature: DOMAIN
Index(['C1', 'C2', 'C3', 'R1', 'R2', 'R3', 'S1', 'S2', 'S3', 'T1', 'T2', 'T3',
       'U1', 'U2', 'U3', 'U4'],
      dtype='object')
Feature: CLUSTER
Index(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48',
       '49', '50', '51', '52', '53'],
      dtype='

#### Ordinal features

Several ordinal features are present. We need to ensure to encode the levels correctly.

When the order is obvious, no order has to be passed in (i.e. 0 < 1 < 2 < 3 < ... and alphabetical)

In [155]:
def make_ordered(feature):
    try:
        learning[feature] = learning[feature].cat.as_ordered()
    except AttributeError as e:
         learning[feature] = learning[feature].astype("category").cat.as_ordered()

In [156]:
for c in ['WEALTH1','WEALTH2','INCOME']+list(learning.filter(regex="RFA_\d{1,2}F").columns)+list(learning.filter(regex="RFA_\d{1,2}A").columns):
    make_ordered(c)

In [157]:
for x in X_trans:
    print(learning[x].cat.ordered)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


Adding a new level 'missing' to each category to encode NaN's

In [158]:
int_cols = learning.select_dtypes(include="int64").columns
learning[int_cols] = learning[int_cols].astype("float64")

In [159]:
learning.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95412 entries, 95515 to 185114
Columns: 502 entries, STATE to OSOURCE_7
dtypes: category(91), float64(411)
memory usage: 308.2 MB


## Actual EDA

A look at the label (amount donated in US dollars)

In [None]:
fig = sns.barplot(x = [0,1], y = learning.groupby('TARGET_B')['TARGET_B'].count()/len(learning.index),
                  palette=App.config("color_palette_binary"));
fig.set_xticklabels(["No", "Yes"]);
plt.xlabel("Donated");
plt.ylabel("Percentage of examples");
save_fig(fig_id="label_ratio_binary");

In [None]:
fig = sns.distplot(learning.loc[learning.TARGET_D > 0, ('TARGET_D')], bins=100, hist_kws={'alpha': 0.9}, color=App.config("color_palette")[0])
plt.ylabel("Percentage of donors");
save_fig('label_distribution')

In [None]:
learning.loc[learning.TARGET_D > 0.0, 'TARGET_D'].median()

* The label is imbalanced, with roughly 95% / 5%
* Most donations are below 20 dollars. The median is 13 \$
* Spikes are visible for 5, 10, 15, 20, 25, 50, 100 and 200 $
* The distribution is right-skewed

Checking the claim from the documentation that donations are positively correlated with the time since the last donation. We plot the duration since the last gift against the donation amount for the current campaign. The marker size indicates the total amount an example has donated so far.

It is evident that from a lag of &geq; 15 months, donations increase indeed, and over the whole spectrum of amounts. We see a marked difference in 100- and 50 $ donations.

In [None]:
sns.scatterplot(x='LASTDATE_DELTA_MONTHS',y='TARGET_D', size='RAMNTALL', alpha=0.6, data=learning.loc[learning.TARGET_D > 0,:],
                palette=App.config("color_palette_binary"))
plt.xlabel("Months passed without a donation before the current donation");
plt.ylabel("Amount donated, $");
save_fig(fig_id="donations_vs_time_since_last")

### Socio-economic environment and label

Donations by living environment (C=City, R=Rural, S=Suburban, T=Town,U=Urban; lowest numbers represent highest socio-economic ranking). Major donors versus non-major donors.

Surprisingly, one of the top donations came from a rural region of low socio-economic status. Major donors that donated this time are not present in the lowest socio-economic environments.

In [None]:
sns.violinplot(y="TARGET_D", x="DOMAIN", hue='MAJOR',cut=0, data=learning.loc[learning.TARGET_D > 0,:],
               palette=App.config("color_palette_binary"))
plt.xlabel("Living environment and socio-economic status");
plt.ylabel("Amount donated, $");
save_fig(fig_id="donations_vs_living_environment")

All-time donations by environment. The y- axis is in log scale. We see now that each socio-economic environment also harbours major donors.

In [None]:
fig=sns.boxplot(y="RAMNTALL", x="DOMAIN", hue='MAJOR', data=learning,palette=App.config("color_palette_binary"))
fig.set_yscale('log')
plt.xlabel("Living environment and socio-economic status");
plt.ylabel("Lifetime amount donated, $");
save_fig(fig_id="donations_vs_living_environment")

### Correlations

Since there are so many features, we will plot those who have a significant correlation only.

In [None]:
corr_all = learning.drop(['TARGET_B','TARGET_D'], axis=1).corr()

In [None]:
mask_all = np.zeros_like(corr_all, dtype=np.bool)
mask_all[np.triu_indices_from(mask_all)] = True

sns.heatmap(corr_all,
            cmap=App.config("color_map_diverging"), mask=mask_all, vmax=1.0, center = 0.0, square=True,
            linewidths = 0)

### Correlations between numerical features, excluding US census data

In [None]:
data_exclude_census_numeric = learning[learning.columns.difference(dl.us_census_features)].select_dtypes(include=["float64"])

In [None]:
data_exclude_census_corr = data_exclude_census_numeric[data_exclude_census_numeric.columns.difference(['TARGET_B','TARGET_D'])].corr()

In [None]:
mask_census = np.zeros_like(data_exclude_census_corr, dtype=np.bool)
mask_census[np.triu_indices_from(mask_census)] = True

sns.heatmap(data_exclude_census_corr, mask=mask_census, cmap=App.config("color_map_diverging"), vmax=1.0, center=0,
            square=True, linewidths=.1, cbar_kws={"shrink": .5}, xticklabels=True,yticklabels=True)

### Promotion history correlations

In [None]:
prom_hist_f = list(donation_responses.columns)+list(multibytes.columns)+dl.promo_history_summary
promotion_history_features = learning.reindex(columns=prom_hist_f)
prom_hist_corr = promotion_history_features[promotion_history_features.columns.difference(['TARGET_B','TARGET_D'])].corr()

In [None]:
mask_promo = np.zeros_like(prom_hist_corr, dtype=np.bool)
mask_promo[np.triu_indices_from(mask_promo)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 20))

sns.heatmap(prom_hist_corr, mask=mask_promo, cmap=App.config("color_map_diverging"), vmax=1.0, center=0,
            square=True, linewidths=.3, cbar_kws={"shrink": .5}, xticklabels=True,yticklabels=True)
save_fig(fig_id="correlations_promotion_giving_history")

### Giving history correlations

In [None]:
giving_hist_f = list(donation_responses.columns) + dl.giving_history + dl.giving_history_summary +['LASTDATE_DELTA_MONTHS', 'MINRDATE_DELTA_MONTHS',
       'MAXRDATE_DELTA_MONTHS', 'MAXADATE_DELTA_MONTHS']
giving_history_features = learning.loc[:,giving_hist_f]
giving_corr = giving_history_features[giving_history_features.columns.difference(['TARGET_B','TARGET_D'])]

In [None]:
mask_giving = np.zeros_like(giving_corr, dtype=np.bool)
mask_giving[np.triu_indices_from(mask_giving)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 20))

sns.heatmap(giving_corr, mask=mask_giving, cmap=App.config("color_map_diverging"), vmax=1.0, center=0,
            square=True, linewidths=.1, cbar_kws={"shrink": .5}, xticklabels=True,yticklabels=True)
save_fig(fig_id="correlations_giving_history")

### Puttting donors on a map

In [None]:
num_donors_by_zip = learning[['ZIP', 'TARGET_B']].groupby('ZIP', as_index=False).agg('sum') # number of people who donated
num_members_by_zip = learning[['ZIP', 'TARGET_B']].groupby('ZIP', as_index=False).agg('count') # number of people who are registered at that ZIP
cum_donation_by_zip = learning[['ZIP', 'TARGET_D']].groupby('ZIP', as_index=False).agg('sum')
zip_states = learning[['ZIP','STATE']].drop_duplicates()

In [None]:
data_by_zip = cum_donation_by_zip.merge(num_members_by_zip, on='ZIP').merge(zip_states, on='ZIP')
data_by_zip.columns = ["ZIP", "CumDonation", "MemberCount", "State"]

In [None]:
def rel_donation(row):
    if row.CumDonation != 0.0:
        return row.CumDonation/(1.0 if row.MemberCount == 0.0 else row.MemberCount)
    else:
        return 0.0

data_by_zip['RelDonation'] = data_by_zip.apply(rel_donation,axis=1)

In [None]:
from geopy.geocoders import Here
from geopy.extra.rate_limiter import RateLimiter
from geopy.exc import GeocoderTimedOut

def do_geo_query(q):
    geolocator = Here(app_id="ZJBxigwxa1QPHlWrtWH6", app_code="OJBun02aepkFbuHmYn1bOg")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0.01, max_retries=4)
    try:
        return geolocator.geocode(query=q, exactly_one=True)
    except GeocoderTimedOut:
        return do_geo_query(q)

def get_loc(example):
    if example.ZIP:
        zip = str(int(example.ZIP)).rjust(5, '0')
        q = {'postalcode': zip, 'state': example.State}
        return do_geo_query(q)
    else:
        return None
    
def extract_coords(location):
    return [location.latitude, location.longitude]

In [None]:
import pickle
from tqdm import tqdm

tqdm.pandas()

try:
    zip_data = open("zip_data.pkl", "rb")
    locations = pickle.load(zip_data)
    zip_data.close()
except Exception as e:
    locations = data_by_zip.progress_apply(get_loc, axis=1)
    locations = pd.DataFrame(locations, columns="location")
    locations['ZIP'] = data_by_zip.ZIP
    zip_data = open("zip_data.pkl", "wb")
    pickle.dump(locations, zip_data)
    zip_data.close()


In [None]:
data_by_zip = data_by_zip.merge(locations, on='ZIP')

In [None]:
data_by_zip.loc[:,'longitude'] = data_by_zip.location.apply(lambda l: l.longitude if l != None else None)
data_by_zip.loc[:,'latitude'] = data_by_zip.location.apply(lambda l: l.latitude if l != None else None)

AA, AE and AP stand for armed services. ZIP codes don't work here, they point anywhere. Also, we only include locations where someone has actually donated by filtering on CumDonation.

In [None]:
data_by_zip1 = data_by_zip.loc[data_by_zip.State != ['AA','AE','AP'],:]
data_by_zip2 = data_by_zip1.loc[data_by_zip1.CumDonation > 0.0,:]

In [None]:
import cartopy.crs as ccrs
import cartopy.io.img_tiles as cimgt
import cartopy.feature as cfeature
fig = plt.figure(figsize=(20,16))

osm_terrain = cimgt.OSM()


ax = fig.add_subplot(1, 1, 1, projection=osm_terrain.crs)

ax.set_extent([-166, -65, 10, 65], crs=ccrs.PlateCarree())
ax.add_image(osm_terrain, 6)

lon = data_by_zip2.longitude
lat = data_by_zip2.latitude
mc = data_by_zip2.MemberCount
cd = data_by_zip2.CumDonation
rd = data_by_zip2.RelDonation

data_by_zip2.plot(kind="scatter",x="longitude",y="latitude",ax=ax,
                  s=cd, c=rd, label="Cumulative Donations",
                  legend=True, alpha=0.5, cmap=App.config("color_map"),
                  subplots=True, colorbar=True, transform=ccrs.PlateCarree())
            
save_fig(fig_id="donations_geographical")
plt.show()

* Most donations come from the urban areas, especially San Francisco, Los Angeles, Miami, Chicago and Detroit. To a lesser extent, cities like Houston, Dallas, Minneapolis, Atlanta, Tampa, Seattle and Phoenix can be made out.
* Interestingly, the East Coast has not donated, despite featuring some large metropolitan areas like New York, Boston, or Washington

### Categorical features

In [200]:
categories = learning.select_dtypes("category").copy()
target = categories['TARGET_D']
categories = categories.drop('TARGET_D', axis=1)
#categories['TARGET_B'] = learning.TARGET_B.astype("category")
#categories['TARGET_D'] = learning.TARGET_D
#categories_grouped = categories.groupby('TARGET_B')

In [202]:
from sklearn.linear_model import LogisticRegression

In [210]:
lm = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
lm.fit(np.ndarray(categories),y=np.ndarray(target))

ValueError: sequence too large; cannot be greater than 32

In [194]:
pd.crosstab(categories.TARGET_D,[categories.INCOME],margins=True)

INCOME,1.0,2.0,3.0,4.0,5.0,6.0,7.0,All
TARGET_D,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,8646,12482,8135,12092,14639,7347,7045,70386
1,0,1,0,1,1,0,0,3
10,92,136,92,122,133,66,79,720
10.7,0,0,0,0,1,0,0,1
100,0,3,1,2,4,3,3,16
11,10,19,10,18,19,9,13,98
12,12,22,11,33,24,12,8,122
12.5,0,0,0,0,6,2,2,10
13,8,9,4,7,10,6,4,48
13.92,0,1,0,0,0,0,0,1


### The US census data

In [None]:
census = learning[dl.us_census_features]
census_corr = census.corr()

In [None]:
mask = np.zeros_like(census_corr, dtype=np.binary)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(12, 12))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(census_corr, mask=mask, cmap=cmap, vmax=1.0, center=0,
            square=True, linewidths=.2, cbar_kws={"shrink": .5})
save_fig(fig_id="correlation_census")

In [None]:
census.select_dtypes(include="int64")

### Income, Wealth and donations

In [None]:
inc_targ = sns.violinplot(x="INCOME", y="TARGET_D", data=learning.loc[learning.TARGET_D > 0.0, ["INCOME","TARGET_D"]])
inc_targ.set_yscale('log')
plt.show()

In [None]:
weal1_targ = sns.violinplot(x="WEALTH1", y="TARGET_D", data=learning.loc[learning.TARGET_D > 0.0, ["WEALTH1","TARGET_D"]])
weal1_targ.set_yscale('log')
plt.show()

In [None]:
weal2_targ = sns.violinplot(x="WEALTH2", y="TARGET_D", data=learning.loc[learning.TARGET_D > 0.0, ["WEALTH2","TARGET_D"]])
weal2_targ.set_yscale('log')
plt.show()

In [None]:
sns.catplot(x="WEALTH2", y="TARGET_D", hue="MAJOR",
            kind="violin", inner="stick", split=True, data=learning.loc[learning.TARGET_D > 0.0,:])

In [None]:
sns.catplot(x="CLUSTER", y="TARGET_D", kind="box", data=learning)

In [None]:
sns.distplot(learning.loc[learning.TARGET_D > 0.0,
                          'TARGET_D'], bins=50, kde=False, rug=True)

In [None]:
learning.select_dtypes(include=np.float).hist(bins=50, figsize=(50, 50))
plt.show()
save_fig("float_feature_histograms")

### Some promising fetures and their impact on the label

### Boruta

In [225]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

In [239]:
X = learning.dropna(axis=1)
y = X['TARGET_D'].values
X = X.drop(['TARGET_B','TARGET_D'],axis=1)
cats = X.select_dtypes(include='category').columns
dummies = pd.get_dummies(X[cats])
X = X.drop(cats, axis=1)
X = X.merge(dummies,on=X.index).dropna().values

In [241]:
X

array([[9.55150e+04, 6.10810e+04, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.48535e+05, 9.13260e+04, 1.60000e+01, ..., 0.00000e+00,
        0.00000e+00, 1.00000e+00],
       [1.50780e+04, 2.70170e+04, 2.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [1.89641e+05, 4.89100e+04, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [4.69300e+03, 9.13200e+04, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 0.00000e+00],
       [1.85114e+05, 2.84090e+04, 3.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.00000e+00]])

In [240]:
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='auto', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)

TypeError: must be str, not ValueError

# PCA

A first look at important features

In [None]:
from sklearn import decomposition
from kdd98.transformers import *

In [None]:
X = learning.drop(["TARGET_B", "TARGET_D", "TCODE"], axis=1)
X.info()

In [None]:
X.select_dtypes(include="category").columns

In [None]:
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("one_hot",  OneHotEncoder(impute_missing=True,use_cat_names=True,return_df=True))
])

categories_transformer = ColumnTransformer([
    ("cat_encoder",
     cat_pipe,
     list(X.select_dtypes(include="category").columns))
])

In [None]:
cats = categories_transformer.fit_transform(X)

In [None]:
print(list(categories_transformer.named_transformers_.cat_encoder.named_steps.one_hot.get_feature_names()))

In [None]:
cats = pd.DataFrame(cats, columns = list(categories_transformer.named_transformers_.cat_encoder.named_steps.one_hot.get_feature_names()), index=X.index)

In [None]:
X = X.merge(cats, on=X.index)

In [None]:
X.drop(X.select_dtypes(include="category").columns,axis=1,inplace=True)

In [None]:
X_centered = X - X.mean()

In [None]:
X_centered = X_centered.dropna()

In [None]:
X_centered[X.select_dtypes(include="object").columns] = X_centered[X.select_dtypes(include="object").columns].astype("float64")

In [None]:
X_centered.describe()

In [None]:
pca = decomposition.PCA()
pca.fit(X_centered,)
result = pd.DataFrame(pca.transform(X_centered), columns=[
                      "PCA%i" % i for i in range(n_comp)], index=X.index)