# Exploratory Data Analysis
This notebook contains all code for the prelimiatory analysis of the KDD Cup 98 datasets

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import os
import numpy as np
import pandas as pd
from scipy import stats

os.chdir("../")
import util.data_loader as dl
from kdd98.transformers import *

In [3]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = [20, 16]
plt.rcParams['image.cmap'] = 'viridis'
# seaborn config
import seaborn as sns
sns.set(color_codes=True)
sns.set_style('ticks')

# figures:
# Where to save the figures
PROJECT_ROOT_DIR = "../../"
CHAPTER_ID = "eda"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "figures", CHAPTER_ID)

if not os.path.exists(IMAGES_PATH):
    os.makedirs(IMAGES_PATH)


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## Loading the learning dataset


Set working directory to main code folder

In [4]:
%autoreload 2  # automatically reloads modules
data_loader = dl.KDD98DataLoader("cup98LRN.txt")
learning = data_loader.get_dataset()

## Overview

A first, general look at the data structure:

In [5]:
learning.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95412 entries, 95515 to 185114
Columns: 478 entries, ODATEDW to GEOCODE2
dtypes: category(24), float64(51), int64(302), object(101)
memory usage: 333.4+ MB


### Numerical Features

In [6]:
numerical = learning.select_dtypes(include=np.number).columns
print(numerical)

Index(['AGE', 'NUMCHLD', 'INCOME', 'HIT', 'MBCRAFT', 'MBGARDEN', 'MBBOOKS',
       'MBCOLECT', 'MAGFAML', 'MAGFEM',
       ...
       'MAXRAMNT', 'MAXRDATE', 'LASTGIFT', 'LASTDATE', 'FISTDATE', 'NEXTDATE',
       'TIMELAG', 'AVGGIFT', 'TARGET_D', 'CLUSTER2'],
      dtype='object', length=353)


### Categorical Features

Categories were defined on import of the csv data. The categories were identified in the dataset dictionary.

In [7]:
categories = learning.select_dtypes(include='category').columns
print(categories)

Index(['TCODE', 'STATE', 'PVASTATE', 'DOMAIN', 'CLUSTER', 'CHILD03', 'CHILD07',
       'CHILD12', 'CHILD18', 'GENDER', 'WEALTH1', 'DATASRCE', 'SOLP3', 'SOLIH',
       'WEALTH2', 'GEOCODE', 'LIFESRC', 'RFA_2R', 'RFA_2F', 'RFA_2A',
       'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2'],
      dtype='object')


In [8]:
learning.loc[:, categories].describe()

Unnamed: 0,TCODE,STATE,PVASTATE,DOMAIN,CLUSTER,CHILD03,CHILD07,CHILD12,CHILD18,GENDER,...,WEALTH2,GEOCODE,LIFESRC,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2
count,95412,95412,95412.0,95412,95412,95412.0,95412.0,95412.0,95412.0,95412,...,51589,95412.0,95412.0,95412,95412,95412,95412,95412,95412,95280
unique,55,57,3.0,17,54,4.0,4.0,4.0,4.0,7,...,10,8.0,4.0,1,4,4,5,4,5,5
top,0,CA,,R2,40,,,,,F,...,9,,,L,1,F,X,X,X,A
freq,40917,17343,93954.0,13623,3979,94266.0,93846.0,93601.0,92565.0,51277,...,6523,80168.0,54032.0,95412,47675,46964,95118,95118,95118,34484


### Object Features

These features have mixed datatypes. This hints at noisy data and features that will have to be transformed before becoming usable.

In [9]:
objects = learning.select_dtypes(include='object').columns
print(objects)

Index(['ODATEDW', 'OSOURCE', 'ZIP', 'MAILCODE', 'DOB', 'NOEXCH', 'RECINHSE',
       'RECP3', 'RECPGVG', 'RECSWEEP',
       ...
       'RDATE_17', 'RDATE_18', 'RDATE_19', 'RDATE_20', 'RDATE_21', 'RDATE_22',
       'RDATE_23', 'RDATE_24', 'TARGET_B', 'HPHONE_D'],
      dtype='object', length=101)


In [10]:
learning.loc[:, objects].describe()

Unnamed: 0,ODATEDW,OSOURCE,ZIP,MAILCODE,DOB,NOEXCH,RECINHSE,RECP3,RECPGVG,RECSWEEP,...,RDATE_17,RDATE_18,RDATE_19,RDATE_20,RDATE_21,RDATE_22,RDATE_23,RDATE_24,TARGET_B,HPHONE_D
count,95412,95412,95412,95412.0,95412,95412,95412.0,95412.0,95412.0,95412.0,...,9401,19778,15877,7888,9513,20873,7859,17738,95412,95412
unique,54,896,19938,2.0,947,3,2.0,2.0,2.0,2.0,...,11,14,13,10,12,13,17,14,2,2
top,9501,MBC,85351,,0,0,,,,,...,9503,9501,9412,9412,9411,9409,9408,9407,0,1
freq,15358,4539,61,94013.0,23661,95085,88709.0,93395.0,95298.0,93795.0,...,4729,10665,12504,4516,5006,11195,4522,7861,90569,47765


### Date features
These are imported as strings and will have to be transformed later on to become useful.

In [11]:
dates = learning.loc[:, dl.date_features]
dates.describe()

Unnamed: 0,ODATEDW,DOB,ADATE_2,ADATE_3,ADATE_4,ADATE_5,ADATE_6,ADATE_7,ADATE_8,ADATE_9,...,RDATE_15,RDATE_16,RDATE_17,RDATE_18,RDATE_19,RDATE_20,RDATE_21,RDATE_22,RDATE_23,RDATE_24
count,95412,95412,95412,93462,93221,61822,91855,86538,91901,84167,...,7262,26994,9401,19778,15877,7888,9513,20873,7859,17738
unique,54,947,2,2,8,1,2,3,5,3,...,16,18,11,14,13,10,12,13,17,14
top,9501,0,9706,9606,9604,9604,9603,9602,9601,9511,...,9505,9504,9503,9501,9412,9412,9411,9409,9408,9407
freq,15358,23661,95399,93444,92405,61822,91804,81512,85468,80718,...,4862,16670,4729,10665,12504,4516,5006,11195,4522,7861


## Cleaning

We will leverage scikit's transformer classes, and add our own transformers. This might on first glance look as a tedious way to clean data. However, it will be very powerful later on. The transformer's parameters are actually hyperparameters in model selection. This means that a grid-search can be employed to evaluate several different strategies for i.e. imputation of missing values, cutoff thresholds for sparse features and so on and find the best preprocessing steps.

sklearn doc:

* http://scikit-learn.org/dev/modules/generated/sklearn.compose.ColumnTransformer.html
* http://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
* http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

### Boolean features

In [13]:
print(dl.boolean_features)

['MAILCODE', 'NOEXCH', 'RECSWEEP', 'RECINHSE', 'RECP3', 'RECPGVG', 'AGEFLAG', 'HOMEOWNR', 'MAJOR', 'COLLECT1', 'BIBLE', 'CATLG', 'HOMEE', 'PETS', 'CDPLAY', 'STEREO', 'PCOWNERS', 'PHOTO', 'CRAFTS', 'FISHER', 'GARDENIN', 'BOATS', 'WALKER', 'KIDSTUFF', 'CARDS', 'PLATES', 'PEPSTRFL', 'TARGET_B', 'HPHONE_D', 'VETERANS']


In [14]:
%autoreload 2
bool_transformers = ColumnTransformer([
    ("bool_x_bl",
     BooleanFeatureRecode(value_map={'true': 'X', 'false': ' '}),
     ['PEPSTRFL', 'NOEXCH', 'MAJOR', 'RECINHSE', 'RECP3', 'RECPGVG', 'RECSWEEP']
     ),
    ("bool_y_n",
     BooleanFeatureRecode(value_map={'true': 'Y', 'false': 'N'}),
     ['COLLECT1', 'VETERANS', 'BIBLE', 'CATLG', 'HOMEE', 'PETS', 'CDPLAY', 'STEREO',
      'PCOWNERS', 'PHOTO', 'CRAFTS', 'FISHER', 'GARDENIN',  'BOATS', 'WALKER', 'KIDSTUFF',
      'CARDS', 'PLATES']
     ),
    ("bool_e_i",
     BooleanFeatureRecode(value_map={'true': "E", 'false': 'I'}),
     ['AGEFLAG']
     ),
    ("bool_h_u",
     BooleanFeatureRecode(value_map={'true': "H", 'false': 'U'}),
     ['HOMEOWNR']),
    ("bool_b_bl",
     BooleanFeatureRecode(value_map={'true': 'B', 'false': ' '}),
     ['MAILCODE']
     ),
    ("bool_1_0",
     BooleanFeatureRecode(value_map={'true': '1', 'false': '0'}),
     ['HPHONE_D', 'TARGET_B']
     )
])

In [15]:
booleans = bool_transformers.fit_transform(learning)

In [16]:
feature_names = [n[n.find('__')+2:]
                 for n in bool_transformers.get_feature_names()]

In [17]:
bools = pd.DataFrame(data=booleans, columns=feature_names,
                     index=learning.index)

In [18]:
bools.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95412 entries, 95515 to 185114
Data columns (total 30 columns):
PEPSTRFL    95412 non-null bool
NOEXCH      95412 non-null bool
MAJOR       95412 non-null bool
RECINHSE    95412 non-null bool
RECP3       95412 non-null bool
RECPGVG     95412 non-null bool
RECSWEEP    95412 non-null bool
COLLECT1    95412 non-null bool
VETERANS    95412 non-null bool
BIBLE       95412 non-null bool
CATLG       95412 non-null bool
HOMEE       95412 non-null bool
PETS        95412 non-null bool
CDPLAY      95412 non-null bool
STEREO      95412 non-null bool
PCOWNERS    95412 non-null bool
PHOTO       95412 non-null bool
CRAFTS      95412 non-null bool
FISHER      95412 non-null bool
GARDENIN    95412 non-null bool
BOATS       95412 non-null bool
WALKER      95412 non-null bool
KIDSTUFF    95412 non-null bool
CARDS       95412 non-null bool
PLATES      95412 non-null bool
AGEFLAG     95412 non-null bool
HOMEOWNR    95412 non-null bool
MAILCODE    95412 non-

In [19]:
learning[dl.boolean_features] = bools

### Zipcode

In [20]:
zip_transformer = ColumnTransformer([
    ("truncate_zip",
     ZipCodeFormatter(),
     ['ZIP']
     )
])

In [21]:
zip = zip_transformer.fit_transform(learning)

In [22]:
learning.ZIP.head()

CONTROLN
95515     61081
148535    91326
15078     27017
172556    95953
7112      33176
Name: ZIP, dtype: object

In [23]:
zip[0:5]

array([['61081'],
       ['91326'],
       ['27017'],
       ['95953'],
       ['33176']], dtype=object)

In [24]:
learning.ZIP = zip
learning.ZIP = learning.ZIP.astype("int", copy=False)

### Categories

Categories identified from the dataset dictionary:

In [25]:
print(dl.categorical_features)

['DOMAIN', 'TCODE', 'STATE', 'PVASTATE', 'CLUSTER', 'CHILD03', 'CHILD07', 'CHILD12', 'CHILD18', 'GENDER', 'DATASRCE', 'SOLP3', 'SOLIH', 'WEALTH1', 'WEALTH2', 'GEOCODE', 'LIFESRC', 'RFA_2R', 'RFA_2A', 'RFA_2F', 'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2']


There is one additional categorical feature, OSOURCE. It identifies the origin of the data for a particular record. However, it has so many levels that the feature space would get inflated heavily by one-hot encoding. It is therefore ignored for the moment.

The following are already coded as categories in the imported dataset:

In [26]:
learning.select_dtypes(include="category").columns

Index(['TCODE', 'STATE', 'PVASTATE', 'DOMAIN', 'CLUSTER', 'CHILD03', 'CHILD07',
       'CHILD12', 'CHILD18', 'GENDER', 'WEALTH1', 'DATASRCE', 'SOLP3', 'SOLIH',
       'WEALTH2', 'GEOCODE', 'LIFESRC', 'RFA_2R', 'RFA_2F', 'RFA_2A',
       'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2'],
      dtype='object')

Additionally, We will also have to add the multibyte features. These are features that group together several related categorical features.

These are mainly the promotion history codes. Recency, Frequency and Amount as of a particular mailing are glued together in one feature. For RFA_2 and additionally MDMAUD, the major donor matrix, the features were already spread out by the supplier of the data. These two were dropped on import of the CSV file and their spread out features kept.

In [27]:
print(dl.nominal_features)

['RFA_3', 'RFA_4', 'RFA_5', 'RFA_6', 'RFA_7', 'RFA_8', 'RFA_9', 'RFA_10', 'RFA_11', 'RFA_12', 'RFA_13', 'RFA_14', 'RFA_15', 'RFA_16', 'RFA_17', 'RFA_18', 'RFA_19', 'RFA_20', 'RFA_21', 'RFA_22', 'RFA_23', 'RFA_24']


In [28]:
multibyte_transformer = ColumnTransformer([
    ("rfa_spread",
     MultiByteExtract(["R", "F", "A"]),
     dl.nominal_features)
])

Apply the transormation to all RFA_* features and extract the new feature names to build a pandas dataframe:

In [29]:
multibytes = multibyte_transformer.fit_transform(learning)
feature_names = [n[n.find('__')+2:]
                 for n in multibyte_transformer.get_feature_names()]

In [30]:
rfa = pd.DataFrame(data=multibytes, columns=feature_names,
                   index=learning.index).astype("category")

Merge learning and the rfa features, then drop the original RFA_* features

In [31]:
# copy = False prevents copying the whole learning dataset around
learning = learning.merge(rfa, on=learning.index.name, copy=False)
learning = learning.drop(dl.nominal_features, axis=1)

In [32]:
print(list(learning.select_dtypes(include="category").columns))

['TCODE', 'STATE', 'PVASTATE', 'DOMAIN', 'CLUSTER', 'CHILD03', 'CHILD07', 'CHILD12', 'CHILD18', 'GENDER', 'WEALTH1', 'DATASRCE', 'SOLP3', 'SOLIH', 'WEALTH2', 'GEOCODE', 'LIFESRC', 'RFA_2R', 'RFA_2F', 'RFA_2A', 'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2', 'RFA_3_R', 'RFA_3_F', 'RFA_3_A', 'RFA_4_R', 'RFA_4_F', 'RFA_4_A', 'RFA_5_R', 'RFA_5_F', 'RFA_5_A', 'RFA_6_R', 'RFA_6_F', 'RFA_6_A', 'RFA_7_R', 'RFA_7_F', 'RFA_7_A', 'RFA_8_R', 'RFA_8_F', 'RFA_8_A', 'RFA_9_R', 'RFA_9_F', 'RFA_9_A', 'RFA_10_R', 'RFA_10_F', 'RFA_10_A', 'RFA_11_R', 'RFA_11_F', 'RFA_11_A', 'RFA_12_R', 'RFA_12_F', 'RFA_12_A', 'RFA_13_R', 'RFA_13_F', 'RFA_13_A', 'RFA_14_R', 'RFA_14_F', 'RFA_14_A', 'RFA_15_R', 'RFA_15_F', 'RFA_15_A', 'RFA_16_R', 'RFA_16_F', 'RFA_16_A', 'RFA_17_R', 'RFA_17_F', 'RFA_17_A', 'RFA_18_R', 'RFA_18_F', 'RFA_18_A', 'RFA_19_R', 'RFA_19_F', 'RFA_19_A', 'RFA_20_R', 'RFA_20_F', 'RFA_20_A', 'RFA_21_R', 'RFA_21_F', 'RFA_21_A', 'RFA_22_R', 'RFA_22_F', 'RFA_22_A', 'RFA_23_R', 'RFA_23_F', 'RFA_23_A', 'RFA_24

### Dates

There are several date features. ODATEDW is the date the record was added, DOB the birth date. ADATE_* and RDATE_* are from the promotion history. ADATE_* is the date of a mailing, RDATE_* the date the donation for the corresponding mailing was received. While these dates are not of particular interest (very low variance), the time it took to respond might be.

Two different transformations are applied:

1. ODATEDW, DOB: Get transformed to years before 1997 -> membership duration, age
2. For the promotion history, as specified above, the time for response in months

In [33]:
print(dl.date_features)

['ODATEDW', 'DOB', 'ADATE_2', 'ADATE_3', 'ADATE_4', 'ADATE_5', 'ADATE_6', 'ADATE_7', 'ADATE_8', 'ADATE_9', 'ADATE_10', 'ADATE_11', 'ADATE_12', 'ADATE_13', 'ADATE_14', 'ADATE_15', 'ADATE_16', 'ADATE_17', 'ADATE_18', 'ADATE_19', 'ADATE_20', 'ADATE_21', 'ADATE_22', 'ADATE_23', 'ADATE_24', 'RDATE_3', 'RDATE_4', 'RDATE_5', 'RDATE_6', 'RDATE_7', 'RDATE_8', 'RDATE_9', 'RDATE_10', 'RDATE_11', 'RDATE_12', 'RDATE_13', 'RDATE_14', 'RDATE_15', 'RDATE_16', 'RDATE_17', 'RDATE_18', 'RDATE_19', 'RDATE_20', 'RDATE_21', 'RDATE_22', 'RDATE_23', 'RDATE_24']


First, parse all date features into datetime values. This also takes care of dates that by the default pivot fall into the 21st century by subtracting 100 years for these.
Invalid dates (shorter than 3 digits, 3 digits are fixed as usually the format is yym) are set to NaT

In [34]:
date_transformer = ColumnTransformer([
    ("parse_dates",
     ParseDates(treat_errors='ignore'),
     dl.date_features
     )
])

In [35]:
dates = date_transformer.fit_transform(learning)

In [36]:
feature_names = [n[n.find('__')+2:]
                 for n in date_transformer.get_feature_names()]
print(feature_names) 

['ODATEDW', 'DOB', 'ADATE_2', 'ADATE_3', 'ADATE_4', 'ADATE_5', 'ADATE_6', 'ADATE_7', 'ADATE_8', 'ADATE_9', 'ADATE_10', 'ADATE_11', 'ADATE_12', 'ADATE_13', 'ADATE_14', 'ADATE_15', 'ADATE_16', 'ADATE_17', 'ADATE_18', 'ADATE_19', 'ADATE_20', 'ADATE_21', 'ADATE_22', 'ADATE_23', 'ADATE_24', 'RDATE_3', 'RDATE_4', 'RDATE_5', 'RDATE_6', 'RDATE_7', 'RDATE_8', 'RDATE_9', 'RDATE_10', 'RDATE_11', 'RDATE_12', 'RDATE_13', 'RDATE_14', 'RDATE_15', 'RDATE_16', 'RDATE_17', 'RDATE_18', 'RDATE_19', 'RDATE_20', 'RDATE_21', 'RDATE_22', 'RDATE_23', 'RDATE_24']


In [37]:
dates = pd.DataFrame(dates, columns=list(
    dl.date_features), index=learning.index)


In [38]:
learning[dl.date_features] = dates

Now, we transform the dates from the giving history. First, we create two dataframes with the sending dates of the mailings and the dates when the gift (donation) for these was received.

In [39]:
don_hist_transformer = ColumnTransformer([
    ("months_to_donation",
     MonthsToDonation(),
     dl.don_hist_dates
     )
])

In [40]:
donation_responses = don_hist_transformer.fit_transform(learning)

In [41]:
feature_names = [n[n.find('__')+2:]
                 for n in don_hist_transformer.get_feature_names()]

In [42]:
donation_responses = pd.DataFrame(
    donation_responses, index=learning.index, columns=feature_names)

In [43]:
donation_responses.MONTHS_TO_DONATION_8.value_counts()

 0.0     73943
 2.0     10239
 1.0      8478
 3.0      2489
 4.0       200
-2.0        29
 5.0         9
-3.0         7
 6.0         6
 7.0         4
-8.0         3
 8.0         2
-11.0        1
-9.0         1
 11.0        1
Name: MONTHS_TO_DONATION_8, dtype: int64

In [44]:
learning = learning.merge(donation_responses, on=learning.index.name)
learning = learning.drop(dl.don_hist_dates, axis=1, inplace=True)

Finally, we compute the membership years and age of the records

In [45]:
age_transformer = ColumnTransformer([
    ("relative_age",
     ComputeAge(),
    ['ODATEDW', 'DOB'])
])

In [46]:
rel_ages = age_transformer.fit_transform(learning)

ValueError: Expected 2D array, got scalar array instead:
array=None.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
feature_names = [n[n.find('__')+2:]
                 for n in age_transformer.get_feature_names()]

In [None]:
rel_ages =pd.DataFrame(rel_ages, index=learning.index,columns=feature_names)

In [None]:
learning[['ODATEDW', 'DOB']] = rel_ages

In [None]:
learning.info()

## Digging through the data

In [None]:
learning.select_dtypes(include=np.float).hist(bins=50, figsize=(50, 50))
save_fig("float_feature_histograms")

### Some promising fetures and their impact on the label

In [None]:
%matplotlib inline
sns.catplot(x="WEALTH2", y="TARGET_D", hue="MAJOR",
            kind="violin", inner="stick", split=True, data=learning)

### Income, Wealth and donations

In [None]:
sns.violinplot(x="INCOME", y="TARGET_D", palette="pastel", data=learning)

In [None]:
sns.violinplot(x="WEALTH1", y="TARGET_D", palette="pastel", data=learning)

### Interests and donations

In [None]:
data = learning_raw.loc[:, dl.interest_features+["TARGET_D"]].fillna(0)
interests = pd.melt(data, value_vars=dl.interest_features,
                    value_name="Interest")
data.head()

Features with constant values:

In [None]:
learning_raw.nunique(axis=1)

### Individual feature properties

Value range, distribution, outliers

### Correlations

-> Product moment covariance

In [None]:
# calculate the correlation matrix
corr = learning_raw.drop(['TARGET_B', 'TARGET_D'], axis=1).corr()

In [None]:
# plot the heatmap
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(12, 12))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.8, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

### Target variable (labels)

In [None]:
%matplotlib inline
sns.catplot(x="WEALTH2", y="TARGET_D", hue="MAJOR",
            kind="violin", inner="stick", split=True,
            palette="pastel", data=learning)

In [None]:
sns.catplot(x="CLUSTER", y="TARGET_D", kind="box", data=learning)

In [None]:
%matplotlib inline
sns.distplot(learning.loc[learning.TARGET_D > 0.0,
                          'TARGET_D'], bins=50, kde=False, rug=True)

### US census data

In [None]:
us_census = ["POP901", "POP902", "POP903", "POP90C1", "POP90C2", "POP90C3", "POP90C4", "POP90C5", "ETH1", "ETH2", "ETH3", "ETH4", "ETH5", "ETH6", "ETH7", "ETH8", "ETH9", "ETH10", "ETH11", "ETH12", "ETH13", "ETH14", "ETH15", "ETH16", "AGE901", "AGE902", "AGE903", "AGE904", "AGE905", "AGE906", "AGE907", "CHIL1", "CHIL2", "CHIL3", "AGEC1", "AGEC2", "AGEC3", "AGEC4", "AGEC5", "AGEC6", "AGEC7", "CHILC1", "CHILC2", "CHILC3", "CHILC4", "CHILC5", "HHAGE1", "HHAGE2", "HHAGE3", "HHN1", "HHN2", "HHN3", "HHN4", "HHN5", "HHN6", "MARR1", "MARR2", "MARR3", "MARR4", "HHP1", "HHP2", "DW1", "DW2", "DW3", "DW4", "DW5", "DW6", "DW7", "DW8", "DW9", "HV1", "HV2", "HV3", "HV4", "HU1", "HU2", "HU3", "HU4", "HU5", "HHD1", "HHD2", "HHD3", "HHD4", "HHD5", "HHD6", "HHD7", "HHD8", "HHD9", "HHD10", "HHD11", "HHD12", "ETHC1", "ETHC2", "ETHC3", "ETHC4", "ETHC5", "ETHC6", "HVP1", "HVP2", "HVP3", "HVP4", "HVP5", "HVP6", "HUR1", "HUR2", "RHP1", "RHP2", "RHP3", "RHP4", "HUPA1", "HUPA2", "HUPA3", "HUPA4", "HUPA5", "HUPA6", "HUPA7", "RP1", "RP2", "RP3", "RP4", "MSA", "ADI", "DMA", "IC1", "IC2", "IC3", "IC4", "IC5", "IC6", "IC7", "IC8", "IC9", "IC10", "IC11", "IC12", "IC13", "IC14", "IC15", "IC16", "IC17",
             "IC18", "IC19", "IC20", "IC21", "IC22", "IC23", "HHAS1", "HHAS2", "HHAS3", "HHAS4", "MC1", "MC2", "MC3", "TPE1", "TPE2", "TPE3", "TPE4", "TPE5", "TPE6", "TPE7", "TPE8", "TPE9", "PEC1", "PEC2", "TPE10", "TPE11", "TPE12", "TPE13", "LFC1", "LFC2", "LFC3", "LFC4", "LFC5", "LFC6", "LFC7", "LFC8", "LFC9", "LFC10", "OCC1", "OCC2", "OCC3", "OCC4", "OCC5", "OCC6", "OCC7", "OCC8", "OCC9", "OCC10", "OCC11", "OCC12", "OCC13", "EIC1", "EIC2", "EIC3", "EIC4", "EIC5", "EIC6", "EIC7", "EIC8", "EIC9", "EIC10", "EIC11", "EIC12", "EIC13", "EIC14", "EIC15", "EIC16", "OEDC1", "OEDC2", "OEDC3", "OEDC4", "OEDC5", "OEDC6", "OEDC7", "EC1", "EC2", "EC3", "EC4", "EC5", "EC6", "EC7", "EC8", "SEC1", "SEC2", "SEC3", "SEC4", "SEC5", "AFC1", "AFC2", "AFC3", "AFC4", "AFC5", "AFC6", "VC1", "VC2", "VC3", "VC4", "ANC1", "ANC2", "ANC3", "ANC4", "ANC5", "ANC6", "ANC7", "ANC8", "ANC9", "ANC10", "ANC11", "ANC12", "ANC13", "ANC14", "ANC15", "POBC1", "POBC2", "LSC1", "LSC2", "LSC3", "LSC4", "VOC1", "VOC2", "VOC3", "HC1", "HC2", "HC3", "HC4", "HC5", "HC6", "HC7", "HC8", "HC9", "HC10", "HC11", "HC12", "HC13", "HC14", "HC15", "HC16", "HC17", "HC18", "HC19", "HC20", "HC21", "MHUC1", "MHUC2", "AC1", "AC2"]
len(us_census)

## Feature Selection
Meant to reduce dimensionality by selecting only features that are 'interesting enough' to be considered in order to boost performance of calculations / improve accuracy of the estimator
- By variance threshold
- Recursive Feature Elimination by Cross-Validation
- L1-based feature selection (Logistic Regression, Lasso, SVM)
- Tree-based feature selection

See [scikit-learn: feature selection](http://scikit-learn.org/stable/modules/feature_selection.html#feature-selection)

### Removing constant features (zero variance)

In [None]:
for column in learning.columns:
    if len(learning[column].unique()) == 1:
        print(column)

### Sparse Features

In [None]:
sparse_features = []
for column in learning:
    top_freq = learning[column].value_counts(normalize=True).iloc[0]
    if top_freq > 0.995:
        sparse_features.append(column)
        print(column+" has a top frequency of: " + str(top_freq))
        print(learning[column].value_counts(normalize=True))

In [None]:
sparse_features

## Feature Extraction
All explanatory fields have to be numerical for the subsequent operations with scikit-learn. Here, the necessary feature extractions are performed.

See [scikit-learn: feature extraction](http://scikit-learn.org/stable/modules/feature_extraction.html)

In [None]:
import pandas as pd

In [None]:
symbolic_features = []
symbolic_features.append(tds.SymbolicFeatureSpreader(
    "DOMAIN", ["U", "S"]))  # Urbanicity, SocioEconomicStatus
# RFA_2 is already spread out
for i in range(3, 25):
    feature = "_".join(["RFA", str(i)])
    symbolic_features.append(tds.SymbolicFeatureSpreader(
        feature, ["R", "F", "A"]))  # Recency, Frequency, Amount

spread_multibyte = pd.DataFrame(index=learning_raw.index)
for f in symbolic_features:
    f.set_tidy_dataset_ref(learning_raw)
    spread_multibyte = pd.concat(
        [spread_multibyte, f.spread(inplace=False)], axis=1)

In [None]:
spread_multibyte.info()

# PCA

A first look at important features

In [None]:
from sklearn import decomposition

In [None]:
X = learning.drop(["TARGET_B", "TARGET_D"], axis=1)

In [None]:
n_comp = 3
pca = decomposition.PCA(n_components=n_comp)
pca.fit(X)
result = pd.DataFrame(pca.transform(X), columns=[
                      "PCA%i" % i for i in range(n_comp)], index=X.index)

In [None]:
import cProfile
domain_spreader = tds.SymbolicFieldToDummies(
    learning, "RFA_24", ["Recency", "Frequency", "Amount"])
cProfile.run('domain_spreader.spread()', sort='time')

In [None]:
learning.head()

In [None]:
import os
import numpy as np
import sys
os.getcwd()
proj_dir = os.path.split(os.getcwd())[0]
if proj_dir not in sys.path:
    sys.path.append(proj_dir)

In [None]:
import eda.tidy_dataset as tds
tidy = tds.TidyDataset("cup98LRN.txt")

In [None]:
raw = tidy.get_raw_data()

In [None]:
spreader = tds.SymbolicFieldToDummies(
    raw, "RFA_24", ["Recency", "Frequency", "Amount"])
spreader.spread()