# Exploratory Data Analysis
This notebook contains all code for the preliminary analysis of the KDD Cup 98 datasets.

In [6]:
# Load IPython's autoreload extension so that the %autoreload magic can be used.
%load_ext autoreload

In [7]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats

# Move one directory up so that the `util` package imported right below is found
# (presumably the project root -- confirm). The guard keeps the cell idempotent:
# without it, every re-run of this cell would climb one directory further.
if not os.path.isdir("util"):
    os.chdir("../")
import util.data_loader as dl
%matplotlib inline

In [8]:
# Plot styling: seaborn defaults with colour shorthand codes enabled,
# the "ticks" axes style, and a 40 x 40 default figure size.
sns.set(color_codes=True)
sns.set_style('ticks')
plt.rc('figure', figsize=[40, 40])

## Loading the learning dataset


Set working directory to main code folder

In [11]:
# Automatically reload modules before executing code, so edits to the project
# modules are picked up without restarting the kernel.
# NOTE: the comment must not share a line with the magic. IPython hands the rest
# of the line to the magic verbatim, so "2 # ..." is not recognised as mode 2.
%autoreload 2

# Load the learning dataset through the project's data loader.
lrn = dl.KDD98DataLoader("cup98LRN.txt")
learning_raw = lrn.get_dataset()

## Overview

A first, general look at the data structure:

In [12]:
# Print a structural summary of the raw learning set (index, column count, dtypes, memory usage).
learning_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95412 entries, 95515 to 185114
Columns: 480 entries, ODATEDW to GEOCODE2
dtypes: category(23), float64(96), int64(306), object(55)
memory usage: 335.6+ MB


### Numerical Features

In [15]:
# Column labels of all features stored as float64 or int64.
numeric_dtypes = ['float64', 'int64']
numerical = learning_raw.select_dtypes(include=numeric_dtypes).columns
print(numerical)

Index(['ODATEDW', 'DOB', 'AGE', 'NUMCHLD', 'INCOME', 'WEALTH1', 'HIT',
       'MBCRAFT', 'MBGARDEN', 'MBBOOKS',
       ...
       'MAXRDATE', 'LASTGIFT', 'LASTDATE', 'FISTDATE', 'NEXTDATE', 'TIMELAG',
       'AVGGIFT', 'TARGET_D', 'RFA_2F', 'CLUSTER2'],
      dtype='object', length=402)


### Categorical Features

Categories were defined on import of the csv data. The categories were identified in the dataset dictionary.

In [13]:
# Column labels of all features that carry pandas' category dtype.
categorical_dtypes = ['category']
categories = learning_raw.select_dtypes(include=categorical_dtypes).columns
print(categories)

Index(['OSOURCE', 'TCODE', 'STATE', 'PVASTATE', 'CLUSTER', 'CHILD03',
       'CHILD07', 'CHILD12', 'CHILD18', 'GENDER', 'DATASRCE', 'SOLP3', 'SOLIH',
       'WEALTH2', 'GEOCODE', 'LIFESRC', 'TARGET_B', 'RFA_2R', 'RFA_2A',
       'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2'],
      dtype='object')


In [14]:
# Summary (count / unique / top / freq) of the categorical columns.
learning_raw[categories].describe()

Unnamed: 0,OSOURCE,TCODE,STATE,PVASTATE,CLUSTER,CHILD03,CHILD07,CHILD12,CHILD18,GENDER,...,WEALTH2,GEOCODE,LIFESRC,TARGET_B,RFA_2R,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2
count,95412,95412,95412,95412.0,95412,95412.0,95412.0,95412.0,95412.0,95412,...,51589,95412.0,95412.0,95412,95412,95412,95412,95412,95412,95280
unique,896,55,57,3.0,54,4.0,4.0,4.0,4.0,7,...,10,8.0,4.0,2,1,4,5,4,5,5
top,MBC,0,CA,,40,,,,,F,...,9,,,0,L,F,X,X,X,A
freq,4539,40917,17343,93954.0,3979,94266.0,93846.0,93601.0,92565.0,51277,...,6523,80168.0,54032.0,90569,95412,46964,95118,95118,95118,34484


### Object Features

These features have mixed datatypes. This hints at noisy data and features that will have to be transformed before becoming usable.

In [16]:
# Column labels of all features that pandas stored with the generic object dtype.
object_dtypes = ['object']
objects = learning_raw.select_dtypes(include=object_dtypes).columns
print(objects)

Index(['ZIP', 'MAILCODE', 'NOEXCH', 'RECINHSE', 'RECP3', 'RECPGVG', 'RECSWEEP',
       'MDMAUD', 'DOMAIN', 'AGEFLAG', 'HOMEOWNR', 'MAJOR', 'COLLECT1',
       'VETERANS', 'BIBLE', 'CATLG', 'HOMEE', 'PETS', 'CDPLAY', 'STEREO',
       'PCOWNERS', 'PHOTO', 'CRAFTS', 'FISHER', 'GARDENIN', 'BOATS', 'WALKER',
       'KIDSTUFF', 'CARDS', 'PLATES', 'PEPSTRFL', 'RFA_2', 'RFA_3', 'RFA_4',
       'RFA_5', 'RFA_6', 'RFA_7', 'RFA_8', 'RFA_9', 'RFA_10', 'RFA_11',
       'RFA_12', 'RFA_13', 'RFA_14', 'RFA_15', 'RFA_16', 'RFA_17', 'RFA_18',
       'RFA_19', 'RFA_20', 'RFA_21', 'RFA_22', 'RFA_23', 'RFA_24', 'HPHONE_D'],
      dtype='object')


In [17]:
# Summary (count / unique / top / freq) of the object-typed columns.
learning_raw[objects].describe()

Unnamed: 0,ZIP,MAILCODE,NOEXCH,RECINHSE,RECP3,RECPGVG,RECSWEEP,MDMAUD,DOMAIN,AGEFLAG,...,RFA_16,RFA_17,RFA_18,RFA_19,RFA_20,RFA_21,RFA_22,RFA_23,RFA_24,HPHONE_D
count,95412,95412.0,95412,95412.0,95412.0,95412.0,95412.0,95412,95412,95412,...,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412
unique,19938,2.0,4,2.0,2.0,2.0,2.0,28,17,3,...,123.0,118.0,122.0,108.0,80.0,102.0,117.0,87.0,97.0,2
top,85351,,0,,,,,XXXX,R2,E,...,,,,,,,,,,1
freq,61,94013.0,95085,88709.0,93395.0,95298.0,93795.0,95118,13623,57344,...,20417.0,27650.0,21263.0,24492.0,50200.0,35212.0,25648.0,56274.0,36973.0,47765


### Date features
These are imported as floats and will have to be transformed later on to become useful.

In [18]:
# Select the columns that the data loader module lists as date features
# and show their descriptive statistics.
date_columns = dl.date_features
dates = learning_raw.loc[:, date_columns]
dates.describe()

Unnamed: 0,ODATEDW,DOB,ADATE_2,ADATE_3,ADATE_4,ADATE_5,ADATE_6,ADATE_7,ADATE_8,ADATE_9,...,ADATE_15,ADATE_16,ADATE_17,ADATE_18,ADATE_19,ADATE_20,ADATE_21,ADATE_22,ADATE_23,ADATE_24
count,95412.0,95412.0,95412.0,93462.0,93221.0,61822.0,91855.0,86538.0,91901.0,84167.0,...,29935.0,75048.0,67762.0,74149.0,70932.0,45212.0,60200.0,69764.0,39142.0,58439.0
mean,9141.363256,2723.602933,9705.999727,9605.999615,9604.015383,9604.0,9602.99889,9601.818507,9594.791384,9510.927074,...,9504.0,9503.018362,9501.920339,9464.210805,9410.955253,9411.00031,9409.92799,9410.041956,9406.940397,9405.995243
std,343.454752,2132.241295,0.023344,0.027753,0.955062,0.0,0.047114,3.354988,22.674375,0.362638,...,0.0,0.146045,0.271803,43.832112,0.293832,0.017594,0.258507,10.344987,2.252852,0.068808
min,8306.0,0.0,9704.0,9604.0,9511.0,9604.0,9601.0,9512.0,9511.0,9509.0,...,9504.0,9502.0,9501.0,9409.0,9409.0,9411.0,9409.0,9408.0,9312.0,9405.0
25%,8801.0,201.0,9706.0,9606.0,9604.0,9604.0,9603.0,9602.0,9601.0,9511.0,...,9504.0,9503.0,9502.0,9412.0,9411.0,9411.0,9410.0,9409.0,9407.0,9406.0
50%,9201.0,2610.0,9706.0,9606.0,9604.0,9604.0,9603.0,9602.0,9601.0,9511.0,...,9504.0,9503.0,9502.0,9501.0,9411.0,9411.0,9410.0,9409.0,9407.0,9406.0
75%,9501.0,4601.0,9706.0,9606.0,9604.0,9604.0,9603.0,9602.0,9601.0,9511.0,...,9504.0,9503.0,9502.0,9501.0,9411.0,9411.0,9410.0,9409.0,9407.0,9406.0
max,9701.0,9710.0,9706.0,9606.0,9609.0,9604.0,9603.0,9602.0,9605.0,9511.0,...,9504.0,9504.0,9503.0,9508.0,9411.0,9412.0,9410.0,9506.0,9407.0,9406.0


In [None]:
# Histogram (50 bins) of every numeric column, all drawn on one very large figure.
# NOTE(review): `learning` is not defined in any cell visible here (only
# `learning_raw` is) -- confirm which frame is meant; as written this raises a
# NameError on a fresh kernel.
learning.select_dtypes(include= np.number).hist(bins=50, figsize=(200,200))
plt.show()

## Digging deeper

### Some promising features and their impact on the label

In [None]:
%matplotlib inline
# Violin plot of TARGET_D for each WEALTH2 level; the MAJOR groups are drawn as
# the two halves of each violin (split=True), with individual observations as sticks.
# NOTE(review): `learning` is not defined in any cell visible here -- confirm
# which frame is meant.
sns.catplot(
    x="WEALTH2",
    y="TARGET_D",
    hue="MAJOR",
    kind="violin",
    inner="stick",
    split=True,
    palette="pastel",
    data=learning,
);

### Interests and donations

In [None]:
# Interest features plus TARGET_D, with missing values replaced by 0.
interest_columns = dl.interest_features + ["TARGET_D"]
data = learning_raw.loc[:, interest_columns].fillna(0)

# Long format with one row per (interest feature, value) pair. No id_vars are
# given, so TARGET_D is not carried over into `interests`.
interests = pd.melt(
    data,
    value_vars=dl.interest_features,
    value_name="Interest",
)

data.head()

Features with constant values:

In [None]:
# Number of distinct (non-null) values per column. axis=0 counts down each
# column; axis=1 would count across each row and say nothing about features.
unique_counts = learning_raw.nunique(axis=0)

# Columns with at most one distinct value are constant.
unique_counts[unique_counts <= 1]

### Individual feature properties

Value range, distribution, outliers

### Correlations

-> Pearson product-moment correlation

In [None]:
# calculate the correlation matrix of the numeric explanatory features
# (both target columns are excluded first).
# Numeric columns are selected explicitly: DataFrame.corr() raises on object /
# category columns in pandas >= 2.0 instead of silently skipping them.
corr = (
    learning_raw
    .drop(['TARGET_B', 'TARGET_D'], axis=1)
    .select_dtypes(include=np.number)
    .corr()
)

In [None]:
# plot the heatmap
# Generate a mask for the upper triangle (diagonal included), so that every
# pair of features is drawn only once.
# The built-in `bool` is used: the `np.bool` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(12, 12))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw onto the axes created above (passed explicitly instead of relying on
# the implicit "current axes").
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.8, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

### Target variable (labels)

In [None]:
%matplotlib inline
# Violin plot of TARGET_D for each WEALTH2 level, split by MAJOR
# (same call as the violin plot in the "Digging deeper" section).
# NOTE(review): `learning` is not defined in any cell visible here -- confirm
# which frame is meant.
sns.catplot(
    x="WEALTH2",
    y="TARGET_D",
    hue="MAJOR",
    kind="violin",
    inner="stick",
    split=True,
    palette="pastel",
    data=learning,
);

In [None]:
# Box plot of TARGET_D for each CLUSTER value.
# NOTE(review): `learning` is not defined in any cell visible here -- confirm
# which frame is meant.
sns.catplot(
    x="CLUSTER",
    y="TARGET_D",
    kind="box",
    data=learning,
);

In [None]:
%matplotlib inline
# Histogram (50 bins, no KDE) with rug marks of the strictly positive TARGET_D values.
# NOTE(review): `learning` is not defined in any cell visible here -- confirm
# which frame is meant.
positive_target = learning.loc[learning.TARGET_D > 0.0, 'TARGET_D']
sns.distplot(positive_target, bins=50, kde=False, rug=True);

### US census data

In [None]:
# Names of the columns grouped here as US census features (hard-coded list);
# the last line of the cell displays how many names the list contains.
us_census = ["POP901", "POP902", "POP903", "POP90C1", "POP90C2", "POP90C3", "POP90C4", "POP90C5", "ETH1", "ETH2", "ETH3", "ETH4", "ETH5", "ETH6", "ETH7", "ETH8", "ETH9", "ETH10", "ETH11", "ETH12", "ETH13", "ETH14", "ETH15", "ETH16", "AGE901", "AGE902", "AGE903", "AGE904", "AGE905", "AGE906", "AGE907", "CHIL1", "CHIL2", "CHIL3", "AGEC1", "AGEC2", "AGEC3", "AGEC4", "AGEC5", "AGEC6", "AGEC7", "CHILC1", "CHILC2", "CHILC3", "CHILC4", "CHILC5", "HHAGE1", "HHAGE2", "HHAGE3", "HHN1", "HHN2", "HHN3", "HHN4", "HHN5", "HHN6", "MARR1", "MARR2", "MARR3", "MARR4", "HHP1", "HHP2", "DW1", "DW2", "DW3", "DW4", "DW5", "DW6", "DW7", "DW8", "DW9", "HV1", "HV2", "HV3", "HV4", "HU1", "HU2", "HU3", "HU4", "HU5", "HHD1", "HHD2", "HHD3", "HHD4", "HHD5", "HHD6", "HHD7", "HHD8", "HHD9", "HHD10", "HHD11", "HHD12", "ETHC1", "ETHC2", "ETHC3", "ETHC4", "ETHC5", "ETHC6", "HVP1", "HVP2", "HVP3", "HVP4", "HVP5", "HVP6", "HUR1", "HUR2", "RHP1", "RHP2", "RHP3", "RHP4", "HUPA1", "HUPA2", "HUPA3", "HUPA4", "HUPA5", "HUPA6", "HUPA7", "RP1", "RP2", "RP3", "RP4", "MSA", "ADI", "DMA", "IC1", "IC2", "IC3", "IC4", "IC5", "IC6", "IC7", "IC8", "IC9", "IC10", "IC11", "IC12", "IC13", "IC14", "IC15", "IC16", "IC17", "IC18", "IC19", "IC20", "IC21", "IC22", "IC23", "HHAS1", "HHAS2", "HHAS3", "HHAS4", "MC1", "MC2", "MC3", "TPE1", "TPE2", "TPE3", "TPE4", "TPE5", "TPE6", "TPE7", "TPE8", "TPE9", "PEC1", "PEC2", "TPE10", "TPE11", "TPE12", "TPE13", "LFC1", "LFC2", "LFC3", "LFC4", "LFC5", "LFC6", "LFC7", "LFC8", "LFC9", "LFC10", "OCC1", "OCC2", "OCC3", "OCC4", "OCC5", "OCC6", "OCC7", "OCC8", "OCC9", "OCC10", "OCC11", "OCC12", "OCC13", "EIC1", "EIC2", "EIC3", "EIC4", "EIC5", "EIC6", "EIC7", "EIC8", "EIC9", "EIC10", "EIC11", "EIC12", "EIC13", "EIC14", "EIC15", "EIC16", "OEDC1", "OEDC2", "OEDC3", "OEDC4", "OEDC5", "OEDC6", "OEDC7", "EC1", "EC2", "EC3", "EC4", "EC5", "EC6", "EC7", "EC8", "SEC1", "SEC2", "SEC3", "SEC4", "SEC5", "AFC1", "AFC2", "AFC3", "AFC4", "AFC5", "AFC6", "VC1", "VC2", "VC3", "VC4", "ANC1", "ANC2", "ANC3", 
"ANC4", "ANC5", "ANC6", "ANC7", "ANC8", "ANC9", "ANC10", "ANC11", "ANC12", "ANC13", "ANC14", "ANC15", "POBC1", "POBC2", "LSC1", "LSC2", "LSC3", "LSC4", "VOC1", "VOC2", "VOC3", "HC1", "HC2", "HC3", "HC4", "HC5", "HC6", "HC7", "HC8", "HC9", "HC10", "HC11", "HC12", "HC13", "HC14", "HC15", "HC16", "HC17", "HC18", "HC19", "HC20", "HC21", "MHUC1", "MHUC2", "AC1", "AC2"]
len(us_census)

## Feature Selection
Feature selection reduces dimensionality by keeping only the features that are 'interesting enough' to be considered, in order to speed up computations and/or improve the accuracy of the estimator. Candidate approaches:
- By variance threshold
- Recursive Feature Elimination by Cross-Validation
- L1-based feature selection (Logistic Regression, Lasso, SVM)
- Tree-based feature selection

See [scikit-learn: feature selection](http://scikit-learn.org/stable/modules/feature_selection.html#feature-selection)

### Removing constant features (zero variance)

In [None]:
# Print the label of every column that holds exactly one distinct value.
# NOTE(review): `learning` is not defined in any cell visible here -- confirm
# which frame is meant.
for column_name in learning.columns:
    distinct_values = learning[column_name].unique()
    if len(distinct_values) == 1:
        print(column_name)

### Sparse Features

In [None]:
# Collect features dominated by a single value: a column is treated as sparse
# when its most frequent value covers more than 99.5 % of its counted entries.
# NOTE(review): `learning` is not defined in any cell visible here -- confirm
# which frame is meant.
sparse_features = []
for column in learning:
    # Relative frequencies, most frequent value first; computed once and reused
    # for both the threshold check and the printout.
    frequencies = learning[column].value_counts(normalize=True)
    if frequencies.empty:
        # No counted values at all (e.g. an all-null column): there is no top
        # frequency to inspect, and .iloc[0] would raise an IndexError.
        continue
    top_freq = frequencies.iloc[0]
    if top_freq > 0.995:
        sparse_features.append(column)
        # str() keeps the message working for non-string column labels.
        print(str(column)+" has a top frequency of: " + str(top_freq))
        print(frequencies)

In [None]:
# Display the list of column labels collected as sparse in the previous cell.
sparse_features

## Feature Extraction
All explanatory fields have to be numerical for the subsequent operations with scikit-learn. Here, the necessary feature extractions are performed.

See [scikit-learn: feature extraction](http://scikit-learn.org/stable/modules/feature_extraction.html)

In [None]:
import pandas as pd

In [None]:
# NOTE(review): `tds` is only imported (`import eda.tidy_dataset as tds`) in a
# later cell of this notebook, so this cell fails on a fresh top-to-bottom run.

# One spreader per multi-byte symbolic field.
symbolic_features = []
symbolic_features.append(tds.SymbolicFeatureSpreader(
    "DOMAIN", ["U", "S"])) #Urbanicity, SocioEconomicStatus
# RFA_2 is already spread out
for i in range(3, 25):
    feature = "RFA_{}".format(i)
    symbolic_features.append(tds.SymbolicFeatureSpreader(
        feature, ["R", "F", "A"])) # Recency, Frequency, Amount

# Gather the spread results and concatenate them in a single call. Growing the
# frame with pd.concat inside the loop re-copies all accumulated columns on
# every iteration. The leading empty frame keeps the index of `learning_raw`,
# as the original accumulator did.
spread_frames = [pd.DataFrame(index=learning_raw.index)]
for spreader in symbolic_features:
    spreader.set_tidy_dataset_ref(learning_raw)
    spread_frames.append(spreader.spread(inplace=False))
spread_multibyte = pd.concat(spread_frames, axis=1)

In [None]:
# Print a structural summary of the frame holding the spread-out symbolic fields.
spread_multibyte.info()

# PCA

A first look at important features

In [None]:
from sklearn import decomposition

In [None]:
# Explanatory features only: both target columns are removed.
# NOTE(review): `learning` is not defined in any cell visible here -- confirm
# which frame is meant.
target_columns = ["TARGET_B", "TARGET_D"]
X = learning.drop(columns=target_columns)

In [None]:
# Project the explanatory features onto the first `n_comp` principal components.
n_comp = 3
# Fixed seed for reproducibility: depending on the scikit-learn version and the
# data size, the default solver choice can be the randomized SVD, whose result
# otherwise varies between runs. The seed has no effect on the exact solvers.
pca = decomposition.PCA(n_components=n_comp, random_state=0)
pca.fit(X)
# One column per component (PCA0, PCA1, ...), aligned with the rows of X.
result = pd.DataFrame(
    pca.transform(X),
    columns=["PCA%i" % i for i in range(n_comp)],
    index=X.index,
)

In [None]:
import cProfile
# Profile a single spread() call for the RFA_24 field; the report is sorted by time.
# NOTE(review): neither `learning` nor `tds` is defined in the cells above this
# one (`tds` is imported further down) -- confirm the intended execution order.
domain_spreader = tds.SymbolicFieldToDummies(learning,"RFA_24",["Recency", "Frequency", "Amount"])
# The statement to profile is passed as a string, so the variable name used
# inside it must match the assignment above.
cProfile.run('domain_spreader.spread()', sort='time')

In [None]:
# Show the first rows of `learning`.
# NOTE(review): `learning` is not defined in any cell visible here -- confirm
# which frame is meant.
learning.head()

In [None]:
import os
import numpy as np
import sys
# Make the parent of the current working directory importable; it is appended
# to sys.path only if it is not already listed there.
proj_dir = os.path.dirname(os.getcwd())
if sys.path.count(proj_dir) == 0:
    sys.path.append(proj_dir)

In [None]:
import eda.tidy_dataset as tds
# Create the project's TidyDataset wrapper for the learning data file.
tidy = tds.TidyDataset("cup98LRN.txt")

In [None]:
# Fetch the raw data from the TidyDataset wrapper
# (presumably an unprocessed DataFrame -- confirm against eda.tidy_dataset).
raw = tidy.get_raw_data()

In [None]:
# Spread the RFA_24 field of the raw data into its three named parts and
# display whatever spread() returns.
rfa_part_names = ["Recency", "Frequency", "Amount"]
spreader = tds.SymbolicFieldToDummies(raw, "RFA_24", rfa_part_names)
spreader.spread()