# Exploratory Data Analysis
This notebook contains all code for the preliminary analysis of the KDD Cup 98 datasets.

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import os
import numpy as np
import pandas as pd
from scipy import stats

# Move up one directory so the project imports below resolve.
# NOTE: the path is relative, so every re-run of this cell moves up one more level.
os.chdir("../")
import util.data_loader as dl
from kdd98.transformers import *

In [3]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = [20, 16]
plt.rcParams['image.cmap'] = 'viridis'
# seaborn config
import seaborn as sns
sns.set(color_codes=True)
sns.set_style('ticks')

# figures:
# Where to save the figures
PROJECT_ROOT_DIR = "../../"
CHAPTER_ID = "eda"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "figures", CHAPTER_ID)

# exist_ok=True keeps the cell safe to re-run and avoids the gap between
# checking for the directory and creating it.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure below ``IMAGES_PATH``.

    Args:
        fig_id: File name without extension; also echoed in the progress message.
        tight_layout: When true, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

## Loading the learning dataset


Set working directory to main code folder

In [4]:
%autoreload 2 # automatically reloads modules
data_loader = dl.KDD98DataLoader("cup98LRN.txt")
learning = data_loader.get_dataset()

## Overview

A first, general look at the data structure:

In [5]:
learning.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95412 entries, 95515 to 185114
Columns: 478 entries, ODATEDW to GEOCODE2
dtypes: category(24), float64(95), int64(305), object(54)
memory usage: 333.4+ MB


### Numerical Features

In [None]:
numerical = learning.select_dtypes(include=np.number).columns
print(numerical)

### Categorical Features

Categories were defined on import of the csv data. The categories were identified in the dataset dictionary.

In [None]:
categories = learning.select_dtypes(include='category').columns
print(categories)

In [None]:
learning.loc[:,categories].describe()

### Object Features

These features have mixed datatypes. This hints at noisy data and features that will have to be transformed before becoming usable.

In [None]:
objects = learning.select_dtypes(include='object').columns
print(objects)

In [None]:
learning.loc[:,objects].describe()

### Date features
These are imported as floats and will have to be transformed later on to become useful.

In [None]:
dates = learning.loc[:,dl.date_features]
dates.describe()

## Cleaning

We will leverage scikit's transformer classes, and add our own transformers.

sklearn doc:

* http://scikit-learn.org/dev/modules/generated/sklearn.compose.ColumnTransformer.html
* http://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
* http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

### Boolean features

In [None]:
print(dl.boolean_features)

In [6]:
%autoreload 2
bool_transformers = ColumnTransformer([
    ("bool_x_bl",
    BooleanFeatureRecode(value_map={'true': 'X', 'false': ' '}),
    ['PEPSTRFL', 'NOEXCH', 'MAJOR', 'RECINHSE', 'RECP3', 'RECPGVG', 'RECSWEEP']
    ),
    ("bool_y_n",
     BooleanFeatureRecode(value_map={'true': 'Y', 'false': 'N'}),
     ['COLLECT1', 'VETERANS', 'BIBLE', 'CATLG', 'HOMEE', 'PETS','CDPLAY', 'STEREO',
      'PCOWNERS', 'PHOTO', 'CRAFTS', 'FISHER', 'GARDENIN',  'BOATS', 'WALKER', 'KIDSTUFF',
      'CARDS', 'PLATES']
    ),
    ("bool_e_i",
     BooleanFeatureRecode(value_map={'true': "E", 'false': 'I'}),
     ['AGEFLAG']
    ),
    ("bool_h_u",
     BooleanFeatureRecode(value_map={'true': "H", 'false': 'U'}),
     ['HOMEOWNR']),
    ("bool_b_bl",
     BooleanFeatureRecode(value_map={'true': 'B', 'false': ' '}),
     ['MAILCODE']
    ),
    ("bool_1_0",
     BooleanFeatureRecode(value_map={'true': '1', 'false': '0'}),
     ['HPHONE_D']
    )
])

In [7]:
booleans = bool_transformers.fit_transform(learning)

In [8]:
# Keep only the part of each name after the first "__" separator.
# Splitting leaves a name without that separator untouched, whereas slicing
# from find() + 2 would cut off its first character (find() returns -1).
feature_names = [n.split('__', 1)[-1] for n in bool_transformers.get_feature_names()]

In [9]:
bools = pd.DataFrame(data=booleans, columns=feature_names,index=learning.index)

In [10]:
bools.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95412 entries, 95515 to 185114
Data columns (total 29 columns):
PEPSTRFL    95412 non-null bool
NOEXCH      95412 non-null bool
MAJOR       95412 non-null bool
RECINHSE    95412 non-null bool
RECP3       95412 non-null bool
RECPGVG     95412 non-null bool
RECSWEEP    95412 non-null bool
COLLECT1    95412 non-null bool
VETERANS    95412 non-null bool
BIBLE       95412 non-null bool
CATLG       95412 non-null bool
HOMEE       95412 non-null bool
PETS        95412 non-null bool
CDPLAY      95412 non-null bool
STEREO      95412 non-null bool
PCOWNERS    95412 non-null bool
PHOTO       95412 non-null bool
CRAFTS      95412 non-null bool
FISHER      95412 non-null bool
GARDENIN    95412 non-null bool
BOATS       95412 non-null bool
WALKER      95412 non-null bool
KIDSTUFF    95412 non-null bool
CARDS       95412 non-null bool
PLATES      95412 non-null bool
AGEFLAG     95412 non-null bool
HOMEOWNR    95412 non-null bool
MAILCODE    95412 non-

In [11]:
# Write the recoded columns back under their own names. Keying on bools.columns
# keeps every column paired with its own data even if dl.boolean_features lists
# the features in a different order than the transformers produced them.
learning[list(bools.columns)] = bools

### Zipcode

In [12]:
zip_transformer = ColumnTransformer([
    ("truncate_zip",
     ZipCodeFormatter(),
     ['ZIP']
    )
])

In [13]:
# NOTE: the name `zip` shadows Python's builtin zip() from this cell onwards.
zip = zip_transformer.fit_transform(learning)

In [14]:
learning.ZIP.head()

CONTROLN
95515     61081
148535    91326
15078     27017
172556    95953
7112      33176
Name: ZIP, dtype: object

In [15]:
zip[0:5]

array([['61081'],
       ['91326'],
       ['27017'],
       ['95953'],
       ['33176']], dtype=object)

In [16]:
# Replace the ZIP column with the transformer output.
learning.ZIP = zip
# NOTE(review): an integer dtype cannot keep leading zeros -- confirm that is acceptable for ZIP codes.
learning.ZIP = learning.ZIP.astype("int", copy=False)

### Categories

uses category_encoders from: https://contrib.scikit-learn.org/categorical-encoding/index.html

In [None]:
print(dl.categorical_features)

In [17]:
import category_encoders as ce

learning.select_dtypes(include="category").columns

Index(['OSOURCE', 'TCODE', 'STATE', 'PVASTATE', 'DOMAIN', 'CLUSTER', 'CHILD03',
       'CHILD07', 'CHILD12', 'CHILD18', 'GENDER', 'WEALTH1', 'DATASRCE',
       'SOLP3', 'SOLIH', 'WEALTH2', 'GEOCODE', 'LIFESRC', 'RFA_2R', 'RFA_2F',
       'RFA_2A', 'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2'],
      dtype='object')

We will also have to add the multibyte features:


In [18]:
dl.nominal_features

['RFA_3',
 'RFA_4',
 'RFA_5',
 'RFA_6',
 'RFA_7',
 'RFA_8',
 'RFA_9',
 'RFA_10',
 'RFA_11',
 'RFA_12',
 'RFA_13',
 'RFA_14',
 'RFA_15',
 'RFA_16',
 'RFA_17',
 'RFA_18',
 'RFA_19',
 'RFA_20',
 'RFA_21',
 'RFA_22',
 'RFA_23',
 'RFA_24']

In [18]:
multibyte_transformer = ColumnTransformer([
    ("rfa_spread",
    MultiByteExtract(["R", "F", "A"]),
    dl.nominal_features)
])

In [19]:
multibytes = multibyte_transformer.fit_transform(learning)
# Keep only the part of each name after the first "__" separator.
# Splitting leaves a name without that separator untouched, whereas slicing
# from find() + 2 would cut off its first character (find() returns -1).
feature_names = [n.split('__', 1)[-1] for n in multibyte_transformer.get_feature_names()]

In [20]:
print(feature_names)

['RFA_3_R', 'RFA_3_F', 'RFA_3_A', 'RFA_4_R', 'RFA_4_F', 'RFA_4_A', 'RFA_5_R', 'RFA_5_F', 'RFA_5_A', 'RFA_6_R', 'RFA_6_F', 'RFA_6_A', 'RFA_7_R', 'RFA_7_F', 'RFA_7_A', 'RFA_8_R', 'RFA_8_F', 'RFA_8_A', 'RFA_9_R', 'RFA_9_F', 'RFA_9_A', 'RFA_10_R', 'RFA_10_F', 'RFA_10_A', 'RFA_11_R', 'RFA_11_F', 'RFA_11_A', 'RFA_12_R', 'RFA_12_F', 'RFA_12_A', 'RFA_13_R', 'RFA_13_F', 'RFA_13_A', 'RFA_14_R', 'RFA_14_F', 'RFA_14_A', 'RFA_15_R', 'RFA_15_F', 'RFA_15_A', 'RFA_16_R', 'RFA_16_F', 'RFA_16_A', 'RFA_17_R', 'RFA_17_F', 'RFA_17_A', 'RFA_18_R', 'RFA_18_F', 'RFA_18_A', 'RFA_19_R', 'RFA_19_F', 'RFA_19_A', 'RFA_20_R', 'RFA_20_F', 'RFA_20_A', 'RFA_21_R', 'RFA_21_F', 'RFA_21_A', 'RFA_22_R', 'RFA_22_F', 'RFA_22_A', 'RFA_23_R', 'RFA_23_F', 'RFA_23_A', 'RFA_24_R', 'RFA_24_F', 'RFA_24_A']


In [25]:
rfa = pd.DataFrame(data=multibytes, columns=feature_names,index=learning.index).astype("category")

In [22]:
# Preview the first rows only instead of rendering the whole frame.
rfa.head(10)

Unnamed: 0_level_0,RFA_3_R,RFA_3_F,RFA_3_A,RFA_4_R,RFA_4_F,RFA_4_A,RFA_5_R,RFA_5_F,RFA_5_A,RFA_6_R,...,RFA_21_A,RFA_22_R,RFA_22_F,RFA_22_A,RFA_23_R,RFA_23_F,RFA_23_A,RFA_24_R,RFA_24_F,RFA_24_A
CONTROLN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
95515,S,4,E,S,4,E,S,4,E,S,...,E,S,4,E,S,4,E,S,4,E
148535,A,2,G,A,2,G,A,2,G,A,...,E,N,1,E,,,,F,1,E
15078,S,4,E,S,4,E,S,4,E,S,...,,S,4,D,S,4,D,S,3,D
172556,S,4,E,S,4,E,S,4,E,S,...,D,A,1,D,,,,,,
7112,A,2,F,A,2,F,A,2,F,A,...,D,I,4,E,A,3,D,A,3,D
47784,A,1,F,A,1,F,A,1,F,A,...,E,N,2,E,,,,F,1,E
62117,A,1,E,A,1,E,,,,A,...,E,A,2,E,A,3,E,A,3,E
109359,A,3,E,A,3,E,A,3,E,A,...,D,N,2,D,,,,F,1,D
75768,A,1,F,A,1,F,A,1,F,A,...,F,A,1,F,A,1,F,A,1,F
49909,A,1,F,A,1,F,,,,A,...,E,N,2,E,,,,,,


In [26]:
# Attach the spread RFA columns to `learning`, matching rows on the shared index name.
# NOTE: not idempotent -- this cell rebinds `learning` and removes the
# dl.nominal_features columns, so a second run no longer finds those columns to drop.
learning = learning.merge(rfa, on=learning.index.name, copy=False)
# The original multibyte columns are superseded by the per-byte columns merged above.
learning = learning.drop(dl.nominal_features, axis=1)

In [27]:
list(learning.select_dtypes(include="category").columns)

['OSOURCE',
 'TCODE',
 'STATE',
 'PVASTATE',
 'DOMAIN',
 'CLUSTER',
 'CHILD03',
 'CHILD07',
 'CHILD12',
 'CHILD18',
 'GENDER',
 'WEALTH1',
 'DATASRCE',
 'SOLP3',
 'SOLIH',
 'WEALTH2',
 'GEOCODE',
 'LIFESRC',
 'RFA_2R',
 'RFA_2F',
 'RFA_2A',
 'MDMAUD_R',
 'MDMAUD_F',
 'MDMAUD_A',
 'GEOCODE2',
 'RFA_3_R',
 'RFA_3_F',
 'RFA_3_A',
 'RFA_4_R',
 'RFA_4_F',
 'RFA_4_A',
 'RFA_5_R',
 'RFA_5_F',
 'RFA_5_A',
 'RFA_6_R',
 'RFA_6_F',
 'RFA_6_A',
 'RFA_7_R',
 'RFA_7_F',
 'RFA_7_A',
 'RFA_8_R',
 'RFA_8_F',
 'RFA_8_A',
 'RFA_9_R',
 'RFA_9_F',
 'RFA_9_A',
 'RFA_10_R',
 'RFA_10_F',
 'RFA_10_A',
 'RFA_11_R',
 'RFA_11_F',
 'RFA_11_A',
 'RFA_12_R',
 'RFA_12_F',
 'RFA_12_A',
 'RFA_13_R',
 'RFA_13_F',
 'RFA_13_A',
 'RFA_14_R',
 'RFA_14_F',
 'RFA_14_A',
 'RFA_15_R',
 'RFA_15_F',
 'RFA_15_A',
 'RFA_16_R',
 'RFA_16_F',
 'RFA_16_A',
 'RFA_17_R',
 'RFA_17_F',
 'RFA_17_A',
 'RFA_18_R',
 'RFA_18_F',
 'RFA_18_A',
 'RFA_19_R',
 'RFA_19_F',
 'RFA_19_A',
 'RFA_20_R',
 'RFA_20_F',
 'RFA_20_A',
 'RFA_21_R',
 'RFA_21_F',


In [None]:
print([f for f in learning.columns if "RFA" in f])

In [None]:
# Fill gaps with the literal "missing", then one-hot encode via category_encoders.
impute_step = ("imputer", SimpleImputer(strategy="constant", fill_value="missing"))
encode_step = ("one_hot", ce.OneHotEncoder(drop_invariant=True, impute_missing=True,
                                           use_cat_names=True, return_df=True))
cat_pipe = Pipeline([impute_step, encode_step])

# Apply the pipeline to every column currently typed as pandas category.
category_columns = learning.select_dtypes(include="category").columns.tolist()
categories_transformer = ColumnTransformer([
    ("cat_encoder", cat_pipe, category_columns)
])

In [None]:
categories = categories_transformer.fit_transform(learning)

In [None]:
categories.shape
# TODO: How do we get the feature names needed to construct the DataFrame to pass back?

In [None]:
cat_pipe.named_steps['one_hot']

In [28]:
learning.select_dtypes(include="object").columns

Index(['RECSWEEP'], dtype='object')

## Digging through the data

In [None]:
# Use the builtin float: np.float was only an alias for it, deprecated in
# NumPy 1.20 and removed in 1.24, so the selection is unchanged.
learning.select_dtypes(include=float).hist(bins=50, figsize=(50,50))
save_fig("float_feature_histograms")

### Some promising features and their impact on the label

In [None]:
%matplotlib inline
sns.catplot(x="WEALTH2", y="TARGET_D", hue="MAJOR",
            kind="violin", inner="stick", split=True, data=learning);

### Income, Wealth and donations

In [None]:
sns.violinplot(x="INCOME", y="TARGET_D", palette="pastel", data=learning);

In [None]:
sns.violinplot(x="WEALTH1", y="TARGET_D", palette="pastel", data=learning);

### Interests and donations

In [None]:
# NOTE(review): `learning_raw` is not defined anywhere in the visible notebook --
# presumably the dataset as loaded, before the cleaning steps; confirm and define it explicitly.
# Missing values in the interest features and TARGET_D are replaced by 0.
data = learning_raw.loc[:,dl.interest_features+["TARGET_D"]].fillna(0)
# Unpivot the interest features into long format; `interests` is not used again in the visible cells.
interests = pd.melt(data,value_vars=dl.interest_features, value_name="Interest")
data.head()

Features with constant values:

In [None]:
# Count distinct values per feature (axis=0); axis=1 would count them per row instead.
# NOTE(review): `learning_raw` is not defined in the visible notebook -- confirm its origin.
distinct_per_feature = learning_raw.nunique(axis=0)
# Features with at most one distinct non-missing value are constant.
distinct_per_feature[distinct_per_feature <= 1]

### Individual feature properties

Value range, distribution, outliers

### Correlations

-> Pearson product-moment correlation (the default method of `DataFrame.corr`)

In [None]:
# calculate the correlation matrix
corr = learning_raw.drop(['TARGET_B','TARGET_D'],axis=1).corr()

In [None]:
# plot the heatmap
# Generate a mask for the upper triangle (diagonal included).
# The builtin bool replaces np.bool, an alias deprecated in NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(12, 12))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.8, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

### Target variable (labels)

In [None]:
%matplotlib inline
sns.catplot(x="WEALTH2", y="TARGET_D", hue="MAJOR",
            kind="violin", inner="stick", split=True,
            palette="pastel", data=learning);

In [None]:
sns.catplot(x="CLUSTER", y="TARGET_D", kind="box", data=learning);

In [None]:
%matplotlib inline
sns.distplot(learning.loc[learning.TARGET_D > 0.0, 'TARGET_D'], bins=50, kde=False, rug=True);

### US census data

In [None]:
us_census = ["POP901", "POP902", "POP903", "POP90C1", "POP90C2", "POP90C3", "POP90C4", "POP90C5", "ETH1", "ETH2", "ETH3", "ETH4", "ETH5", "ETH6", "ETH7", "ETH8", "ETH9", "ETH10", "ETH11", "ETH12", "ETH13", "ETH14", "ETH15", "ETH16", "AGE901", "AGE902", "AGE903", "AGE904", "AGE905", "AGE906", "AGE907", "CHIL1", "CHIL2", "CHIL3", "AGEC1", "AGEC2", "AGEC3", "AGEC4", "AGEC5", "AGEC6", "AGEC7", "CHILC1", "CHILC2", "CHILC3", "CHILC4", "CHILC5", "HHAGE1", "HHAGE2", "HHAGE3", "HHN1", "HHN2", "HHN3", "HHN4", "HHN5", "HHN6", "MARR1", "MARR2", "MARR3", "MARR4", "HHP1", "HHP2", "DW1", "DW2", "DW3", "DW4", "DW5", "DW6", "DW7", "DW8", "DW9", "HV1", "HV2", "HV3", "HV4", "HU1", "HU2", "HU3", "HU4", "HU5", "HHD1", "HHD2", "HHD3", "HHD4", "HHD5", "HHD6", "HHD7", "HHD8", "HHD9", "HHD10", "HHD11", "HHD12", "ETHC1", "ETHC2", "ETHC3", "ETHC4", "ETHC5", "ETHC6", "HVP1", "HVP2", "HVP3", "HVP4", "HVP5", "HVP6", "HUR1", "HUR2", "RHP1", "RHP2", "RHP3", "RHP4", "HUPA1", "HUPA2", "HUPA3", "HUPA4", "HUPA5", "HUPA6", "HUPA7", "RP1", "RP2", "RP3", "RP4", "MSA", "ADI", "DMA", "IC1", "IC2", "IC3", "IC4", "IC5", "IC6", "IC7", "IC8", "IC9", "IC10", "IC11", "IC12", "IC13", "IC14", "IC15", "IC16", "IC17", "IC18", "IC19", "IC20", "IC21", "IC22", "IC23", "HHAS1", "HHAS2", "HHAS3", "HHAS4", "MC1", "MC2", "MC3", "TPE1", "TPE2", "TPE3", "TPE4", "TPE5", "TPE6", "TPE7", "TPE8", "TPE9", "PEC1", "PEC2", "TPE10", "TPE11", "TPE12", "TPE13", "LFC1", "LFC2", "LFC3", "LFC4", "LFC5", "LFC6", "LFC7", "LFC8", "LFC9", "LFC10", "OCC1", "OCC2", "OCC3", "OCC4", "OCC5", "OCC6", "OCC7", "OCC8", "OCC9", "OCC10", "OCC11", "OCC12", "OCC13", "EIC1", "EIC2", "EIC3", "EIC4", "EIC5", "EIC6", "EIC7", "EIC8", "EIC9", "EIC10", "EIC11", "EIC12", "EIC13", "EIC14", "EIC15", "EIC16", "OEDC1", "OEDC2", "OEDC3", "OEDC4", "OEDC5", "OEDC6", "OEDC7", "EC1", "EC2", "EC3", "EC4", "EC5", "EC6", "EC7", "EC8", "SEC1", "SEC2", "SEC3", "SEC4", "SEC5", "AFC1", "AFC2", "AFC3", "AFC4", "AFC5", "AFC6", "VC1", "VC2", "VC3", "VC4", "ANC1", "ANC2", "ANC3", 
"ANC4", "ANC5", "ANC6", "ANC7", "ANC8", "ANC9", "ANC10", "ANC11", "ANC12", "ANC13", "ANC14", "ANC15", "POBC1", "POBC2", "LSC1", "LSC2", "LSC3", "LSC4", "VOC1", "VOC2", "VOC3", "HC1", "HC2", "HC3", "HC4", "HC5", "HC6", "HC7", "HC8", "HC9", "HC10", "HC11", "HC12", "HC13", "HC14", "HC15", "HC16", "HC17", "HC18", "HC19", "HC20", "HC21", "MHUC1", "MHUC2", "AC1", "AC2"]
len(us_census)

## Feature Selection
Meant to reduce dimensionality by selecting only features that are 'interesting enough' to be considered in order to boost performance of calculations / improve accuracy of the estimator
- By variance threshold
- Recursive Feature Elimination by Cross-Validation
- L1-based feature selection (Logistic Regression, Lasso, SVM)
- Tree-based feature selection

See [scikit-learn: feature selection](http://scikit-learn.org/stable/modules/feature_selection.html#feature-selection)

### Removing constant features (zero variance)

In [None]:
# Collect every feature whose values (missing values included) are all the same,
# then print the names one per line.
constant_features = [
    name for name in learning.columns
    if len(learning[name].unique()) == 1
]
for name in constant_features:
    print(name)

### Sparse Features

In [None]:
# Share of rows the most frequent value must exceed for a feature to count as sparse.
SPARSE_TOP_FREQ_THRESHOLD = 0.995

sparse_features = []
for column in learning:
    # Compute the relative frequencies once; value_counts returns them in descending order.
    frequencies = learning[column].value_counts(normalize=True)
    if frequencies.empty:
        # A column with only missing values has nothing to count; .iloc[0] would raise IndexError.
        continue
    top_freq = frequencies.iloc[0]
    if top_freq > SPARSE_TOP_FREQ_THRESHOLD:
        sparse_features.append(column)
        print(column+" has a top frequency of: " + str(top_freq))
        print(frequencies)

In [None]:
sparse_features

## Feature Extraction
All explanatory fields have to be numerical for the subsequent operations with scikit-learn. Here, the necessary feature extractions are performed.

See [scikit-learn: feature extraction](http://scikit-learn.org/stable/modules/feature_extraction.html)

In [None]:
import pandas as pd

In [None]:
# NOTE(review): `tds` and `learning_raw` are not defined before this cell in the visible
# notebook (`tds` is only imported in a cell further down) -- confirm the intended run order.
symbolic_features = [
    tds.SymbolicFeatureSpreader("DOMAIN", ["U", "S"])  # Urbanicity, SocioEconomicStatus
]
# RFA_2 is already spread out
symbolic_features.extend(
    tds.SymbolicFeatureSpreader("RFA_" + str(i), ["R", "F", "A"])  # Recency, Frequency, Amount
    for i in range(3, 25)
)

# Collect the spread frames and concatenate once instead of growing a DataFrame
# inside the loop. The empty frame goes first so the result carries
# learning_raw's index, as before.
spread_frames = [pd.DataFrame(index=learning_raw.index)]
for f in symbolic_features:
    f.set_tidy_dataset_ref(learning_raw)
    spread_frames.append(f.spread(inplace=False))
spread_multibyte = pd.concat(spread_frames, axis=1)

In [None]:
spread_multibyte.info()

# PCA

A first look at important features

In [None]:
from sklearn import decomposition

In [None]:
X = learning.drop(["TARGET_B","TARGET_D"],axis=1)

In [None]:
n_comp = 3
# Fix the seed so the projection is reproducible across runs: depending on the
# input size, PCA may choose a randomized SVD solver, which uses random_state.
pca = decomposition.PCA(n_components=n_comp, random_state=42)
pca.fit(X)
# One column per principal component, aligned with the rows of X.
result = pd.DataFrame(pca.transform(X), columns=["PCA%i" % i for i in range(n_comp)], index=X.index)

In [None]:
import cProfile
# NOTE(review): `tds` is only imported in a later cell of the visible notebook -- confirm run order.
domain_spreader = tds.SymbolicFieldToDummies(learning,"RFA_24",["Recency", "Frequency", "Amount"])
# Profile the spread step; the report is sorted by time.
cProfile.run('domain_spreader.spread()', sort='time')

In [None]:
learning.head()

In [None]:
import os
import sys

import numpy as np

# Make the parent of the current working directory importable, adding it to
# sys.path only once so re-running the cell does not create duplicates.
proj_dir = os.path.dirname(os.getcwd())
if proj_dir not in sys.path:
    sys.path.append(proj_dir)

In [None]:
import eda.tidy_dataset as tds
tidy = tds.TidyDataset("cup98LRN.txt")

In [None]:
raw = tidy.get_raw_data()

In [None]:
spreader = tds.SymbolicFieldToDummies(
    raw, "RFA_24", ["Recency", "Frequency", "Amount"])
spreader.spread()