<img src="https://kaggle2.blob.core.windows.net/competitions/kaggle/4654/logos/front_page.png"/>
# <span style="color:blue;text-align:center;">Trip Type Classification: v2 Feature Engineering</span>

Walmart uses both art and science to continually make progress on their core mission of better understanding and serving their customers. One way Walmart is able to improve customers' shopping experiences is by segmenting their store visits into different trip types.
<img src="https://kaggle2.blob.core.windows.net/competitions/kaggle/4654/media/walmart_triptypes640.png"/>

## Import Packages

In [1]:
using DataFrames
using MLBase
using Gadfly

  likely near /Users/diego/.julia/v0.4/MLBase/src/modeltune.jl:5
  likely near /Users/diego/.julia/v0.4/MLBase/src/modeltune.jl:5
  likely near /Users/diego/.julia/v0.4/MLBase/src/modeltune.jl:5
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:104
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:105
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:163
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:163
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:163


## Load Data

In [2]:
# Read the gzipped training and test CSVs, then stack them into one table so that
# label maps and value sets can be built over both.
# NOTE(review): presumably test.csv.gz has no TripType column, leaving NA in `full`
# for those rows (TripType NAs in `full` are filled in a later cell) — confirm.
train = readtable("data/train.csv.gz")
test = readtable("data/test.csv.gz")
full = vcat(train, test);

## Visualize Sample Data

In [None]:
# Print the dimensions of each table.
@show size(train)
@show size(test)
@show size(full);

In [None]:
# Preview the first rows of the combined table.
head(full)

In [None]:
# Print a column-by-column overview of the training and test tables.
showcols(train)
showcols(test)

## Data Uniqueness

In [None]:
# For every column of `full`, compute the share of distinct values among its non-NA
# entries and draw the shares as a horizontal bar chart, one bar per column.
let column_names = names(full)
    distinct_share = map(column_names) do c
        present = dropna(full[c])
        length(Set(present)) / length(present)
    end
    plot(x=distinct_share, y=column_names, Geom.bar(orientation=:horizontal))
end

## Handle Missing Data

In [3]:
# Overwrite every NA entry of `df[column]` with `value`, modifying `df` in place.
# Returns `value`.
function apply_default_null_column!(df, column, value)
    missing_rows = isna(df[column])
    df[missing_rows, column] = value
    return value
end;

In [4]:
# Missing Upc and FinelineNumber entries become -1 in all three tables
# (Upc first for every table, then FinelineNumber).
for column in (:Upc, :FinelineNumber), df in (train, test, full)
    apply_default_null_column!(df, column, -1)
end

# Missing TripType entries of the combined table are replaced by the median of the
# TripType values that are present.
apply_default_null_column!(full, :TripType, median(dropna(full[:TripType])));

## Feature Encoding

In [5]:
# Columns that get a label map and are label-encoded in the cells below.
categorical_features = [:TripType, :Weekday, :Upc, :DepartmentDescription, :FinelineNumber];

In [6]:
# Replace `df[column]` with its label-encoded form under the label map `label`,
# modifying `df` in place. Returns the encoded column.
function apply_encoding!(df, column, label)
    encoded = labelencode(label, df[column])
    df[column] = encoded
    return encoded
end

apply_encoding! (generic function with 1 method)

In [7]:
# One label map per categorical column, built from the non-NA values of the combined
# table so that train and test share the same encoding.
# Uses the array-of-pairs form recommended by the deprecation warning that the old
# `Dict({... for ...})` brace-comprehension syntax triggers.
labels = Dict{Any,Any}([
    column => labelmap(convert(Array, dropna(full[column])))
    for column in categorical_features
]);


Use "Dict{Any,Any}([a=>b for (a,b) in c])" instead.


In [8]:
# Label-encode the categorical columns of each table with the shared label maps.
for column in categorical_features
    apply_encoding!(train, column, labels[column])
end

# TripType is skipped for the test table.
for column in setdiff(categorical_features, [:TripType])
    apply_encoding!(test, column, labels[column])
end

for column in categorical_features
    apply_encoding!(full, column, labels[column])
end

## Feature Engineering 

In [12]:
# Distinct values of each column across the combined table; these sets are the value
# lists passed to add_relationship_feature! in later cells.
# NOTE(review): the narrow integer element types assume every (encoded) value fits in
# Int32 / Int16 / Int8 respectively — confirm against the label counts.
all_upc = Set{Int32}(full[:Upc])
all_fineline_number = Set{Int16}(full[:FinelineNumber])
all_departments = Set{Int8}(full[:DepartmentDescription]);

In [30]:
# Add one indicator column per entry of `all_possible_values_list` to `df`, in place.
#
# For each `value`, a column named `<column>_<value>` is added whose entries are
# Int8 1 on the rows where `df[column]` equals `value` and Int8 0 on all other rows.
#
# Arguments:
#   df                       - table to extend; must contain `column`
#   column                   - name (Symbol) of the source column
#   all_possible_values_list - iterable of values to create indicator columns for
#
# Returns `df`.
#
# The indicators are computed row by row from `df[column]`, so each new column is
# aligned with the existing rows of `df` regardless of how the rows are ordered, and
# an empty `df` simply receives empty columns.
# NOTE(review): assumes `df[column]` contains no NA entries — confirm against callers.
function add_relationship_feature!(df, column, all_possible_values_list)
    observed = df[column]

    for value in all_possible_values_list
        feature_name = symbol(string(column, "_", value))
        df[feature_name] = Int8[v == value ? 1 : 0 for v in observed]
    end

    return df
end

add_relationship_feature! (generic function with 1 method)

### Remove Unknown Products without Motivation to Return

In [15]:
# Keep only the rows whose ScanCount is different from -1, in all three tables.
# NOTE(review): the heading describes this as removing unknown products; confirm that
# a ScanCount of -1 is the intended marker for those rows.
train = train[train[:ScanCount] .!= -1, :]
test = test[test[:ScanCount] .!= -1, :]
full = full[full[:ScanCount] .!= -1, :];

### Selected Features

In [19]:
# Columns kept in the tables from here on. group_by_visit_number also reads this
# global: columns NOT listed here are the ones it sums per visit.
selected_features = [:VisitNumber, :Weekday, :DepartmentDescription, :FinelineNumber, :TripType];

In [21]:
# Restrict each table to the selected columns; every other column is dropped.
# The test and combined tables are selected without TripType.
train = train[:, selected_features]
test = test[:, setdiff(selected_features, [:TripType])]
full = full[:, setdiff(selected_features, [:TripType])];

### Add UPC as Relational Feature

In [None]:
# Add one indicator column per known Upc value to each table.
# NOTE(review): :Upc is not listed in selected_features, so after the column selection
# in the "Selected Features" cells these tables no longer appear to have a :Upc
# column and this cell would fail if run — confirm whether it is intentionally unused.
add_relationship_feature!(train, :Upc, all_upc)
add_relationship_feature!(test, :Upc, all_upc)
add_relationship_feature!(full, :Upc, all_upc);

### Add FinelineNumber as Relational Feature

In [31]:
# Add one indicator column per known FinelineNumber value to each table, in place.
for df in (train, test, full)
    add_relationship_feature!(df, :FinelineNumber, all_fineline_number)
end

### Add Department as Relational Feature

In [32]:
# Add one indicator column per known DepartmentDescription value to each table, in place.
for df in (train, test, full)
    add_relationship_feature!(df, :DepartmentDescription, all_departments)
end

### Group Data by VisitNumber

In [43]:
# Collapse `df` to one row per distinct :VisitNumber and return the result as a new
# DataFrame with the same column names and column order as `df`.
#
# Within each visit:
#   - columns listed in the global `selected_features` keep the value from the
#     visit's first row;
#   - every other column is replaced by the sum of its values over the visit.
#
# All values are stored as Float32, so every column of `df` must be numeric.
# `df` itself is not modified.
#
# The output matrix is preallocated as (visits x columns) and filled cell by cell, so
# the row/column layout does not depend on how a one-row table converts to an array.
function group_by_visit_number(df)
    column_names = names(df)
    cols_to_apply = setdiff(column_names, selected_features)
    # Decide once per column whether it is summed; avoids a lookup per cell.
    is_summed = Bool[name in cols_to_apply for name in column_names]

    visits = groupby(df, :VisitNumber)
    instances = zeros(Float32, length(visits), length(column_names))

    for (i, subdf) in enumerate(visits)
        for (j, name) in enumerate(column_names)
            entries = subdf[name]
            instances[i, j] = is_summed[j] ? sum(entries) : entries[1]
        end
    end

    instances_df = convert(DataFrame, instances)
    names!(instances_df, column_names)

    return instances_df
end

group_by_visit_number (generic function with 1 method)

In [41]:
# Aggregate the training table to one row per visit.
# NOTE(review): the error recorded below this cell carries execution count 41, while
# the cell defining group_by_visit_number carries 43 — presumably this cell was run
# before the definition; re-run after defining the function.
train_aggregated = group_by_visit_number(train)

LoadError: LoadError: UndefVarError: group_by_visit_number not defined
while loading In[41], in expression starting on line 1

### Delete Outdated Features

In [38]:
# Drop the raw source columns from each table when they are still present.
# Tables are processed in the order train, test, full; within a table the columns
# are checked in the order Upc, FinelineNumber, DepartmentDescription.
for df in (train, test, full), column in (:Upc, :FinelineNumber, :DepartmentDescription)
    column in names(df) && delete!(df, column)
end

### Store Engineered Features

In [None]:
# Write each table to its own tab-separated file under data/.
for (path, df) in (("data/train_featured_fn_x_dept_rship.tsv", train),
                   ("data/test_featured_fn_x_dept_rship.tsv", test),
                   ("data/full_featured_fn_x_dept_rship.tsv", full))
    writetable(path, df, separator='\t')
end