<img src="https://kaggle2.blob.core.windows.net/competitions/kaggle/4654/logos/front_page.png"/>
# <span style="color:blue;text-align:center;">Trip Type Classification: v3 Baeysian Model</span>

Walmart uses both art and science to continually make progress on their core mission of better understanding and serving their customers. One way Walmart is able to improve customers' shopping experiences is by segmenting their store visits into different trip types.
<img src="https://kaggle2.blob.core.windows.net/competitions/kaggle/4654/media/walmart_triptypes640.png"/>

## Import Packages

In [1]:
using DataFrames
using MLBase
using Distances

  likely near /Users/diego/.julia/v0.4/MLBase/src/modeltune.jl:5
  likely near /Users/diego/.julia/v0.4/MLBase/src/modeltune.jl:5
  likely near /Users/diego/.julia/v0.4/MLBase/src/modeltune.jl:5
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:104
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:105
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:163
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:163
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:163


## Load Data

In [2]:
train = readtable("data/train_shallow_featured.tsv", separator='\t')
test = readtable("data/test_shallow_featured.tsv", separator='\t')
full = vcat(train, test)

original_full = vcat(readtable("data/train.csv.gz"), readtable("data/test.csv.gz"));

## Feature Encoding

In [3]:
categorical_features = [:TripType, :Weekday, :Upc, :DepartmentDescription, :FinelineNumber];

In [4]:
labels = Dict({column => labelmap(convert(Array, dropna(full[column]))) for column in categorical_features});


Use "Dict{Any,Any}([a=>b for (a,b) in c])" instead.


# Train Model

In [5]:
train = train[:, :]
test = test[:, :];

In [6]:
features = [:Weekday, :Upc, :ScanCount, :DepartmentDescription, :FinelineNumber]
label = :TripType;

In [7]:
function split_train_val(df; train_size=.85, random_state=1)
    srand(random_state)
    nrows = size(df, 1)
    indexes = shuffle(collect(1:nrows))
    train = df[indexes[1:round(Int, nrows*train_size)], :]
    validation = df[indexes[(round(Int, nrows*train_size)+1):end], :] 
    return train, validation
end

split_train_val (generic function with 1 method)

### Prepare Training Data

In [78]:
all_upc = Set{Int32}(full[:Upc])
all_fineline_number = Set{Int32}(full[:FinelineNumber])
all_trip_type = convert(Array{Int16, 1}, sort(collect(Set(dropna(original_full[:TripType])))));

In [9]:
X_train, X_val = split_train_val(train, train_size=.85, random_state=1)
X_test = test

sort!(X_train, cols=[:VisitNumber])
sort!(X_val, cols=[:VisitNumber])
sort!(X_test, cols=[:VisitNumber]);

### Train Bayesian Model

In [10]:
type BayesianModel{S <: AbstractString, T <: Real}
    instances::Dict{S, Array{T, 1}}
    prob_class::Dict{S, Array{T, 1}}
    
    column::Symbol
    all_values::Set{Int32}
    
    BayesianModel(instances, prob_class, column, all_values) = new(instances, prob_class, column, all_values)
    BayesianModel(column, all_values) = new(Dict{S, Array{T, 1}}(), Dict{S, Array{T, 1}}(), column, all_values)
end

In [64]:
"""
Training Functions
"""
function fit_bayesian(df, column, all_values)

    model = BayesianModel{AbstractString, Float64}(column, all_values)

    for subdf in groupby(df, :VisitNumber)

        products = Dict{Int32, Float64}()
        products_found = Set{Int32}()
        trip_type_distribution = fill(0.0, length(all_trip_type)) 

        for i = 1:size(subdf, 1)

            product, scan_count = subdf[i, model.column], subdf[i, :ScanCount]
            products[product] = score_scan_count(scan_count)
            scan_count != 0 && push!(products_found, product)

            trip_type_distribution[subdf[i, :TripType]] += 1.0
        end

        key = create_key_from_products(products_found)
        if !haskey(model.instances, key)
            model.instances[key] = fill(0.0, length(model.all_values))
            model.prob_class[key] = fill(0.0, length(all_trip_type))
        end

        model.instances[key] += [get(products, value, 0.0) for value in model.all_values]
        model.prob_class[key] += trip_type_distribution
    end
    
    for key in keys(model.prob_class)
        model.prob_class[key] ./= sum(model.prob_class[key])
    end

    return model
end

create_key_from_products(products) = AbstractString(sort(collect(products)))

"""
Prediction Functions
"""
function predict_bayesian(fitted_model, to_predict_data, k=5)
    
    predicted_data = Array[]
    
    for subdf in groupby(to_predict_data, :VisitNumber)
        instance = get_instance(subdf, fitted_model.column, fitted_model.all_values)
        averaged_prob_classes = fill(0.0, length(all_tr))
        # averaged_prob_classes = get_averaged_prob_classes_similar_instances(fitted_model.instances, 
        #                                                                    fitted_model.prob_class, 
        #                                                                    instance, k)
        push!(predicted_data, vcat(subdf[1, :VisitNumber], averaged_prob_classes))
    end
    
    predicted_df = predicted_data_to_dataframe(predicted_data)
    
    return predicted_df
end

function predict_bayesian(fitted_model, to_predict_data, k=5)
    
    predicted_data = Array{Float64}[]
    
    for subdf in groupby(to_predict_data, :VisitNumber)
        key, instance = get_instance(subdf, fitted_model.column, fitted_model.all_values)
        if haskey(fitted_model.prob_class, key)
            averaged_prob_classes = fitted_model.prob_class[key]
        else
            # averaged_prob_classes = fill(0.0, length(all_trip_type))
            averaged_prob_classes = get_averaged_prob_classes_similar_instances(fitted_model.instances, 
                                                                            fitted_model.prob_class, 
                                                                            instance, k)
        end
        
        push!(predicted_data, vcat(subdf[1, :VisitNumber], averaged_prob_classes))
    end
    
    predicted_df = predicted_data_to_dataframe(predicted_data)
    
    return predicted_df
end

function get_instance(df, column, all_values)
    
    products = Dict{Int32, Float64}()
    products_found = Set{Int32}()

    for i = 1:size(df, 1)
        product, scan_count = df[i, column], df[i, :ScanCount]
        products[product] = score_scan_count(scan_count)
        scan_count != 0 && push!(products_found, product)
    end

    return create_key_from_products(products_found), 
           [get(products, value, 0.0) for value in all_values]
end

score_scan_count(value) = value < 0? log(2) : ( value == 0? 0 : log(value + 1) )
# score_scan_count(value) = value < 0? 1 : ( value == 0? 0 : (value + 1) )

function get_averaged_prob_classes_similar_instances(instances, prob_class, instance, k)
    
    top_similars = get_similars(instances, instance, k)
    selected_prob_classes = Array[prob_class[last(tuple)] for tuple in top_similars]
    
    nrows, ncols = size(selected_prob_classes, 1), size(selected_prob_classes[1], 1)
    selected_prob_classes = reshape(vcat(selected_prob_classes'...), (nrows, ncols))
    
    averaged_prob_class_values = [sum(selected_prob_classes[:, i])/nrows for i = 1:ncols]
    
    return averaged_prob_class_values
end

function get_similars(instances, instance, k)
    similars = [(cosine_dist(instances[key], instance), key) for key in keys(instances)]
    return sort(similars, rev=true)[1:k]
end

function predicted_data_to_dataframe(predicted_data)
    
    nrows, ncols = size(predicted_data, 1), size(predicted_data[1], 1)
    predicted_data = reshape(vcat(predicted_data'...), (nrows, ncols))
    predicted_df = convert(DataFrame, predicted_data)
    names!(predicted_df, vcat(:VisitNumber, map(k -> symbol("TripType_$(round(Int, k))"), all_trip_type)))
    
    return predicted_df
end

calculate_distance_matrix (generic function with 1 method)

In [65]:
@time train_model = fit_bayesian(X_train, :FinelineNumber, all_fineline_number);

263.107127 seconds (15.05 M allocations: 10.713 GB, 9.06% gc time)


### Predict

In [66]:
tic(); predicted_train = predict_bayesian(train_model, X_train, 5); toc();
tic(); predicted_val = predict_bayesian(train_model, X_val, 5); toc();
tic(); predicted_test = predict_bayesian(train_model, X_test, 5); toc();

elapsed time: 214

LoadError: LoadError: InterruptException:
while loading In[66], in expression starting on line 2

## Evaluate Model

1. Evaluate by **Accuracy (Acc)**: $\frac{TP+TN}{TP+FP+FN+TN}$
2. Evaluate by **Multi-Class Logarithmic Loss (MCLL)**: $-1\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{M}\delta_{ij}log(p_{ij})$  
    where N is the number of visits, M is the number of trip type, $y_{ij}$ is the [Kroneckler Delta](https://en.wikipedia.org/wiki/Kronecker_delta) when the observations exists in test file and $p_{ij}$ is the corresponding prediction.

In [96]:
function create_visit_number_dict(groundtruth)
    visit_number_dict = Dict{Int64, Set{Int64}}()
    for i = 1:size(groundtruth, 1)
        visit_number = groundtruth[i, :VisitNumber]
        if !haskey(visit_number_dict, visit_number)
            visit_number_dict[visit_number] = Set{Int64}()
        end
        push!(visit_number_dict[visit_number], groundtruth[i, label])
    end
    
    return visit_number_dict
end

function eval_mcll(groundtruth, df)
    vn_dict = create_visit_number_dict(groundtruth)
    
    epsilon = 1e-15
    total_score_v1 = 0
    total_score_v2 = 0

    N, M = size(df)
    
    for i = 1:N, j = 2:M
        visit_number = df[i, :VisitNumber]
        trip_type = all_trip_type[j-1]
        yhat = df[i, j]
        in_gt = trip_type in vn_dict[visit_number]? 1 : 0
        total_score_v1 += in_gt == 1? max(min(log(yhat), 1-epsilon), epsilon) : 0
        total_score_v2 += in_gt * yhat
    end
    
    return -1/N * total_score_v1, total_score_v2 / N
end

eval_mcll (generic function with 1 method)

In [90]:
eval_mcll_train_v1, eval_mcll_train_v2 = eval_mcll(X_train, predicted_train) 
#eval_mcll_val_v1, eval_mcll_val_v2 = eval_mcll(X_val, predicted_val)
eval_mcll_val_v1, eval_mcll_val_v2 = 0, 0
println("MCLLv1 - Score-Train: $eval_mcll_train_v1\tScore-Val: $eval_mcll_val_v1")
println("MCLLv2 - Score-Train: $eval_mcll_train_v2\tScore-Val: $eval_mcll_val_v2")

MCLLv1 - Score-Train: -6.699462068819703e-16	Score-Val: 0
MCLLv2 - Score-Train: 0.0006019549557655793	Score-Val: 0


## Generate Submission Files

In [40]:
writetable("data/submission_v1_bayesian_model.csv", predicted_test);

## Submit Predictions to Kaggle

v1 Raw Attr. + H. Miss. + Bayesian Model **33.60892** (MCLL-T: 0.040 MCLL-V: 0.082)  
v1 Raw Attr. + H. Miss. + XGBoost **31.74538** (Acc-T: .06 Score-Val: .03 MCLL-T: .2813 MCLL-V: .025)  
v1.1 Raw Attr. + H. Miss. + RF(RFeat5, Trees100, Subs.5) + All Data **33.57726** (Acc-Train: .11	Acc-Val: .07, MCLL-Train: .353 MCLL-Val: .079)  
v1 Raw Attr. + H. Miss. + RF(RFeat5, Trees100, Subs.5) + SubSample **34.13327** (Acc - Score-Train: 0.32	Score-Val: 0.15)