<img src="https://kaggle2.blob.core.windows.net/competitions/kaggle/4654/logos/front_page.png"/>
# <span style="color:blue;text-align:center;">Trip Type Classification: v1 Initial Steps</span>

Walmart uses both art and science to continually make progress on their core mission of better understanding and serving their customers. One way Walmart is able to improve customers' shopping experiences is by segmenting their store visits into different trip types.
<img src="https://kaggle2.blob.core.windows.net/competitions/kaggle/4654/media/walmart_triptypes640.png"/>

## Import Packages

In [5]:
using DataFrames
using XGBoost
using MLBase
using DecisionTree
using Gadfly

  likely near /Users/diego/.julia/v0.4/MLBase/src/modeltune.jl:5
  likely near /Users/diego/.julia/v0.4/MLBase/src/modeltune.jl:5
  likely near /Users/diego/.julia/v0.4/MLBase/src/modeltune.jl:5
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:104
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:105
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:163
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:163
  likely near /Users/diego/.julia/v0.4/MLBase/src/deprecated/datapre.jl:163


## Load Data

In [19]:
train = readtable("data/train.csv.gz")
test = readtable("data/test.csv.gz")
full = vcat(train, test);



## Visualize Sample Data

In [None]:
@show size(train)
@show size(test)
@show size(full);

In [None]:
head(full)

In [None]:
showcols(train)
showcols(test)

## Data Uniquiness

In [None]:
plot(x=map(c -> length(Set(dropna(full[c])))/length(dropna(full[c])), names(full)), 
     y=names(full), Geom.bar(orientation=:horizontal))

## Handle Missing Data

In [None]:
apply_default_null_column!(df, column, value) = df[isna(df[column]), column] = value;

In [None]:
[apply_default_null_column!(df, :Upc, -1) for df in [train, test, full]]
[apply_default_null_column!(df, :FinelineNumber, -1) for df in [train, test, full]]
apply_default_null_column!(full, :TripType, median(dropna(full[:TripType])));

## Feature Encoding

In [16]:
categorical_features = [:TripType, :Weekday, :Upc, :DepartmentDescription, :FinelineNumber];

In [17]:
apply_encoding!(df, column, label) = df[column] = labelencode(label, df[column])

apply_encoding! (generic function with 1 method)

In [20]:
labels = Dict({column => labelmap(convert(Array, dropna(full[column]))) for column in categorical_features});


Use "Dict{Any,Any}([a=>b for (a,b) in c])" instead.


In [None]:
[apply_encoding!(train, column, labels[column]) for column in categorical_features]
[apply_encoding!(test, column, labels[column]) for column in setdiff(categorical_features, [:TripType])]
[apply_encoding!(full, column, labels[column]) for column in categorical_features];

### Store Shallow Feature Engineering

In [None]:
writetable("data/train_shallow_featured.tsv", train, separator='\t')
writetable("data/test_shallow_featured.tsv", test, separator='\t')
writetable("data/full_shallow_featured.tsv", full, separator='\t');

## Train 1st Model

In [69]:
train = readtable("data/train_shallow_featured.tsv", separator='\t')
test = readtable("data/test_shallow_featured.tsv", separator='\t');

In [70]:
train = train[:, :]
test = test[:, :];

In [15]:
features = [:Weekday, :Upc, :ScanCount, :DepartmentDescription, :FinelineNumber]
label = :TripType;

In [9]:
function split_train_val(df; train_size=.85, random_state=1)
    srand(random_state)
    nrows = size(df, 1)
    indexes = shuffle(collect(1:nrows))
    train = df[indexes[1:round(Int, nrows*train_size)], :]
    validation = df[indexes[(round(Int, nrows*train_size)+1):end], :] 
    return train, validation
end

split_train_val (generic function with 1 method)

### Prepare Training Data

In [71]:
X_train, X_val = split_train_val(train, train_size=.85, random_state=1)
train_x = convert(Array{Float64,2}, X_train[:, features])
train_y = convert(Array{Float64,1}, X_train[label])
val_x = convert(Array{Float64,2}, X_val[:, features])
val_y = convert(Array{Float64,1}, X_val[label])
test_x = Array{Float64,2}(test[:, features]);

In [72]:
dtrain = DMatrix(train_x, label=train_y)
dval = DMatrix(val_x, label=val_y);

#### XGBoost (Slow)

In [None]:
num_rounds = 100
params = Dict({"objective" => "multi:softmax",
               "booster" => "gbtree",
               "eta" => 0.1,
               "max_depth" => 5,
               "subsample" => 0.85
              })
watchlist = [(dtrain, "train"), (dval, "eval")]

println("Base Model")
tic()
num_class = length(labels[label])+1
model = XGBoost.xgboost(dtrain, num_rounds, param=params, 
                        num_class=num_class, watchlist=watchlist)
toc()

In [None]:
X_train[:YHat] = convert(Array{Int32,1}, XGBoost.predict(model, train_x))
X_val[:YHat] = convert(Array{Int32,1}, XGBoost.predict(model, val_x))
test[:YHat] = convert(Array{Int32,1}, XGBoost.predict(model, test_x));

#### Random Forest

In [None]:
model = build_forest(train_y, train_x, 5, 100, 0.5)

In [None]:
X_train[:YHat] = round(Int32, apply_forest(model, train_x))
X_val[:YHat] = round(Int32, apply_forest(model, val_x));
# test[:YHat] = round(Int32, apply_forest(model, test_x));

## Group Probabilities Predictions by Visit Number

In [23]:
trip_type_sorted_list = map(Float64, sort(keys(labels[label])))
function group_probabilities_by_visit_number(df, column)
    data = Array[]
    for subdf in groupby(df, column)
        row = fill(0.0, length(trip_type_sorted_list)+1)
        row[1] = round(Int, subdf[1, column])
        trip_type_prob = proportionmap(subdf[:YHat])
        index = 2
        for trip_type_key in trip_type_sorted_list
            if haskey(trip_type_prob, trip_type_key)
                row[index] = trip_type_prob[trip_type_key]
            end
            index += 1
        end
        push!(data, row)
    end
    nrows = size(data,1)
    ncols = size(data[1],1)
    dataframe = convert(DataFrame, reshape(vcat(data'...), (nrows, ncols)))
    new_columns_name = vcat(:VisitNumber, map(k -> symbol("TripType_$(round(Int, k))"), trip_type_sorted_list))
    names!(dataframe, new_columns_name)
    dataframe[:VisitNumber] = Array{Int64}(dataframe[:VisitNumber])
    return dataframe
end

group_probabilities_by_visit_number (generic function with 1 method)

In [None]:
train_data = group_probabilities_by_visit_number(X_train, :VisitNumber)
val_data = group_probabilities_by_visit_number(X_val, :VisitNumber)
# submission_data = sort(group_probabilities_by_visit_number(test, :VisitNumber), cols=[:VisitNumber]);

## Evaluate 1st Model

1. Evaluate by **Accuracy (Acc)**: $\frac{TP+TN}{TP+FP+FN+TN}$
2. Evaluate by **Multi-Class Logarithmic Loss (MCLL)**: $-1\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{M}\delta_{ij}log(p_{ij})$  
    where N is the number of visits, M is the number of trip type, $y_{ij}$ is the [Kroneckler Delta](https://en.wikipedia.org/wiki/Kronecker_delta) when the observations exists in test file and $p_{ij}$ is the corresponding prediction.

In [58]:
function create_visit_number_dict(groundtruth)
    visit_number_dict = Dict{Int64, Set{Int64}}()
    for i = 1:size(groundtruth, 1)
        visit_number = groundtruth[i, :VisitNumber]
        if !haskey(visit_number_dict, visit_number)
            visit_number_dict[visit_number] = Set{Int64}()
        end
        push!(visit_number_dict[visit_number], groundtruth[i, label])
    end
    
    return visit_number_dict
end

function eval_mcll(groundtruth, df)
    vn_dict = create_visit_number_dict(groundtruth)
    
    total_score = 0
    c = 0
    N, M = size(df)
    for i = 1:N, j = 2:M
        visit_number = df[i, :VisitNumber]
        trip_type = trip_type_sorted_list[j-1]
        yhat = df[i, j]
        in_gt = trip_type in vn_dict[visit_number]
        total_score += in_gt? (yhat > 0? log(yhat) : 0) : 0
    end
    
    return -1/N * total_score
end

function eval_acc(y, yhat)
    N = length(y)
    TP = sum([y[i] == yhat[i]? 1 : 0 for i = 1:N])
    return round(TP/N, 2)
end

eval_acc (generic function with 1 method)

In [None]:
eval_acc_train = eval_acc(X_train[label], X_train[:YHat])
eval_acc_val = eval_acc(X_val[label], X_val[:YHat])
eval_mcll_train = eval_mcll(X_train, train_data) 
eval_mcll_val = eval_mcll(X_val, val_data)
println("Acc - Score-Train: $eval_acc_train\tScore-Val: $eval_acc_val")
println("MCLL - Score-Train: $eval_mcll_train\tScore-Val: $eval_mcll_val")

## Generate Submission Files

In [None]:
writetable("data/submission_v1.1_rforest.csv", submission_data);

## Submit Predictions to Kaggle

v1 - Raw Attributes + Handle Missing Values with Median **.** (Acc-Tra:.98, Acc-Val:.78, MCLL-Tra:.0, MCLL-Val:.0)
train-merror:0.034118	eval-merror:0.260000
v1.1 - Raw Attr. + Handle Missing + RF(RFeat5, Trees100, Subs.5) + All Data **.** (Acc - Score-Train: 0.11	Score-Val: 0.07)  
v1 - Raw Attr. + Handle Missing + RF(RFeat5, Trees100, Subs.5) + SubSample **34.13327** (Acc - Score-Train: 0.32	Score-Val: 0.15)