<div>
<div style="text-align:center; display:block; float:left; padding:80px;"><img width="200px"  src="https://kaggle2.blob.core.windows.net/competitions/kaggle/4651/logos/front_page.png"/><span style="color:red;">**New User Booking**</span></div>
<div style="">
**Objective:** In this recruiting competition, Airbnb challenges you to predict in which country a new user will make his or her first booking.  
  
**Description:** In this challenge, you are given a list of users along with their demographics, web session records, and some summary statistics. You are asked to predict which country a new user's first booking destination will be. All the users in this dataset are from the USA.
</div>
<img src="https://kaggle2.blob.core.windows.net/competitions/kaggle/4651/media/airbnb_banner.png" />

Author: [Oliveira, D. M.](http://br.linkedin.com/in/dmoliveira)

## <span style="color:blue">Airbnb - New User Booking - v6 Scikit-Learn All Models</span>
Research source: http://scikit-learn.org/stable/

## Import Packages

In [None]:
using DataFrames
using MLBase
using Gadfly
using PyCall

In [None]:
@pyimport sklearn.linear_model as lm
@pyimport sklearn.svm as svm
@pyimport sklearn.neighbors as knn
@pyimport sklearn.naive_bayes as naive_bayes
@pyimport sklearn.tree as tree
@pyimport sklearn.ensemble as ensemble

## Load Data

In [None]:
# Both input files are read with a tab separator, so they share one reader.
load_tsv(path) = readtable(path, separator='\t')

train = load_tsv("data/train_v2.tsv")
test  = load_tsv("data/test_v2.tsv")
# Row-wise concatenation of the two tables.
full  = vcat(train, test);

In [None]:
# Column that holds the prediction target.
label    = :country_destination
# Every column of the test table except `:id` is used as a model input.
features = filter(name -> name != :id, names(test));

## Prepare Data

In [None]:
# Randomly partition the rows of `df` into a training part and a validation part.
#
# Keyword arguments:
#   train_size   - fraction of the rows that goes into the training part
#   random_state - seed handed to `srand` before shuffling, so the split is repeatable
#
# Returns the tuple `(train, validation)`; the validation part receives every row
# that was not selected for training.
function split_train_val(df; train_size=.85, random_state=1)
    srand(random_state)

    total  = size(df, 1)
    cutoff = round(Int, total * train_size)
    order  = shuffle(collect(1:total))

    head_rows = order[1:cutoff]
    tail_rows = order[cutoff+1:end]

    return df[head_rows, :], df[tail_rows, :]
end

In [None]:
# Shift every label code down by one and store the result back in `train`.
# NOTE(review): presumably the codes in the file start at 1 and the models are meant
# to see zero-based classes - confirm. Re-running this cell shifts the labels again.
train[label]   = train[label] - 1
X_train, X_val = split_train_val(train, train_size=.85, random_state=1)

# Float64 copies of the feature columns (matrix) and of the label column (vector).
as_matrix(df) = Array{Float64,2}(df[:, features])
as_vector(df) = Array{Float64,1}(df[label])

train_x = as_matrix(X_train)
train_y = as_vector(X_train)
val_x   = as_matrix(X_val)
val_y   = as_vector(X_val)
test_x  = as_matrix(test);

In [None]:
# Dimensions of the training matrix: row count first, then column count.
nrows = size(train_x, 1)
ncols = size(train_x, 2);

## Prepare Training Models

In [None]:
# Train one estimator: looks up the `:fit` entry of `model` and calls it with a copy
# of all rows of `X` and a copy of all elements of `y`. Returns whatever that call
# returns.
function fit(model, X, y)
    inputs_copy  = X[1:end, :]
    targets_copy = y[1:end]
    return model[:fit](inputs_copy, targets_copy)
end

In [None]:
# Build a scikit-learn `BaggingClassifier` around `base_model`.
#
# Fixed settings: max_samples=0.5 and max_features=0.5, random_state=0 so the
# construction is repeatable, and n_jobs=-1.
# Returns the (not yet fitted) ensemble object.
function create_bagging_model(base_model)
    bagger = ensemble.BaggingClassifier(
        base_model,
        max_samples=0.5,
        max_features=0.5,
        random_state=0,
        n_jobs=-1)
    return bagger
end

# Build a scikit-learn `AdaBoostClassifier` around `base_model`.
#
# Fixed settings: n_estimators=10, random_state=0 and the "SAMME" algorithm.
# Returns the (not yet fitted) ensemble object.
function create_ada_boost_model(base_model)
    booster = ensemble.AdaBoostClassifier(
        base_model,
        n_estimators=10,
        random_state=0,
        algorithm="SAMME")
    return booster
end

In [None]:
# Estimators from `sklearn.linear_model`, keyed by a display name.
# NOTE(review): this dictionary is not merged into `all_models` below and is not
# referenced again in the visible part of the notebook - confirm whether that is intended.
regressor_models = Dict(
    :OLS                => lm.LinearRegression(),
    :Ridge              => lm.Ridge(alpha=.5), 
    :Lasso              => lm.Lasso(alpha=.5), 
    :ElasticNet         => lm.ElasticNet(alpha=.5, l1_ratio=0.5), 
    :LARS               => lm.Lars(), 
    :LassoLARS          => lm.LassoLars(alpha=1),
    :BayesianRidge      => lm.BayesianRidge(),
    :Perceptron         => lm.Perceptron(penalty="elasticnet", alpha=.5)
)
# Stand-alone classifiers, keyed by a display name. Some of these are also used as
# base estimators for the bagging / boosting wrappers further down.
class_models = Dict(
    :LogisticRegression => lm.LogisticRegression(),
    :SGDClassifier      => lm.SGDClassifier(alpha=.5),
    :PassiveAggressiveClassifier => lm.PassiveAggressiveClassifier(),
    :SVMClassifier      => svm.SVC(),
    :kNN                => knn.KNeighborsClassifier(n_neighbors=5, algorithm="ball_tree"),
    :NaiveBayes         => naive_bayes.GaussianNB(),
    :DecisionTree       => tree.DecisionTreeClassifier(),
    :ExtremyTree        => tree.ExtraTreeClassifier())

# Tree ensembles from `sklearn.ensemble`.
# NOTE(review): like `regressor_models`, this dictionary is not merged into
# `all_models`, so these three models are never trained by the cells shown here.
ensemble_models = Dict(
    :RandomForest => ensemble.RandomForestClassifier(n_estimators=10, max_depth=6, random_state=0),
    :ExtraTrees   => ensemble.ExtraTreesClassifier(n_estimators=10, max_depth=6, random_state=0),
    :GradientBoostingTrees => ensemble.GradientBoostingClassifier(n_estimators=10, learning_rate=0.5, 
                                                                   random_state=0))

# One bagging wrapper per selected base classifier; the key is the base key with
# the prefix `Bagging_`. The wrapper receives the same object stored in `class_models`.
bagging_models = Dict([symbol(:Bagging_, key) => create_bagging_model(class_models[key]) 
                       for key in [:SGDClassifier, :SVMClassifier, :DecisionTree, :ExtremyTree]])

# Same selection wrapped with AdaBoost; keys carry the prefix `Boosting_`.
boosting_models = Dict([symbol(:Boosting_, key) => create_ada_boost_model(class_models[key])
                        for key in [:SGDClassifier, :SVMClassifier, :DecisionTree, :ExtremyTree]])

# The set of models that the following cells train and evaluate.
all_models = merge(class_models, bagging_models, boosting_models);

## Train

In [None]:
# Fit every entry of `all_models` on the training split; the return values of the
# individual fits are not kept.
for (name, model) in all_models
    fit(model, train_x, train_y)
end

## Evaluate

In [None]:
# Root-mean-square error between the targets `y` and the predictions `yhats`,
# rounded to 4 decimal places.
function eval_rmse(y, yhats)
    residuals   = y - yhats
    mean_square = sum(residuals .^ 2) / length(y)
    return round(sqrt(mean_square), 4)
end
# Share of positions at which `yhats` equals `y`, rounded to 4 decimal places.
# (Despite the name, this is the fraction of exact matches over all elements.)
function eval_precision(y, yhats)
    matches = Array{Int,1}(y .== yhats)
    return round(sum(matches) / length(y), 4)
end

In [None]:
# Predictions of every model in `all_models`, keyed by the model's name:
# one dictionary for the training matrix and one for the validation matrix.
# Each model's `predict` entry is called with a copy of all rows of the matrix.
yhats_train = Dict([key => all_models[key][:predict](train_x[1:end, :]) for key in keys(all_models)])
yhats_val = Dict([key => all_models[key][:predict](val_x[1:end, :]) for key in keys(all_models)]);

In [None]:
# Training-set scores, one entry per model, in the iteration order of `yhats_train`.
rmse_train      = [eval_rmse(train_y, yhat) for (name, yhat) in yhats_train]
precision_train = [eval_precision(train_y, yhat) for (name, yhat) in yhats_train];

In [None]:
# Validation-set scores, one entry per model, in the iteration order of `yhats_val`.
rmse_val      = [eval_rmse(val_y, yhat) for (name, yhat) in yhats_val]
precision_val = [eval_precision(val_y, yhat) for (name, yhat) in yhats_val];

In [None]:
# Long-format score table: one row per (model, data set, metric) combination.
#
# The score vectors were produced by iterating `yhats_train` / `yhats_val`, so the
# model names are read from those same dictionaries. Two different Dicts holding the
# same keys are not guaranteed to iterate in the same order, hence taking the names
# from `all_models` could attach a score to the wrong model.
train_names    = collect(keys(yhats_train))
val_names      = collect(keys(yhats_val))
ntrain, nval   = length(train_names), length(val_names)

# Row layout (must match `values` below):
#   [train RMSE | train Precision | validation RMSE | validation Precision]
model_names    = vcat(train_names, train_names, val_names, val_names)
# Retained from the original cell in case later cells read them.
half_length    = round(Int, length(model_names)/2)
quarter_length = round(Int, half_length/2)
# Built with Base `fill`/`vcat`; the original called `repeach`, which is not defined
# in the visible part of this notebook.
data_types     = vcat(fill("Train", 2 * ntrain), fill("Validation", 2 * nval))
metrics        = vcat(fill("RMSE", ntrain), fill("Precision", ntrain),
                      fill("RMSE", nval), fill("Precision", nval))
# NOTE: this global shadows `Base.values` for the rest of the session.
values         = vcat(rmse_train, precision_train, rmse_val, precision_val)
results        = DataFrame(model=model_names, data_type=data_types, metric=metrics, value=values);

In [None]:
# Line-and-point chart of one metric for every model, coloured by data set.
set_default_plot_size(1000px, 450px)
metric    = "RMSE"
# Row mask selecting only the rows of the chosen metric.
is_metric = results[:metric] .== metric
plot(results[is_metric, :],
     x=:model, y=:value, color=:data_type,
     Scale.x_discrete, Geom.line, Geom.point,
     Guide.title("Evaluation - $metric"),
     Guide.ylabel(metric))

In [None]:
# Line-and-point chart of one metric for every model, coloured by data set.
set_default_plot_size(1000px, 450px)
metric    = "Precision"
# Row mask selecting only the rows of the chosen metric.
is_metric = results[:metric] .== metric
plot(results[is_metric, :],
     x=:model, y=:value, color=:data_type,
     Scale.x_discrete, Geom.line, Geom.point,
     Guide.title("Evaluation - $metric"),
     Guide.ylabel(metric))

## Export Model