In [23]:
# Notebook-wide stylesheet for the HTML produced by render_doc (tt, method*,
# method_cell* classes) and render_section (section* classes).
style_css = """
<style>
span.tt {
    font-family: 'Lucida Sans Typewriter', 'Lucida Console', 
                  monaco, 'Bitstream Vera Sans Mono', monospace;
    color: #ff6666;
}

div.method {
    margin-bottom: 10px;
}

div.method_title {
    padding: 4px 4px 4px 20px;
    background-color:#696565; color:#ffa500;
    border-left: 8px solid #fa8072;
}

div.method_body {
    padding: 0px 20px 20px 20px;
    border-left: 8px solid #fa8072;
    background-color: #ffe4e1;
}

div.method_cell {}

div.method_cell_title {
    padding: 20px 0px 0px 0px;
}

div.method_cell_body {}


h1.section {
    border-bottom:5px solid #696565;
}

span.section_number {
    background-color:#696565;
    color:#ff6666;
    padding:3px 5px 3px 5px;
}

span.section_title {
    padding:5px 5px 0px 5px;
    color:#ffa500;
}

</style>
"""
# Emit the stylesheet as raw HTML so it applies to the rendered notebook.
display("text/html", style_css)

"""
    render_doc(method_signature, description, arguments, outputs)

Display an HTML documentation card for one method.

`arguments` and `outputs` are collections of `(name, type, description)` tuples
rendered by `field_list_to_html`; an empty collection is shown as "None.".
A period is appended after `description`, so pass it without a trailing one.
Returns whatever `display` returns.
"""
function render_doc(method_signature, description, arguments, outputs)
    arguments_html       = field_list_to_html(arguments)
    outputs_html         = field_list_to_html(outputs)
    # The template closes every <div> it opens; a stray trailing "<div/>" would be
    # treated by browsers as a new unclosed <div> wrapping the following cells.
    html_method_template = """
    <div class="method">
        <div class="method_title"><span class="tt">Method: </span><b>$method_signature</b></div>
            <div class="method_body">
                <div class="method_cell">
                    <div class="method_cell_title"><span class="tt"><b>Description</b></span></div>
                    <div class="method_cell_body">
                        <span class="tt">$description.</span>
                    </div>
                </div>
                <div class="method_cell">
                    <div class="method_cell_title"><span class="tt"><b>Arguments</b></span></div>
                    <div class="method_cell_body">
                        <ul>
                            $arguments_html
                        </ul>
                    </div>
                </div>
                <div class="method_cell">
                    <div class="method_cell_title"><span class="tt"><b>Outputs</b></span></div>
                    <div class="method_cell_body">
                        <ul>
                            $outputs_html
                        </ul>
                    </div>
                </div>
            </div>
        </div>
    """
    return display("text/html", html_method_template)
end

"""
    field_list_to_html(list)

Turn a collection of `(name, type, description)` tuples into `<li>` items, one per
line. An empty collection yields a single "None." span instead of list items.
"""
function field_list_to_html(list)
    if length(list) == 0
        return "<span class=\"tt\">None.</span>"
    end
    items = [string("<li><span class=\"tt\">", name, ", <b>", kind, "</b>: ",
                    text, ".</span></li>\n")
             for (name, kind, text) in list]
    return join(items)
end

"""
    render_section(section_number, subsection_number, title)

Display a section heading as HTML. The "number.subnumber" badge is emitted only
when `section_number` is non-empty; the title span is always emitted.
"""
function render_section(section_number, subsection_number, title)
    number_badge = ""
    if length(section_number) > 0
        number_badge = """<span class="section_number">$section_number.$subsection_number</span>"""
    end
    heading = string("""<h1 class="section">""", number_badge,
                     """<span class="section_title">$title</span></h1>""")
    return display("text/html", heading)
end
# Notebook title, rendered without a section number.
render_section("", "", "Julia Assistant Tools for Machine Learning")

**Author: [Oliveira, D. M.](http://br.linkedin.com/in/dmztheone) - [GitHub](http://www.github.com/dmoliveira)**

In [273]:
render_section("", "", "Introduction")

Write a proper introduction... 

In [155]:
render_section("01", "", "Import Packages")

In [369]:
using Base.Test
using DataFrames
using DataStructures
using DecisionTree
using Iterators
using MLBase
using GLM
using XGBoost
using GZip
using PyCall

In [29]:
@pyimport sklearn.linear_model as sklm
@pyimport sklearn.svm as svm
@pyimport sklearn.neighbors as knn
@pyimport sklearn.naive_bayes as naive_bayes
@pyimport sklearn.tree as tree
@pyimport sklearn.ensemble as ensemble

In [151]:
# Section 02 heading.
render_section("02", "", "HTML Assistant Tools")

### HTML Tools

In [223]:
# Toggle widget: a script plus a submit button that hides/shows the notebook's
# input cells ('div.input'). The dollar signs are backslash-escaped so Julia does
# not try to interpolate the jQuery calls. code_toggle is also registered on
# document-ready, so with code_show starting as true the inputs get hidden on load.
html = """
<script>
    code_show=true; 
    function code_toggle() {
     if (code_show){
     \$('div.input').hide();
     } else {
     \$('div.input').show();
     }
     code_show = !code_show
    } 
    \$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
    <input type="submit" value="Click here to toggle on/off the raw code.">
</form>"""
# Emit the widget as raw HTML.
display("text/html", html)

In [9]:
# Section 03: "02" is already taken by the HTML tools section and the next one is "04".
render_section("03", "", "Summary")

- **Functions for Missing Values**
    - get_default_values(df, features)
    - apply_default_values!(df, default_values)
- **Functions for Label Encoding**
    - get_label_encoding(df, features)
    - apply_encoding!(df, encoding)
- **Functions for Feature Expansion**
    - get_all_values(df, features)
    - apply_one_hot_encoding!(df, features, all_values)
    - apply_interval_features!(df, features, interval_values)
- **Functions Extract Date and Time**
    - extract_date_features(string_date)
    - extract_time_features(string_time_hhmmss)
- **Functions Extract Age Features**
    - extract_age_features(age)
- **Functions Export CSV/TSV/Gzip**
    - export_csv(df, file_path)
    - export_tsv(df, file_path)
    - export_gz(data, file_path)
- **Functions Split Train/Validation Sets**
    - split_train_val (df; train_size=.85, random_state=1)
    - gen_train_val (train, features, label, train_size=.85, random_state=1)
- **Functions XGBoost Auxiliary Tools**
    - gen_dtrain(train, features, label, train_size=.85, random_state=1)
- **Functions GLM Auxiliary Tools**
    - gen_formula (features, label)
    - gen_formulas(features, label)
    - gen_glm(train, formulas, family=Binomial(), link=LogitLink())
- **Functions for Scikit-Learn Models**
    - train_scikit_models(models, train_x, train_y)
    - predict_scikit_models (models, X)
- **Functions Evaluation Metrics**
    - eval_rmse(y, yhat)
    - eval_precision(y, yhat)
- **Functions Vowpal Wabbit**
    - generate_vw_file(df, feature_space_indexes, feature_space_names)

In [10]:
render_section("04", "", "Functions: Missing Values")

In [7]:
# Rendered documentation cards for the two missing-value helpers
# (get_default_values and apply_default_values!).
render_doc("get_default_values (df, features)", 
           """Calculates the mode for all features informed. 
              Usually it is used to replace missing data""", 
           [("df", "DataFrame", "Data used to calculate modes by column"),
            ("features", "Array{Symbol,1}", "Column names that will be considered")],
           [("default_values", "Dict", "Dictionary with the default values")])

render_doc("apply_default_values! (df, default_values)",
           """Apply a dictionary of default values to missing values (NAs) of the
              given dataframe. Only keys identified as column will be used""",
          [("df", "DataFrame", "Data that will be used to replace NAs to default values"),
           ("default_values", "Dict", "Dictionary with features/default values")],
          [("df", "DataFrame", "Original dataframe with NAs replaced by default values")])

In [8]:
"""
    get_default_values(df, features)

Return a `Dict` mapping each name in `features` to the mode of that column of `df`,
computed after dropping NAs.
"""
function get_default_values(df, features)
    modes = Dict()
    for column in features
        modes[column] = mode(dropna(df[column]))
    end
    return modes
end

"""
    apply_default_values!(df, default_values)

Replace the NA entries of `df` in place with the per-column values in
`default_values` (column name => replacement) and return `df`.

Keys that do not name a column of `df` are skipped, as the rendered documentation
for this function promises, instead of raising an indexing error.
"""
function apply_default_values!(df, default_values)
    columns = names(df)
    for (feature, default_value) in default_values
        feature in columns || continue
        df[isna(df[feature]), feature] = default_value
    end
    return df
end

apply_default_values! (generic function with 1 method)

In [11]:
render_section("05", "", "Functions: Label Encoding")

In [164]:
# Rendered documentation cards for the label-encoding helpers
# (get_label_encoding and apply_encoding!).
render_doc("get_label_encoding (df, features)",
           "Get label encoding for features of a given dataframe",
           [("df", "DataFrame", "Data that will be used to create label encoding"), 
            ("features", "Array{Symbol,1}", "Features that will be created label encoding")],
            [("_", "Dict", "A dictionary with the encoding for each feature informed")])

render_doc("apply_encoding! (df, encoding)",
           "Apply encoding to a given dataframe",
           [("df", "DataFrame", "Dataframe that will be encoded"),
            ("encoding", "Dict", "A dictionary with the feature/encoding data")],
           [("df", "DataFrame", "Dataframe with encoded columns")])

In [161]:
"""
    get_label_encoding(df, features)

Build a `Dict` from each name in `features` to the `labelmap` of that column of
`df`, computed after dropping NAs.
"""
function get_label_encoding(df, features)
    return Dict([feature => labelmap(dropna(df[feature])) for feature in features])
end

"""
    apply_encoding!(df, encoding)

Overwrite each column of `df` named by a key of `encoding` with its
`labelencode`d values, using the label map stored under that key. Returns `df`.
"""
function apply_encoding!(df, encoding)
    for (feature, label_map) in encoding
        df[feature] = labelencode(label_map, df[feature])
    end
    return df
end

apply_encoding! (generic function with 1 method)

In [55]:
render_section("06", "", "Functions: Feature Transformation")

In [61]:
render_section("06", "01", "Feature Transformation: One Hot Encoding")

In [62]:
# Rendered documentation cards for the one-hot-encoding helpers. The all_values
# argument belongs to apply_one_hot_encoding! (get_all_values only takes df and
# features), and the link uses href so it is actually clickable.
render_doc("get_all_values (df, features)",
           "Get all unique values from a given dataframe",
           [("df", "DataFrame", "Dataframe used to extract unique values"),
            ("features", "Array{Symbol,1}", "Features whose unique values will be extracted")],
           [("_", "Dict", "A dictionary with unique values for each feature informed")])

render_doc("apply_one_hot_encoding! (df, features, all_values)",
           """Transform each informed feature into N binary features, each one
              equal to 1 when the row holds the corresponding value.
              * The original columns will be deleted. See more about this
              <a href="http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html">
              subject</a>""",
          [("df", "DataFrame", "Dataframe used to expand with one hot encoding"),
           ("features", "Array{Symbol,1}", "Features to expand with one hot encoding"),
           ("all_values", "Dict", "Unique values of each feature, as returned by get_all_values")],
          [("df", "DataFrame", "Dataframe with columns expanded by one hot encoding")])

In [47]:
"""
    get_all_values(df, features)

Build a `Dict` from each name in `features` to the `Set` of values found in that
column of `df` after dropping NAs.
"""
function get_all_values(df, features)
    return Dict([feature => Set(dropna(df[feature])) for feature in features])
end

"""
    apply_one_hot_encoding!(df, features, all_values)

Append one indicator column per (feature, value) pair listed in `all_values`
to `df`, then delete the original `features` columns. Returns `df`.
"""
function apply_one_hot_encoding!(df, features, all_values)
    indicator_names  = get_categorical_features_names(features, all_values)
    indicator_matrix = get_categorical_feature_matrix(df, features, all_values)

    # Add the new columns first; the source columns are only dropped afterwards.
    apply_categorical_feature_matrix!(df, indicator_names, indicator_matrix)
    remove_features!(df, features)
    return df
end

"""
    get_categorical_features_names(features, all_values)

Return the indicator column names `feature_value` for every value listed under
each feature in `all_values`, in the iteration order of `features` and then of
`all_values[feature]`.

Names are built with `Symbol(string(...))`, the constructor form already used by
`gen_feature_space`, rather than the deprecated lowercase `symbol` function.
"""
function get_categorical_features_names(features, all_values)
    feature_names = Symbol[]
    for feature in features, value in all_values[feature]
        push!(feature_names, Symbol(string(feature, "_", value)))
    end
    return feature_names
end

"""
    get_categorical_feature_matrix(df, features, all_values)

Build the one-hot indicator matrix for `features`: one row per row of `df`, and
for each feature one column per entry of `all_values[feature]` (same column order
as `get_categorical_features_names`).

The matrix is preallocated and filled row by row. This avoids collecting one
small vector per cell and splatting all of them into a single `vcat` call, which
passes one argument per row x feature.
"""
function get_categorical_feature_matrix(df, features, all_values)
    nrows  = size(df, 1)
    widths = Int[length(all_values[feature]) for feature in features]
    feature_matrix = zeros(Int, nrows, sum(widths))
    for i = 1:nrows
        i % 1000 == 0 && println("\tProcessed Categorical Feature $i/$nrows.")
        offset = 0
        for (f, feature) in enumerate(features)
            columns = (offset + 1):(offset + widths[f])
            feature_matrix[i, columns] = to_cat_vector(all_values[feature], df[i, feature])
            offset += widths[f]
        end
    end
    return feature_matrix
end

"""
    apply_categorical_feature_matrix!(df, feature_names, feature_matrix)

Store column `i` of `feature_matrix` in `df` under `feature_names[i]` for every
name, and return `df`.
"""
function apply_categorical_feature_matrix!(df, feature_names, feature_matrix)
    for (column, name) in enumerate(feature_names)
        df[name] = feature_matrix[:, column]
    end
    return df
end

"""
    remove_features!(df, feature_names)

Call `delete!` on `df` once for each entry of `feature_names` and return `df`.
"""
function remove_features!(df, feature_names)
    for obsolete in feature_names
        delete!(df, obsolete)
    end
    return df
end

"""
    to_cat_vector(all_values, actual_value)

Return a 0/1 vector with one entry per element of `all_values`, set to 1 where
that element equals `actual_value`.
"""
function to_cat_vector(all_values, actual_value)
    return [Int(candidate == actual_value) for candidate in all_values]
end

to_cat_vector (generic function with 1 method)

In [59]:
render_section("06", "02", "Features Transformation: Interval Features")

In [63]:
# Rendered documentation card for apply_interval_features!. The bounds mirror
# to_interval_vector: ei <= X < ei+1 for consecutive edges, then X < e1, then
# X >= eEnd. The comparison signs are HTML-escaped because the text is shown as HTML.
render_doc("apply_interval_features! (df, features, interval_values)",
           """Transform each informed feature into N interval features given by
              a dictionary. The feature vector iterates through each interval
              using the function [ei &lt;= X &lt; ei+1], followed by X &lt; e1
              and X &gt;= eEnd. * The original columns will be deleted""",
          [("df", "DataFrame", "Dataframe used to expand with interval features"),
           ("features", "Array{Symbol,1}", "Features to expand with interval features"),
           ("interval_values", "Dict", """A dictionary where the key is the feature name
             and the value is a list with the valid intervals""")],
          [("df", "DataFrame", "Dataframe with columns expanded by interval features")])

In [49]:
"""
    apply_interval_features!(df, features, interval_values)

Append the interval indicator columns derived from `interval_values` to `df`,
then delete the original `features` columns. Returns `df`.
"""
function apply_interval_features!(df, features, interval_values)
    indicator_names  = get_interval_features_names(features, interval_values)
    indicator_matrix = get_interval_feature_matrix(df, features, interval_values)

    # Same column-assignment helper as the one-hot path; source columns go last.
    apply_categorical_feature_matrix!(df, indicator_names, indicator_matrix)
    remove_features!(df, features)
    return df
end

"""
    get_interval_features_names(features, interval_values)

Return the interval indicator column names for each feature, in the same order
as the entries produced by `to_interval_vector`:

- `feature_e<a>x<b>` for every pair of consecutive edges `a`, `b`;
- `feature_<first>` for values below the first edge;
- `feature_e<last>` for values at or above the last edge.

Names are built with `Symbol(string(...))`, the constructor form already used by
`gen_feature_space`, rather than the deprecated lowercase `symbol` function.
"""
function get_interval_features_names(features, interval_values)
    feature_names = Symbol[]
    for feature in features
        intervals = interval_values[feature]
        for i = 1:length(intervals) - 1
            push!(feature_names,
                  Symbol(string(feature, "_e", intervals[i], "x", intervals[i+1])))
        end
        push!(feature_names, Symbol(string(feature, "_", intervals[1])))
        push!(feature_names, Symbol(string(feature, "_e", intervals[end])))
    end
    return feature_names
end

"""
    get_interval_feature_matrix(df, features, interval_values)

Build the interval indicator matrix for `features`: one row per row of `df`, and
for each feature `length(interval_values[feature]) + 1` columns filled by
`to_interval_vector`.

The matrix is preallocated and filled row by row instead of splatting one small
vector per cell into `vcat`. The progress message names interval features; it
previously repeated the categorical wording.
"""
function get_interval_feature_matrix(df, features, interval_values)
    nrows  = size(df, 1)
    widths = Int[length(interval_values[feature]) + 1 for feature in features]
    feature_matrix = zeros(Int, nrows, sum(widths))
    for i = 1:nrows
        i % 1000 == 0 && println("\tProcessed Interval Feature $i/$nrows.")
        offset = 0
        for (f, feature) in enumerate(features)
            columns = (offset + 1):(offset + widths[f])
            feature_matrix[i, columns] = to_interval_vector(interval_values[feature],
                                                            df[i, feature])
            offset += widths[f]
        end
    end
    return feature_matrix
end

"""
    to_interval_vector(interval_values, actual_value)

Return 0/1 flags locating `actual_value` relative to the edges in
`interval_values`: one flag per pair of consecutive edges (`lower <= x < upper`),
then one for `x` below the first edge, then one for `x` at or above the last edge.
"""
function to_interval_vector(interval_values, actual_value)
    flags = Int[]
    for i = 1:length(interval_values) - 1
        inside = interval_values[i] <= actual_value < interval_values[i+1]
        push!(flags, Int(inside))
    end
    push!(flags, Int(actual_value < interval_values[1]))
    push!(flags, Int(interval_values[end] <= actual_value))
    return flags
end

to_interval_vector (generic function with 1 method)

In [60]:
render_section("06", "03", "Feature Transformation: Feature Space")

In [71]:

# Rendered documentation card for gen_feature_space.
render_doc("gen_feature_space (features, separator='_')",
           """Generate feature space. It groups feature that have the same prefix.
              It can receives a separator to break feature name. For example, if
              exists FeatureA_1 FeatureA_2 and FeatureB, when passed through
              this method we got as feature space FeatureA and Feature B with
              its respective indices""",
           [("features", "Array{Symbol,1}", "Features to be analyzed"),
            ("separator", "Char", "Separator used to get the feature prefix")],
           [("feature_space", "OrderedDict", "Ordered dict with key as key feature"*
             " and value as tuple with start and end index")])

In [76]:
"""
    gen_feature_space(features, separator='_')

Group `features` by the part of each name before the first `separator` and return
an `OrderedDict` mapping each prefix to `(first_index, last_index)`, the positions
of its first and last occurrence in `features`. Entries are ordered by that tuple.
"""
function gen_feature_space(features, separator='_')
    # Prefix = text before the first separator; a name without one is its own prefix.
    prefix_feature = [Symbol(split(string(feature), separator)[1]) for feature in features]
    key_feature    = Set(prefix_feature)
    # NOTE(review): the (first, last) span only describes a group correctly when
    # features sharing a prefix are contiguous in `features` -- confirm with callers.
    feature_space  = [(key, (findfirst(prefix_feature, key), findlast(prefix_feature, key))) for key in key_feature]
    # Set iteration order is arbitrary, so sort by index span before building the dict.
    feature_space  = OrderedDict(sort(feature_space, by = v -> v[2]))
    return feature_space
end

gen_feature_space (generic function with 2 methods)

In [13]:
render_section("07", "01", "Features: Extract Date Features")

In [215]:
# Rendered documentation cards for extract_date_features and extract_time_features.
render_doc("extract_date_features (string_date)",
           "Create feature vector from a raw string date",
         [("string_date", "AbstractString", "A string date in format YYYY-mm-dd")],
           [("_", "Array{Int,1}", "Feature vector of date")])

render_doc("extract_time_features(string_time_hhmmss)",
           "Create feature vector from a raw string time",
           [("string_time_hhmmss", "AbstractString", "A string time in format HHMMSS")],
           [("_", "Array{Int,1}", "Feature vector of time")])

In [214]:
# Column names for the vector returned by extract_date_features, in the same order
# as the values it returns.
# NOTE(review): :is_forth_quarter_year looks like a misspelling of "fourth"; it is
# left untouched because the symbol may already be used as a column name.
DATE_FEATURES_NAMES = [:is_sunday, :is_monday, :is_tuesday, :is_wednesday, :is_thursday, 
                       :is_friday, :is_saturday, :is_weekday, :is_weekend, :is_middle_week,
                       :is_january, :is_february, :is_march, :is_april, :is_may, :is_june,
                       :is_july, :is_august, :is_september, :is_october, :is_november, 
                       :is_december, :is_first_quarter_year, :is_second_quarter_year, 
                       :is_third_quarter_year, :is_forth_quarter_year, :is_first_trimester_year, 
                       :is_second_trimester_year, :is_third_trimester_year, :is_first_half_year, 
                       :is_second_half_year]

# Column names for the vector returned by extract_time_features, in the same order.
# NOTE(review): :is_evenning and :is_launch_time (presumably "lunch") look
# misspelled; kept as-is for the same reason.
HOUR_FEATURES_NAMES = [:is_morning, :is_afternoon, :is_evenning, :is_night, :is_late_night, 
                       :is_midday, :is_launch_time, :is_end_work_day]

"""
    extract_date_features(string_date)

Parse `string_date` with `Dates.Date` and return 31 integer 0/1 flags, in the
order of `DATE_FEATURES_NAMES`:

- 7 day-of-week flags, Sunday first;
- weekday (Mon-Fri), weekend (Sat-Sun), middle of week (Tue-Thu);
- 12 month flags, January first;
- 4 quarter flags (3-month blocks);
- 3 "trimester" flags (4-month blocks: months 1-4, 5-8, 9-12);
- 2 half-year flags.

Flags are produced with `Int(condition)` rather than `condition? 1 : 0`; the
ternary without a space before `?` is rejected by later Julia parsers.
"""
function extract_date_features(string_date)
    date  = Dates.Date(string_date)
    dow   = Dates.dayofweek(date)
    month = Dates.month(date)

    # Sunday comes first to match the published column order.
    weekdays  = [Dates.Sunday, Dates.Monday, Dates.Tuesday, Dates.Wednesday,
                 Dates.Thursday, Dates.Friday, Dates.Saturday]
    day_flags = Int[Int(dow == day) for day in weekdays]

    week_part_flags = [Int(Dates.Monday <= dow <= Dates.Friday),      # is_weekday
                       Int(Dates.Saturday <= dow <= Dates.Sunday),    # is_weekend
                       Int(Dates.Tuesday <= dow <= Dates.Thursday)]   # is_middle_week

    month_flags = Int[Int(month == m) for m in Dates.January:Dates.December]

    quarter_flags = [Int(month < 4),
                     Int(4 <= month < 7),
                     Int(7 <= month < 10),
                     Int(month >= 10)]

    trimester_flags = [Int(month <= 4),
                       Int(4 < month <= 8),
                       Int(month > 8)]

    half_flags = [Int(month <= 6), Int(month > 6)]

    return vcat(day_flags, week_part_flags, month_flags,
                quarter_flags, trimester_flags, half_flags)
end

"""
    extract_time_features(string_time_hhmmss)

Read the hour from the first two characters of `string_time_hhmmss` and return
8 integer 0/1 flags, in the order of `HOUR_FEATURES_NAMES`:
morning (6-11), afternoon (13-17), evening (18-19), night (20-22),
late night (23-24 or 0-5), midday (12), lunch time (11-14), end of work day (17-19).

Flags are produced with `Int(condition)` rather than `condition? 1 : 0`; the
ternary without a space before `?` is rejected by later Julia parsers.
"""
function extract_time_features(string_time_hhmmss)
    hour = parse(Int, string_time_hhmmss[1:2])

    is_morning      = Int(6 <= hour < 12)
    is_afternoon    = Int(13 <= hour < 18)
    is_evenning     = Int(18 <= hour < 20)
    is_night        = Int(20 <= hour <= 22)
    is_late_night   = Int(22 < hour <= 24 || 0 <= hour < 6)
    is_midday       = Int(12 <= hour < 13)
    is_launch_time  = Int(11 <= hour <= 14)
    is_end_work_day = Int(17 <= hour <= 19)

    return [is_morning, is_afternoon, is_evenning, is_night, is_late_night,
            is_midday, is_launch_time, is_end_work_day]
end

extract_time_features (generic function with 1 method)

In [14]:
render_section("07", "02", "Features: Extract Age Features")

In [216]:
# Rendered documentation card for extract_age_features.
render_doc("extract_age_features(age)",
           "Extract age feature vector",
          [("age", "Int", "Age in years")],
          [("_", "Array{Int,1}", "Feature vector of age")])

In [None]:
# Column names for the vector returned by extract_age_features, in the same order
# as the values it returns.
age_feature_names = [:is_age_under_25, :is_age_between_25_34, :is_age_between_35_44, 
                     :is_age_between_45_54, :is_age_between_55_64, :is_age_above_64,
                     :is_young_adult, :is_old_adult]

"""
    extract_age_features(age)

Return 8 integer 0/1 flags for `age`, in the order of `age_feature_names`:
six age brackets (under 25, 25-34, 35-44, 45-54, 55-64, 65 and above) followed by
the two coarser groups young adult (25-44) and old adult (45-64).

Flags are produced with `Int(condition)` rather than `condition? 1 : 0`; the
ternary without a space before `?` is rejected by later Julia parsers.
"""
function extract_age_features(age)
    is_age_under_25      = Int(age < 25)
    is_age_between_25_34 = Int(25 <= age < 35)
    is_age_between_35_44 = Int(35 <= age < 45)
    is_age_between_45_54 = Int(45 <= age < 55)
    is_age_between_55_64 = Int(55 <= age < 65)
    is_age_above_64      = Int(age >= 65)

    is_young_adult       = Int(25 <= age < 45)
    is_old_adult         = Int(45 <= age < 65)

    return [is_age_under_25, is_age_between_25_34, is_age_between_35_44,
            is_age_between_45_54, is_age_between_55_64, is_age_above_64,
            is_young_adult, is_old_adult]
end

In [378]:
render_section("08", "", "Functions: Export/Import Files")

In [380]:
# Rendered documentation cards for the export helpers (CSV, TSV and Gzip);
# all three are documented with an empty outputs list.
render_doc("export_csv (df, file_path)",
           "Export dataframe to CSV format",
           [("df", "DataFrame", "Dataframe to be exported"),
            ("file_path", "AbstractString", "File path to save CSV file")], [])

render_doc("export_tsv (df, file_path)",
           "Export dataframe to TSV format",
           [("df", "DataFrame", "Dataframe to be exported"),
            ("file_path", "AbstractString", "File path to save TSV file")], [])

render_doc("export_gz (data, file_path)",
           "Export string data in Gzip format",
           [("data", "AbstractString", "Data in string format to be compressed and exported"),
            ("file_path", "AbstractString", "File path to store data in gzip format")], [])

In [379]:
"""
    export_csv(df, file_path)

Write `df` to `file_path` with `writetable`, leaving every option at its default.
"""
function export_csv(df, file_path)
    return writetable(file_path, df)
end
"""
    export_tsv(df, file_path)

Write `df` to `file_path` with `writetable`, using a tab as the field separator.

The `separator` keyword takes the delimiter character itself; the string "tsv"
is not a valid value for it.
"""
function export_tsv(df, file_path)
    return writetable(file_path, df, separator='\t')
end
"""
    export_gz(data::AbstractString, file_path)

Write `data` gzip-compressed to `file_path` with ".gz" appended to the name.

The stream is closed in a `finally` clause so it is released even when the write
fails. Returns the value returned by `write`.
"""
function export_gz(data::AbstractString, file_path)
    file = GZip.open("$file_path.gz", "w")
    try
        write(file, data)
    finally
        close(file)
    end
end

export_gz (generic function with 1 method)

In [16]:
render_section("09", "", "Functions: Split Training and Validation")

In [183]:
# Rendered documentation cards for split_train_val and gen_train_val.
# gen_train_val returns plain arrays (it converts with Array{Float64,...}), so
# val_x is described as a matrix, not a dataframe.
render_doc("split_train_val (df; train_size=.85, random_state=1)",
           "Separate in train and validation dataframes",
           [("df", "DataFrame", "Original dataframe that will be split"),
            ("train_size", "Float", """Size in percentage of the training dataframe.
              Default value = 0.85"""),
            ("random_state", "Int", "Random seed used. Default value = 1")],
           [("train", "DataFrame", "Train dataframe with train_size percentage"),
            ("validation", "DataFrame", "Validation dataframe with 1-train_size percentage")])

render_doc("gen_train_val (train, features, label, train_size=.85, random_state=1)",
           "Generate train and validation for x and y",
           [("train", "DataFrame", "Train dataframe that will be split"),
            ("features", "Array{Symbol,1}", "Columns to consider as X (features)"),
            ("label", "Symbol", "Column to consider as Y (output)"),
            ("train_size", "Float", """Size in percentage of the training dataframe.
              Default value = 0.85"""),
            ("random_state", "Int", "Random seed used. Default value = 1")],
           [("train_x", "Array{Float,2}", "Train matrix with features"),
            ("train_y", "Array{Float,1}", "Train array with outputs"),
            ("val_x", "Array{Float,2}", "Validation matrix with features"),
            ("val_y", "Array{Float,1}", "Validation array with outputs")])

In [52]:
"""
    split_train_val(df; train_size=.85, random_state=1)

Seed the global RNG with `random_state`, shuffle the row indices of `df`, and
return `(train, validation)`: the first `round(Int, nrows * train_size)` shuffled
rows and the remaining ones.
"""
function split_train_val(df; train_size=.85, random_state=1)
    srand(random_state)

    total_rows = size(df, 1)
    train_rows = round(Int, size(df, 1) * train_size)
    order      = shuffle(collect(1:total_rows))

    train      = df[order[1:train_rows], :]
    validation = df[order[train_rows+1:end], :]
    return train, validation
end

"""
    gen_train_val(train, features, label, train_size=.85, random_state=1)

Split `train` with `split_train_val` and return
`(train_x, train_y, val_x, val_y)` as `Float64` arrays: the `features` columns as
matrices and the `label` column as vectors.

`train_size` and `random_state` are forwarded to `split_train_val`; they were
previously ignored in favour of hard-coded .85 and 1.
"""
function gen_train_val(train, features, label, train_size=.85, random_state=1)
    X_train, X_val = split_train_val(train; train_size=train_size,
                                     random_state=random_state)
    train_x = Array{Float64,2}(X_train[:, features])
    train_y = Array{Float64,1}(X_train[label])
    val_x   = Array{Float64,2}(X_val[:, features])
    val_y   = Array{Float64,1}(X_val[label])
    return train_x, train_y, val_x, val_y
end

gen_train_val (generic function with 3 methods)

In [17]:
render_section("10", "", "Functions: XGB Auxiliary Tools")

In [186]:
# Rendered documentation card for gen_dtrain.
render_doc("gen_dtrain(train, features, label, train_size=.85, random_state=1)",
           "Generate DMatrices for train and validation to use on XGBoost",
           [("train", "DataFrame", "Train dataframe that will be split"),
            ("features", "Array{Symbol,1}", "Columns to consider as X (features)"),
            ("label", "Symbol", "Column to consider as Y (output)"),
            ("train_size", "Float", """Size in percentage of the training dataframe.
              Default value = 0.85"""),
            ("random_state", "Int", "Random seed used. Default value = 1")],
           [("dtrain", "DMatrix", "Train matrix to XGB"),
            ("dval", "DMatrix", "Validation matrix to XGB")])

In [None]:
"""
    gen_dtrain(train, features, label, train_size=.85, random_state=1)

Split `train` through `gen_train_val` and wrap both parts as `DMatrix` objects
with their labels attached. Returns `(dtrain, dval)`.
"""
function gen_dtrain(train, features, label, train_size=.85, random_state=1)
    split_parts = gen_train_val(train, features, label, train_size, random_state)
    train_x, train_y, val_x, val_y = split_parts
    return DMatrix(train_x, label=train_y), DMatrix(val_x, label=val_y)
end

In [18]:
render_section("11", "", "Functions: GLM Auxiliary Tools")

In [252]:
# Rendered documentation cards for the GLM helpers. The gen_formulas signatures
# list the optional arguments described in their tables, and the GLM.jl link uses
# href so it is clickable.
render_doc("gen_formula (features, label)",
           "Generate formula y ~ X",
           [("features", "Array{Symbol,1}", "Feature that will be used as X"),
            ("label", "Symbol", "Output label as y")],
           [("formula", "Formula", "Formula to be used in GLM package")])

render_doc("gen_formulas (features, label, min_vars_formula=1, max_vars_formula=length(features))",
           "Generate all valid formulas between features and label",
          [("features", "Array{Symbol,1}", "Features that will be used in combination"),
           ("label", "Symbol", "Output label (i.e., Y)"),
           ("min_vars_formula", "Int", "Minimum number of variables in formula (Default=1)"),
           ("max_vars_formula", "Int", "Maximum number of variables in formula (Default=Max Features)")],
          [("formulas", "Array{Formula,1}", "Array of formulas to be used in GLM package")])

render_doc("gen_formulas (features, feature_space, label, min_vars_formula=1, max_vars_formula=length(feature_space))",
           "Generate all valid formulas between feature space X and label",
          [("features", "Array{Symbol,1}", "Features that will be used in combination"),
           ("feature_space", "OrderedDict", "Feature space with its start and end index"),
           ("label", "Symbol", "Output label (i.e., Y)"),
           ("min_vars_formula", "Int", "Minimum number of variables in formula (Default=1)"),
           ("max_vars_formula", "Int", "Maximum number of variables in formula (Default=number of feature space keys)")],
          [("formulas", "Array{Formula,1}", "Array of formulas to be used in GLM package")])

render_doc("gen_glm (train, formulas, family=Binomial(), link=LogitLink())",
           """Generate Generalized Linear Models (GLMs). Check the
              <a href="https://github.com/JuliaStats/GLM.jl">original package</a>
              for more information""",
          [("train", "DataFrame", "Train data"),
           ("formulas", "Array{Formula,1}", "Array with the formulas"),
           ("family", "Distribution", "Family distribution (e.g., Binomial, Gamma, etc)"),
           ("link", "Link", "Link from distribution. Default is LogitLink for logistic regression")],
          [("models", "Dict", "A dict with key as formula and value as GLM model")])

In [251]:
"""
    gen_formula(features, label)

Build the text `label~f1+f2+...` from `features`, then parse and evaluate it to
obtain the formula object.
"""
function gen_formula(features, label)
    # NOTE(review): this evaluates generated source text; only pass column names
    # that come from trusted data.
    right_hand_side = join(features, "+")
    formula_source  = string(label, "~", right_hand_side)
    return eval(parse(formula_source))
end

"""
    gen_formulas(features, label, min_vars_formula=1, max_vars_formula=length(features))

Convenience method: derive the feature space from `features` with
`gen_feature_space`, then delegate to the method that takes a feature space.
"""
function gen_formulas(features::Array{Symbol,1},
                      label::Symbol,
                      min_vars_formula::Int64=1,
                      max_vars_formula::Int64=length(features))
    return gen_formulas(features, gen_feature_space(features), label,
                        min_vars_formula, max_vars_formula)
end

"""
    gen_formulas(features, feature_space, label, min_vars_formula=1,
                 max_vars_formula=length(feature_space))

Return one formula per combination of feature-space keys whose size lies between
`min_vars_formula` and `max_vars_formula`. Each key is expanded to the columns it
covers before the formula is built.

Formula construction is delegated to `gen_formula`, so both entry points share
one implementation of the `label ~ f1 + f2 + ...` expression.
"""
function gen_formulas(features::Array{Symbol,1},
                      feature_space::OrderedDict{Symbol,Tuple{Int64,Int64}},
                      label::Symbol,
                      min_vars_formula::Int64=1,
                      max_vars_formula::Int64=length(feature_space))
    key_features = collect(keys(feature_space))
    key_features_combinations = get_combinations(key_features, min_vars_formula,
                                                 max_vars_formula)

    formulas = Formula[]
    for feature_combination in key_features_combinations
        expanded_features = expand_feature_space(feature_combination, feature_space, features)
        push!(formulas, gen_formula(expanded_features, label))
    end
    return formulas
end

"""
    get_combinations(elements, min, max)

Collect every subset of `elements` of size `min`, then `min + 1`, and so on up to
`max`, in the order produced by `subsets`.
"""
function get_combinations(elements, min, max)
    found = []
    for subset_size in min:max
        for subset in subsets(elements, subset_size)
            push!(found, subset)
        end
    end
    return found
end

"""
    expand_feature_space(key_features, feature_space, features)

For each key in `key_features`, look up its `(start, end)` index pair in
`feature_space` and append `features[start:end]` to the result, in key order.
"""
function expand_feature_space(key_features, feature_space, features)
    expanded = Symbol[]
    for key in key_features
        first_index, last_index = feature_space[key]
        for position in first_index:last_index
            push!(expanded, features[position])
        end
    end
    return expanded
end

"""
    gen_glm(train, formulas, family=Binomial(), link=LogitLink())

Fit one GLM per formula on `train` and return a `Dict` from formula to fitted
model. A formula whose fit throws is left out of the result; the failure is
printed instead of being discarded silently, and fitting continues.
"""
function gen_glm(train, formulas, family=Binomial(), link=LogitLink())
    models = Dict()
    for formula in formulas
        try
            models[formula] = glm(formula, train, family, link)
        catch err
            # Best effort: keep fitting the remaining formulas, but report the failure.
            println("\tSkipped formula $formula: $err")
        end
    end
    return models
end

gen_glm (generic function with 3 methods)

#### Test Feature Space

Data

In [153]:
# Fixture: six random columns whose names share the prefixes FeatureA, FeatureB
# and FeatureC.
df = DataFrame(FeatureA_1=rand(5), FeatureA_2=rand(5), FeatureA_3=rand(5),
               FeatureB=rand(5),   FeatureC_1=rand(5), FeatureC_2=rand(5))
features = names(df);

Scenario 01: Feature Space Generation

In [152]:
# Each prefix must map to the (first, last) column index of its group.
feature_space = gen_feature_space(names(df))
result = OrderedDict([(:FeatureA, (1,3)), (:FeatureB, (4,4)), (:FeatureC, (5,6))])
@test feature_space == result

Scenario 02: Expand Feature Space

In [154]:
# Expected expansions for single keys and for pairs of keys; a pair must yield the
# columns of the first key followed by those of the second.
result1 = [:FeatureA_1, :FeatureA_2, :FeatureA_3]
result2 = [:FeatureA_1, :FeatureA_2, :FeatureA_3, :FeatureC_1, :FeatureC_2]
result3 = [:FeatureA_1, :FeatureA_2, :FeatureA_3, :FeatureB]
result4 = [:FeatureB]
result5 = [:FeatureB, :FeatureC_1, :FeatureC_2]
result6 = [:FeatureC_1, :FeatureC_2]
@test expand_feature_space([:FeatureA], feature_space, features) == result1
@test expand_feature_space([:FeatureA, :FeatureC], feature_space, features) == result2
@test expand_feature_space([:FeatureA, :FeatureB], feature_space, features) == result3
@test expand_feature_space([:FeatureB], feature_space, features) == result4
@test expand_feature_space([:FeatureB, :FeatureC], feature_space, features) == result5
@test expand_feature_space([:FeatureC], feature_space, features) == result6

Scenario 03: Generate Formulas through Expanded Feature Space

In [250]:
# Expected formulas: every combination of one, two and three feature-space keys,
# each expanded to its columns. The comparison is made on the string form.
result = [y ~ FeatureA_1 + FeatureA_2 + FeatureA_3, 
          y ~ FeatureB,
          y ~ FeatureC_1 + FeatureC_2,
          y ~ FeatureA_1 + FeatureA_2 + FeatureA_3 + FeatureB,
          y ~ FeatureA_1 + FeatureA_2 + FeatureA_3 + FeatureC_1 + FeatureC_2,
          y ~ FeatureB + FeatureC_1 + FeatureC_2,
          y ~ FeatureA_1 + FeatureA_2 + FeatureA_3 + FeatureB + FeatureC_1 + FeatureC_2]
@test string(gen_formulas(features, feature_space, :y)) == string(result)

In [255]:
# Heading for section 12 (render_section is defined elsewhere in the notebook).
render_section("12", "", "Functions: Decision Trees, Random Forests, etc")

In [298]:
# Documentation cards for the DecisionTree.jl based helpers: the three train_*
# functions first, then the three `predict` methods.
render_doc("train_decision_tree (train_x, train_y; leaves_purity=.9)",
           """Train a decision tree. <br/><br/> Source package:
              <a href="https://github.com/bensadeghi/DecisionTree.jl">
              https://github.com/bensadeghi/DecisionTree.jl</a>""",
           [("train_x", "Array{Float64,2}", "Matrix with features data"),
            ("train_y", "Array{Float64,1}", "Array with output data"),
            ("leaves_purity", "Float64", "Leaves purity percentage (Default=.9). To prune tree")],
           [("model", "Decision Tree Model", "A decision tree model fitted with the training data")])

render_doc("train_random_forest (train_x, train_y; random_features=2, num_trees=10, portion_samples=.5)",
           """Train a random forest. <br/><br/> Source package:
              <a href="https://github.com/bensadeghi/DecisionTree.jl">
              https://github.com/bensadeghi/DecisionTree.jl</a>""",
           [("train_x", "Array{Float64,2}", "Matrix with features data"),
            ("train_y", "Array{Float64,1}", "Array with output data"),
            ("random_features", "Int64", "Random feature selection (Default=2)"),
            ("num_trees", "Int64", "Number of trees used in model (Default=10)"),
            # portion_samples is a fraction (default .5), not a sample count.
            ("portion_samples", "Float64", "Portion of samples used per tree (Default=.5)")],
           [("model", "Random Forest Model", "A random forest model fitted with the training data")])

render_doc("train_adaptive_boosted_trees (train_x, train_y; num_iteration=7)",
           """Train Adaptive Boosted Trees. <br/><br/> Source package:
              <a href="https://github.com/bensadeghi/DecisionTree.jl">
              https://github.com/bensadeghi/DecisionTree.jl</a>""",
           [("train_x", "Array{Float64,2}", "Matrix with features data"),
            ("train_y", "Array{Float64,1}", "Array with output data"),
            ("num_iteration", "Int64", "Number of algorithm iteration (Default=7)")],
           [("model", "Adaptive boosted trees Model", """Adaptive boosted trees model
                                                         fitted with the training data"""),
            ("coefficients", "", "Trees coefficients")])

render_doc("predict (model, x)",
           "Predict output from a Decision Tree model and a given input x",
           [("model", "DecisionTree.Node", "Decision tree model"),
            ("x", "Array{Float64,2}", "Matrix with data to be predicted")],
           [("_", "Array", "Predicted output for classification or regression")])

render_doc("predict (model, x)",
           "Predict output from a Random Forest model and a given input x",
           # This card documents the Ensemble method, i.e. the random forest.
           [("model", "DecisionTree.Ensemble", "Random forest model"),
            ("x", "Array{Float64,2}", "Matrix with data to be predicted")],
           [("_", "Array", "Predicted output for classification or regression")])

render_doc("predict (model, coeffs, x)",
           "Predict output from Adaptive boosted trees model and a given input x",
           [("model", "DecisionTree.Ensemble", "Adaptive boosted tree model"),
            ("coeffs", "Array{Float64,1}", "Coefficients from the adaptive boosted trees"),
            ("x", "Array{Float64,2}", "Matrix with data to be predicted")],
           [("_", "Array", "Predicted output for classification or regression")])

In [299]:
# Build a decision tree from the training data, then prune it with
# `leaves_purity` as the pruning threshold.
# Note the argument order: build_tree receives the outputs first, then the
# feature matrix.
function train_decision_tree(train_x, train_y; leaves_purity=.9)
    unpruned = build_tree(train_y, train_x)
    return prune_tree(unpruned, leaves_purity)
end

# Build a random forest from the training data.
# build_forest receives the outputs first, then the feature matrix, followed by
# random_features, num_trees and portion_samples in that order.
function train_random_forest(train_x, train_y; random_features=2,
                             num_trees=10, portion_samples=.5)
    return build_forest(train_y, train_x,
                        random_features, num_trees, portion_samples)
end

# Train adaptive boosted stumps for `num_iteration` iterations.
# Returns the pair (model, coefficients) produced by build_adaboost_stumps,
# which receives the outputs first and the feature matrix second.
function train_adaptive_boosted_trees(train_x, train_y; num_iteration=7)
    stumps, weights = build_adaboost_stumps(train_y, train_x, num_iteration)
    return (stumps, weights)
end

# `predict` dispatches on the model type (and on whether coefficients are
# given) to the matching DecisionTree.jl apply_* function.

# Single decision tree.
function predict(model::DecisionTree.Node, x)
    return apply_tree(model, x)
end

# Ensemble without coefficients: applied as a forest.
function predict(model::DecisionTree.Ensemble, x)
    return apply_forest(model, x)
end

# Ensemble plus coefficient vector: applied as adaptive boosted stumps.
function predict(model::DecisionTree.Ensemble, coeffs::Array{Float64,1}, x)
    return apply_adaboost_stumps(model, coeffs, x)
end

predict (generic function with 3 methods)

In [256]:
# Heading for section 13 (render_section is defined elsewhere in the notebook).
render_section("13", "", "Functions: Scikit Machine Learning Models")

In [308]:
# Heading for subsection 13.01 (render_section is defined elsewhere in the notebook).
render_section("13", "01", "Define Scikit Models")

In [311]:
# Wrap `base_model` in a BaggingClassifier with fixed settings: half of the
# samples and half of the features per estimator, random_state 0, n_jobs -1.
function create_bagging_model(base_model)
    bagging = ensemble.BaggingClassifier(
        base_model,
        max_samples=0.5,
        max_features=0.5,
        random_state=0,
        n_jobs=-1)
    return bagging
end

# Wrap `base_model` in an AdaBoostClassifier with fixed settings: 10 estimators,
# random_state 0 and the "SAMME" algorithm.
function create_adaboost_model(base_model)
    boosted = ensemble.AdaBoostClassifier(
        base_model,
        n_estimators=10,
        random_state=0,
        algorithm="SAMME")
    return boosted
end

create_adaboost_model (generic function with 1 method)

In [314]:
# Model registries: each Dict maps a short model name (Symbol) to a model
# instance. `sklm`, `svm`, `knn`, `naive_bayes`, `tree` and `ensemble` are
# defined elsewhere in the notebook.
# NOTE(review): presumably scikit-learn modules imported through PyCall - confirm.

# Models registered as regressors. This Dict is not part of
# scikit_all_classifier_models below.
# NOTE(review): scikit-learn's Perceptron is a classifier; confirm it belongs here.
scikit_regressor_models = Dict(
    :OLS                => sklm.LinearRegression(),
    :Ridge              => sklm.Ridge(alpha=.5), 
    :Lasso              => sklm.Lasso(alpha=.5), 
    :ElasticNet         => sklm.ElasticNet(alpha=.5, l1_ratio=0.5), 
    :LARS               => sklm.Lars(), 
    :LassoLARS          => sklm.LassoLars(alpha=1),
    :BayesianRidge      => sklm.BayesianRidge(),
    :Perceptron         => sklm.Perceptron(penalty="elasticnet", alpha=.5)
)

# Stand-alone classifiers. The :DecisionTree and :ExtremyTree entries are also
# used as base models for the bagging and boosting registries below.
scikit_classifier_models = Dict(
    :LogisticRegression => sklm.LogisticRegression(),
    :SGDClassifier      => sklm.SGDClassifier(alpha=.5),
    :PassiveAggressiveClassifier => sklm.PassiveAggressiveClassifier(),
    :SVMClassifier      => svm.SVC(), # Too slow!! May crash!
    :kNN                => knn.KNeighborsClassifier(n_neighbors=2, algorithm="ball_tree"),
    :NaiveBayes         => naive_bayes.GaussianNB(),
    :DecisionTree       => tree.DecisionTreeClassifier(),
    :ExtremyTree        => tree.ExtraTreeClassifier()
)

# Bagging variants, named :Bagging_<key>. Each wrapper receives the very same
# base-model instance that is stored in scikit_classifier_models.
scikit_bagging_classifier_models = Dict([
    symbol(:Bagging_, key) => create_bagging_model(scikit_classifier_models[key]) 
    for key in [:DecisionTree, :ExtremyTree]]
)

# Boosting variants, named :Boosting_<key>, built from the same two base models.
scikit_boosting_classifier_models = Dict([
    symbol(:Boosting_, key) => create_adaboost_model(scikit_classifier_models[key])
    for key in [:DecisionTree, :ExtremyTree]]
)

# Ensemble classifiers, all with 10 estimators and random_state 0.
scikit_ensemble_classifier_models = Dict(
    :RandomForest => ensemble.RandomForestClassifier(n_estimators=10, max_depth=6, random_state=0),
    :ExtraTrees   => ensemble.ExtraTreesClassifier(n_estimators=10, max_depth=6, random_state=0),
    :GradientBoostingTrees => ensemble.GradientBoostingClassifier(n_estimators=10, learning_rate=0.5, random_state=0)
)

# Every classifier registry merged into one Dict (regressors excluded).
scikit_all_classifier_models = merge(scikit_classifier_models, 
                                     scikit_bagging_classifier_models,
                                     scikit_boosting_classifier_models,
                                     scikit_ensemble_classifier_models);

In [310]:
# Heading for subsection 13.02 (render_section is defined elsewhere in the notebook).
render_section("13", "02", "Functions: Scikit Train and Predict Methods")

In [307]:
# Documentation cards for the scikit train/predict helpers.
# The links use `href`; the previous `scr` attribute is not a link attribute,
# so the anchors pointed nowhere.
render_doc("train_scikit_model (model, train_x, train_y)",
           """Fit Scikit Model. For more information go to
              <a href="http://scikit-learn.org/">this site</a>""",
           [("model", "", "Scikit Model"),
            ("train_x", "Array{Float,2}", "Feature matrix"),
            ("train_y", "Array{Float,1}", "Output array")],
           [])

render_doc("train_scikit_models (models, train_x, train_y)",
           """Fit Scikit Models. For more information go to
              <a href="http://scikit-learn.org/">this site</a>""",
           [("models", "Dict", "A dictionary with the model name and its instance"),
            ("train_x", "Array{Float,2}", "Feature matrix"),
            ("train_y", "Array{Float,1}", "Output array")],
           [])

render_doc("predict_scikit_model (model, X)",
           "Make predictions with Scikit model",
           [("model", "", "Scikit Model"),
            ("X", "Array{Float,2}", "Feature matrix")],
           [("predictions", "Array", "An array with its predictions based on X")])

render_doc("predict_scikit_models (models, X)",
           "Make predictions with Scikit models",
           [("models", "Dict", "A dictionary with the model name and its instance"),
            ("X", "Array{Float,2}", "Feature matrix")],
           [("predictions", "Dict", "A dictionary with the model name and its predictions based on X")])

In [306]:
# Fit a single model by calling its `:fit` entry with the training data.
# Returns whatever that call returns.
function train_scikit_model(model, train_x, train_y)
    fit = model[:fit]
    return fit(train_x, train_y)
end

# Fit every model in `models` (name => model) on the same training data.
#
# For each model a progress line "Training Model <name> (<i>/<total>)..." is
# printed, followed by the time the fit took. Models are fitted in place
# through their `:fit` entry; nothing is returned.
function train_scikit_models(models, train_x, train_y)
    total = length(models)
    for (i, key) in enumerate(keys(models))
        println("Training Model $key ($i/$total)...")
        # @elapsed keeps the timing local to this call. tic()/toc() share timer
        # state outside the function, so a failing fit would leave a dangling tic.
        elapsed = @elapsed models[key][:fit](train_x, train_y)
        println("elapsed time: ", elapsed, " seconds")
    end
    return nothing
end

# Predict with a single model by calling its `:predict` entry on X.
function predict_scikit_model(model, X)
    predictor = model[:predict]
    return predictor(X)
end

# Predict with every model in `models` (name => model) on the same input X.
# Returns a Dict mapping each model name to the result of its `:predict` call.
function predict_scikit_models(models, X)
    predictions = Dict()
    for (name, model) in models
        predictions[name] = model[:predict](X)
    end
    return predictions
end

predict_scikit_models (generic function with 1 method)

In [257]:
# Heading for section 14 (render_section is defined elsewhere in the notebook).
render_section("14", "", "Functions: Evaluation Metrics")

In [258]:
# Heading for subsection 14.01 (render_section is defined elsewhere in the notebook).
render_section("14", "01", "Evaluation Metrics: RMSE and Precision")

In [297]:
# Documentation cards for the evaluation metrics. The parameter is named
# `yhats` to match the function definitions, and both descriptions state the
# 4-decimal rounding the functions apply.
render_doc("eval_rmse (y, yhats)",
           "Calculate Root Mean Squared Error (RMSE), rounded to 4 decimal places",
           [("y", "Array{Float,1}", "Real output"),
            ("yhats", "Array{Float,1}", "Estimated output from a model")],
           [("_", "Float", "The calculated RMSE metric")])

render_doc("eval_precision (y, yhats)",
           "Calculate precision: the share of estimated outputs equal to the real output, rounded to 4 decimal places",
           [("y", "Array{Float,1}", "Real output"),
            ("yhats", "Array{Float,1}", "Estimated output from a model")],
           [("_", "Float", "The calculated precision metric")])

In [None]:
# Root mean squared error between the real outputs `y` and the estimates
# `yhats`, rounded to 4 decimal places.
function eval_rmse(y, yhats)
    residuals = y - yhats
    mean_squared_error = sum(residuals .^ 2) / length(y)
    return round(sqrt(mean_squared_error), 4)
end
# Share of positions where the estimate equals the real output, rounded to
# 4 decimal places.
function eval_precision(y, yhats)
    matches = sum(y .== yhats)
    return round(matches / length(y), 4)
end

In [22]:
# Heading for section 15 (render_section is defined elsewhere in the notebook).
render_section("15", "", "Functions: Vowpal Wabbit")

In [365]:
# Documentation card for generate_vw_file. `label` identifies a column of df
# (the function reads df[i, label]); it is not an array of labels.
render_doc("generate_vw_file (df, feature_space, label)",
           "Generate from a dataframe a valid Vowpal Wabbit formatted data",
           [("df", "DataFrame", "Data that will be formatted in VW"),
            ("feature_space", "Dict", "Features grouped by feature space"),
            ("label", "Symbol", "Column of df that holds the output label y")],
           [("vw_file", "AbstractString", "VW data file formatted")])

#### VW Constants

In [None]:
# Loss function names, used as the value of VW's --loss_function option.
# Only VW_LOSS_FUNCTION_LOGISTIC is referenced by the functions in this section;
# get_vw_general_parameters spells its "squared" default out as a literal.
VW_LOSS_FUNCTION_SQUARED = "squared"
VW_LOSS_FUNCTION_HINGE = "hinge"
VW_LOSS_FUNCTION_LOGISTIC = "logistic"
VW_LOSS_FUNCTION_QUANTILE = "quantile"

In [377]:
# Documentation card for train_vw_binary_classifier. The defaults listed here
# mirror the function signature.
render_doc("""train_vw_binary_classifier (
                  vw_file_path_gzip, output_model_file_path, passes=1,
                  quad_features=[], cubic_features=[], l1=0, l2=0,
                  decay_learning_rate=1, initial_t=0, power_t=.5, learning_rate=.5)""",
           """Train a logistic regression model in Vowpal Wabbit.<br/><br/>
              For more information about Vowpal Wabbit parameters go to
              <a href="https://github.com/JohnLangford/vowpal_wabbit/wiki/Command-line-arguments">
              VW Command Line Arguments page</a>""",
           [("vw_file_path_gzip", "AbstractString", "File path gzipped with the train data in VW format"),
            ("output_model_file_path", "AbstractString", "File path to export final model"),
            ("passes", "Int64", "Number of Training Passes (Default=1)"),
            ("quad_features", "Array{AbstractString,1}", "Create and use quadratic features (Default=[])"),
            ("cubic_features", "Array{AbstractString,1}", "Create and use cubic features (Default=[])"),
            ("l1", "Float64", "L1 regularization (Default=0)"),
            ("l2", "Float64", "L2 regularization (Default=0)"),
            ("decay_learning_rate", "Float64", "Set Decay factor for learning_rate between passes (Default=1)"),
            ("initial_t", "Float64", "Initial t value (Default=0)"),
            ("power_t", "Float64", "t power value (Default=.5)"),
            ("learning_rate", "Float64", "Set (initial) learning Rate (Default=.5)")],
           [])

#### VW Methods

In [374]:
# Format a DataFrame as Vowpal Wabbit text, one line per row.
#
# Arguments:
#   df            - data to format
#   feature_space - iterable of (key, (start_index, end_index)) pairs; the key
#                   is written after a `|` and the indices select a contiguous
#                   range of names(df)
#   label         - column of df whose value starts every line
#
# Each line has the form
#   <label> |<key> <name>:<value> <name>:<value>|<key> ...
# and the lines are joined with "\n". Leading and trailing whitespace of the
# whole text is stripped.
function generate_vw_file(df, feature_space, label)
    features = names(df)
    # Lines are collected and joined once at the end; appending every row to a
    # single growing string copies the whole text on each append.
    rows = AbstractString[]
    for i = 1:size(df, 1)
        row = string(df[i, label], " ")
        for (key, indices) in feature_space
            start_index, end_index = indices
            group_names = features[start_index:end_index]
            # Values of row i for this group, concatenated into one vector.
            group_values = vcat(DataFrames.columns(df[i, group_names])...)
            # `j` is used here so the row index `i` is not shadowed.
            entries = [string(group_names[j], ":", group_values[j])
                       for j = 1:length(group_names)]
            row *= "|$key $(join(entries, ' '))"
        end
        push!(rows, row)
    end
    return strip(join(rows, "\n"))
end

# Build the shared part of a Vowpal Wabbit command line as one string.
#
# Arguments (all but the first two are optional and positional):
#   vw_file_path_gzip      - training data file, passed to `-d`
#   output_model_file_path - file VW writes the final model to, passed to `-f`
#   passes                 - value for `--passes`
#   quad_features          - one `-q <element>` option is emitted per element
#   cubic_features         - one `--cubic <element>` option is emitted per element
#   l1, l2                 - values for `--l1` and `--l2`
#   decay_learning_rate, initial_t, power_t, learning_rate
#                          - values for the VW options of the same name
#   loss_function          - value for `--loss_function`
#
# Returns the command string. It always ends with a space so callers can append
# further options. Both file paths are wrapped in single quotes, so the string
# is meant to be interpreted by a shell.
function get_vw_general_parameters(
    vw_file_path_gzip, output_model_file_path, passes=1, quad_features=[],
    cubic_features=[], l1=0, l2=0, decay_learning_rate=1, 
    initial_t=0, power_t=.5, learning_rate=.5, loss_function="squared")
    cmd  = "vw -d '$vw_file_path_gzip' "
    cmd *= "-f '$output_model_file_path' "
    cmd *= "--passes $passes "
    cmd *= "--cache "
    cmd *= "--compressed "
    # Every element needs its own flag and a trailing space; joining the
    # elements with the flag as separator drops the first flag.
    for feature in quad_features
        cmd *= "-q $feature "
    end
    for feature in cubic_features
        cmd *= "--cubic $feature "
    end
    cmd *= "--bfgs "
    cmd *= "--l1 $l1 "
    cmd *= "--l2 $l2 "
    cmd *= "--decay_learning_rate $decay_learning_rate "
    cmd *= "--initial_t $initial_t "
    cmd *= "--power_t $power_t "
    cmd *= "--learning_rate $learning_rate "
    cmd *= "--loss_function $loss_function "
    return cmd
end

# Train a binary classifier with Vowpal Wabbit using the logistic loss.
#
# The command line comes from get_vw_general_parameters (all arguments are
# forwarded unchanged, with VW_LOSS_FUNCTION_LOGISTIC as the loss function)
# plus the `--binary` option. Returns the text VW writes to standard output.
#
# NOTE(review): the file paths end up inside a shell command line; do not pass
# paths that come from untrusted input.
function train_vw_binary_classifier(
    vw_file_path_gzip, output_model_file_path, passes=1, quad_features=[],
    cubic_features=[], l1=0, l2=0, decay_learning_rate=1,
    initial_t=0, power_t=.5, learning_rate=.5)
    cmd = get_vw_general_parameters(vw_file_path_gzip, output_model_file_path,
            passes, quad_features, cubic_features, l1, l2,
            decay_learning_rate, initial_t, power_t, learning_rate,
            VW_LOSS_FUNCTION_LOGISTIC)
    cmd *= "--binary"
    # A string interpolated into backticks becomes ONE argument, so `$cmd` alone
    # would be taken as the program name. Passing it to `sh -c` lets a shell
    # split it into words and honour the single quotes around the paths.
    return readall(`sh -c $cmd`)
end

train_vw_binary_classifier (generic function with 11 methods)